# Automated ML

In [1]:
import logging
import os
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.data.dataset_factory import TabularDatasetFactory


## Dataset

### Overview
The dataset being used is the titanic dataset which is used in predicting whether a person survived the drowning of the ship based on different characteristics like age, gender etc.

## Setting up Workspace

In [2]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = "\n")

quick-starts-ws-136135
aml-quickstarts-136135
southcentralus
d4ad7261-832d-46b2-b093-22156001df5b


## Initialising Experiment

In [3]:
# choose a name for experiment
experiment_name = 'titanic_automl'
project_folder = './titanic_automl'
os.makedirs(project_folder, exist_ok = True)
experiment=Experiment(ws, experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
titanic_automl,quick-starts-ws-136135,Link to Azure Machine Learning studio,Link to Documentation


## Creating or checking for existing compute cluster

In [4]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

aml_compute_name = 'computecluster1' 
try:
    compute_target = ComputeTarget(workspace = ws, name = aml_compute_name)
    print("Existing cluster. Use it.")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size = "STANDARD_D2_V2", max_nodes = 4)
    compute_target = ComputeTarget.create(ws,aml_compute_name, compute_config)
compute_target.wait_for_completion(show_output = True)

Existing cluster. Use it.

Running


## Preparing Data

In [5]:
import pandas as pd
data = pd.read_csv('titanic_dataset.csv', header = 0)

In [6]:
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
def Impute_missing_values(data):
    data.drop(columns=['Cabin'],inplace=True)
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Embarked'].fillna('S', inplace=True)
    data['Fare'].fillna(data['Fare'].median()) 
    return data

def Family_type(number):
    if number==0:
        return 'Alone'
    elif number>0 and number<=4:
        return 'Medium'
    else:
        return 'Large'

def Transform_data(data):
    data['Family_size']=data['Parch']+data['SibSp']
    data['Family_type']=data['Family_size'].apply(Family_type)
    data.drop(columns=['SibSp', 'Parch', 'Family_size'], inplace=True)
    data.loc[ data['Age'] <= 16, 'Age'] = 1
    data.loc[(data['Age'] > 16) & (data['Age'] <= 26), 'Age'] = 2
    data.loc[(data['Age'] > 26) & (data['Age'] <= 36), 'Age'] = 3
    data.loc[(data['Age'] > 36) & (data['Age'] <= 62), 'Age'] = 4
    data.loc[ data['Age'] > 62, 'Age'] = 5
    data.loc[data['Fare'] <= 17, 'Fare'] = 1,
    data.loc[(data['Fare'] > 17) & (data['Fare'] <= 30), 'Fare'] = 2,
    data.loc[(data['Fare'] > 30) & (data['Fare'] <= 100), 'Fare'] = 3,
    data.loc[ data['Fare'] > 100, 'Fare'] = 4
    return data

def clean_data(data):
    data = Impute_missing_values(data)
    data.head()
    x_data = Transform_data(data)
    x_data=pd.get_dummies(data=x_data, columns=['Age' ,'Fare',  'Pclass', 'Sex', 'Embarked', 'Family_type'], drop_first=True)
    x_data.drop(columns=['Ticket', 'PassengerId', 'Name','Age_5.0'],inplace=True)
    y_data = x_data.pop("Survived")
    return x_data, y_data

x, y = clean_data(data)
x.head()

#Split the dataset into train and test dataset. Combine x_train and y_train. 
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)
data_train = pd.concat([x_train,y_train], axis=1)
data_test = pd.concat([x_test,y_test], axis=1)

#Convert x_train and y_train (Which are in pandas DataFrame format) to TabularDataset format.
try:
    os.makedirs('./data', exist_ok=True)
except OSError as error:
    print('New directory cannot be created')
    
path_train = 'data/train.csv'
path_test = 'data/test.csv'
data_train.to_csv(path_train)
data_test.to_csv(path_test)

datastore = ws.get_default_datastore()
datastore.upload(src_dir='data', target_path='data')

train_data = TabularDatasetFactory.from_delimited_files(path=[(datastore, ('data/train.csv'))])
test_data = TabularDatasetFactory.from_delimited_files(path=[(datastore, ('data/test.csv'))])
print("Successfully converted the dataset to TabularDataset format.")


Uploading an estimated of 2 files
Uploading data/test.csv
Uploaded data/test.csv, 1 files out of an estimated total of 2
Uploading data/train.csv
Uploaded data/train.csv, 2 files out of an estimated total of 2
Uploaded 2 files
Successfully converted the dataset to TabularDataset format.


## AutoML Configurations

In [9]:
from azureml.train.automl import AutoMLConfig

#Set parameters for AutoMLConfig
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    training_data=train_data,
    label_column_name="Survived",
    n_cross_validations=5,
    compute_target=compute_target)

## Run Details

The models trained are all different. You cannot determine how a model will perform until it is tried. Some models are more robust to violations than others.

In [10]:
from azureml.widgets import RunDetails
automl_run = experiment.submit(automl_config, show_output = True)
RunDetails(automl_run).show()

Running on remote.
No run_configuration provided, running on computecluster1 with default configuration
Running on remote compute: computecluster1
Parent Run ID: AutoML_025b4651-c12a-48f7-88a8-da072e26f839

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value impu

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [11]:
automl_run.wait_for_completion()

{'runId': 'AutoML_025b4651-c12a-48f7-88a8-da072e26f839',
 'target': 'computecluster1',
 'status': 'Completed',
 'startTimeUtc': '2021-01-26T15:27:35.114632Z',
 'endTimeUtc': '2021-01-26T16:06:22.804517Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'computecluster1',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"01b2b79f-1caa-46b1-996c-3e29f8f0d3ad\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/train.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-136135\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"d4ad7261-832d-46b2-b093-22156001df5b\\\\\\", \\\\\\"w

## Best Model

In [12]:
best_run_AutoML, best_model_AutoML = automl_run.get_output()
best_run_metrics_AutoML = best_run_AutoML.get_metrics()
print("Best run Id: ",best_run_AutoML.id)
print("Accuracy: ", best_run_metrics_AutoML['accuracy'])
print("Other details: ")
print("Fitted model:",best_model_AutoML)

Best run Id:  AutoML_025b4651-c12a-48f7-88a8-da072e26f839_21
Accuracy:  0.8217374175120653
Other details: 
Fitted model: Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                    min_samples_leaf=0.035789473684210524,
                                                                                                    min_samples_split=0.01,
        

In [13]:
import joblib
from azureml.core.model import Model

#Save the best model
os.makedirs('results', exist_ok=True)
joblib.dump(best_model_AutoML, filename="results/automl_model.pkl")
model = automl_run.register_model(model_name=best_run_AutoML.properties['model_name'], description='Best AutoML model')
print("Model saved successfully")

Model saved successfully
