# Automated ML

# Initialization

In [1]:
from azureml.core import Experiment, Workspace

In [2]:
# Load the workspace from the local config file, then open (or create)
# the experiment that will hold every run from this notebook.
ws = Workspace.from_config()

experiment_name = 'capstone'
experiment = Experiment(workspace=ws, name=experiment_name)

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


# Dataset

Zindi: DSN AI Bootcamp Qualification Hackathon [data](https://zindi.africa/hackathons/dsn-ai-bootcamp-qualification-hackathon/data)

In [3]:
from azureml.core import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

from utils import get_data
from scripts.cleaning import clean_data

In [4]:
path = "data/Train.csv"

try:
    # Reuse the dataset if it is already registered in the workspace.
    loan_tabular = Dataset.get_by_name(ws, name="loan_dataset")
except Exception:
    # Lookup failed (presumably the dataset is not registered yet):
    # download the raw file and upload it to the default datastore.
    get_data(path)
    datastore = ws.get_default_datastore()
    datastore.upload('data', target_path='data')

    # Create a TabularDataset from the uploaded file & register it in the workspace.
    loan_tabular = Dataset.Tabular.from_delimited_files([(datastore, path)])
    loan_tabular = loan_tabular.register(
        ws, name="loan_dataset", create_new_version=True,
        description="Dataset for Udacity Machine Learning with Azure Capstone Project"
    )

# Convert only after a dataset handle was obtained successfully; doing this
# in a `finally` clause would raise a second error and hide the real failure.
loan_dataset = loan_tabular.to_pandas_dataframe()

In [5]:
clean_loan_dataset = clean_data(
    loan_dataset,
    threshold=0.6,
    dropped_columns=["Applicant_ID"],
)

# Stratify on the label so both partitions keep the same default_status ratio.
train, test = train_test_split(
    clean_loan_dataset,
    stratify=clean_loan_dataset["default_status"],
    test_size=0.3,
    random_state=42,
)
train.head()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field39,form_field42,form_field43,form_field44,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
2800,3398.0,1.19505,1.7028,0.5238,0.0,18672.0,5150360.0,20617.0,,5189260.0,...,0.0,0.43043,4.04,0.683232,0.0,1,15.434027,0.739973,,0
20577,3124.0,2.40405,5.1528,0.0,0.0,,,,,0.0,...,0.0,1.026663,0.0,0.555328,0.0,1,,0.0,,1
42690,3510.0,0.0238,0.0908,0.0,0.0,,45740954.0,,8866477.0,60476663.0,...,1.0,0.057893,5.05,0.369424,0.0,1,,0.0,0.255556,0
14918,,,0.1646,0.0,0.0,,,,,0.0,...,,1.32,0.0,,,0,,0.0,,1
5298,3512.0,0.06575,0.72,0.0,0.0,0.0,1025793.0,35788.0,2226636.0,1761392.0,...,0.0,0.22,5.05,0.519776,0.0,0,144.54699,1.744186,0.128355,1


# AutoML Configuration

TODO: Explain why you chose the AutoML settings and configuration used below.

In [6]:
import logging

from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails

In [7]:
# Settings forwarded to AutoMLConfig as keyword arguments.
automl_settings = dict(
    featurization="auto",
    n_cross_validations=4,
    experiment_timeout_minutes=30,
    enable_early_stopping=True,
    verbosity=logging.INFO,
)

# Classification on the `default_status` label of the training split,
# optimised for weighted AUC.
automl_config = AutoMLConfig(
    task="classification",
    primary_metric="AUC_weighted",
    training_data=train,
    label_column_name="default_status",
    **automl_settings,
)

# Experiment Submission

In [8]:
# Submit the AutoML configuration and stream progress into the cell output.
# NOTE(review): the captured log reports "Running on local machine", so despite
# its name `remote_run` tracked a local run when this output was produced.
remote_run = experiment.submit(config=automl_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_4ae69141-81be-460f-a7db-0020fff16339

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing f

# Run Details

In [9]:
# Render the run-monitoring widget for the submitted AutoML run.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

# Best Model

In [10]:
from azureml.core.model import Model
import joblib

from utils import print_model

In [11]:
# get_output() yields a (run, model) pair; per the Azure ML docs this is the
# best child run and its fitted pipeline.
automl_run, best_automl_model = remote_run.get_output()

In [12]:
# Plain-text summary of the selected run (experiment, id, type, status).
print(automl_run)

Run(Experiment: capstone,
Id: AutoML_4ae69141-81be-460f-a7db-0020fff16339_22,
Type: None,
Status: Completed)


In [13]:
# Print each step of the returned model with its parameters, using the project helper.
print_model(best_automl_model)

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingclassifier
{'estimators': ['0', '16', '2', '18', '17', '4', '7', '14'],
 'weights': [0.4666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.13333333333333333,
             0.06666666666666667,
             0.06666666666666667]}

0 - maxabsscaler
{'copy': True}

0 - lightgbmclassifier
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jo

In [14]:
import os

# Make sure the target folder exists before writing: without it the dump
# fails with FileNotFoundError on a fresh checkout.
automl_model_path = "outputs/automl_model.joblib"
os.makedirs(os.path.dirname(automl_model_path), exist_ok=True)

# Serialize the best AutoML model to disk so it can be registered afterwards.
joblib.dump(best_automl_model, automl_model_path)

In [15]:
# Register the serialized model file in the workspace.
# NOTE(review): the "auc" tag is a hard-coded value — confirm it matches the
# metric of the run being registered whenever the experiment is re-run.
registration_kwargs = dict(
    workspace=ws,
    model_path="outputs/automl_model.joblib",
    model_name="AutoML_Voting_Ensemble",
    tags={"auc": 0.8342},
    description="default_status prediction model",
)
model = Model.register(**registration_kwargs)

Registering model AutoML_Voting_Ensemble
