## Automating Model Training using AzurelML SDK

### Workpsace Setup

In [None]:
# Import azureml librarie and the Workspace class
import azureml.core
from azureml.core import Workspace

# Creating the wrokplace
ws = Workspace.create(name="Azureml-SDK-WS01",
                     subscription_id="3dddd7d3-09f0-464e-8095-599d52d06724",
                     resource_group="AzuremL-SDK-RG01",
                     create_resource_group="True",
                     location="francecentral")

# Create the config.json file to local directory
ws.write_config(path="./config")

In [76]:
# ----------------------------------------------------
# This is the Job Script/Run Configuration script for 
# bulding a pipeline and running it in an experiment
# ----------------------------------------------------

# Access the Workspace
from azureml.core import Workspace

ws = Workspace.from_config("./config")


# ----------------------------------------------------------------------
# Create custom environment
from azureml.core import Environment
from azureml.core.environment import CondaDependencies

# Create the environmemnt
myenv = Environment(name="MyEnvironment")

# Create the dependencies object
myenv_dep = CondaDependencies.create(conda_packages=["scikit-learn", "pandas", "numpy"])

myenv.python.conda_dependencies = myenv_dep

# Register the environment
myenv.register(ws)
# ----------------------------------------------------------------------

{
    "assetId": "azureml://locations/francecentral/workspaces/23fdee4b-b3e2-410a-b340-b79068d18b40/environments/MyEnvironment/versions/3",
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20230120.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "buildContext": null,
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "MyEnvironment",
 

### Provisionning the Cluster

In [23]:
# ------------------------------------------------------------------------------
# Create a compute cluster using AzureML SDK 
# ------------------------------------------------------------------------------
cluster_name = "pipeline-cluster"


# Provisioning and configuration of the compute cluster using AmlCompute
from azureml.core.compute import AmlCompute
compute_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D11_V2",
                                                      vm_priority='dedicated',
                                                      min_nodes=0,
                                                      max_nodes=2)

# Create the cluster 
#AmlCompute.create(ws, cluster_name, compute_config)
from azureml.core.compute import ComputeTarget
compute_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
compute_cluster.wait_for_completion()
# ------------------------------------------------------------------------------

Provisioning operation finished, operation "Succeeded"


### Configuration step

In [24]:
# ------------------------------------------------------------------------------
# Create Run Configuration for the steps
# ------------------------------------------------------------------------------

from azureml.core.runconfig import RunConfiguration
run_config = RunConfiguration()

run_config.target = compute_cluster
run_config.environment = myenv

###  Accessing Datastore and Datasets using SDK

In [9]:
# List datasets from a workplace

ds_list = list(ws.datasets.keys())
ds_list

['Loan Dataset From Dataframe', 'Loan Applications using SDK']

In [15]:
# Import required classes from Azureml
from azureml.core import Workspace, Datastore, Dataset

# Access the Worspace, Datastore and Datasets
az_store = Datastore.get(ws, "azure_sdk_blob01")
az_dataset = Dataset.get_by_name(ws, "Loan Applications Using SDK")
az_default_store = ws.get_default_datastore()

# Load the Azureml Dataset into the pandas dataframe
df = az_dataset.to_pandas_dataframe()
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,False,0,Graduate,False,5849,0.0,,360.0,1.0,Urban,True
1,LP001003,Male,True,1,Graduate,False,4583,1508.0,128.0,360.0,1.0,Rural,False
2,LP001005,Male,True,0,Graduate,True,3000,0.0,66.0,360.0,1.0,Urban,True
3,LP001006,Male,True,0,Not Graduate,False,2583,2358.0,120.0,360.0,1.0,Urban,True
4,LP001008,Male,False,0,Graduate,False,6000,0.0,141.0,360.0,1.0,Urban,True


In [42]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [49]:
df.select_dtypes(include='number').columns

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [50]:
df.select_dtypes(include='object').columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

### Logistion Regresion Model 

In [55]:
# ------------------------------------------------------------------------------
# Perform Logistic regression on our datasets 
# ------------------------------------------------------------------------------

import pandas as pd 

# Load the data from the local files
df = pd.read_csv("./data/loan.csv")

# Select the relevant columns from the dataset
dataPrep = df.drop(["Loan_ID"], axis=1)

# Check the missing values
dataNull = dataPrep.isnull().sum()

# Replace the missing values of string variable with mode
mode = dataPrep.mode().iloc[0]
cat_cols = dataPrep.select_dtypes(include='object').columns
dataPrep[cat_cols] = dataPrep[cat_cols].fillna(mode)

# Replace numerical columns with mean
num_cols = dataPrep.select_dtypes(include=['int', 'float']).columns
mean = dataPrep[num_cols].mean()
dataPrep[num_cols] = dataPrep[num_cols].fillna(mean)

# Create Dummy variables - Not Required in designer/classic studio
dataPrep = pd.get_dummies(dataPrep, drop_first=True)
#dataPrep

# Normalise the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
columns = df.select_dtypes(include='number').columns

dataPrep[columns] = scaler.fit_transform(dataPrep[columns])
#dataPrep.head()

# Create X and y - Similar to "edit columns" in Train module
y = dataPrep["Loan_Status_Y"]
X = dataPrep.drop(columns=["Loan_Status_Y"], axis=1)

# Spit Data - X and y datasets are training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =\
    train_test_split(X, y, test_size = 0.3, random_state = 1234, stratify=y)


# Build the Logistic Regression model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()


# Fit the data to the LogisticRegression object - Train Model
lr.fit(X_train, y_train)


# Predict the outcome using Test data - Score Model 
# Scored Label
y_predict = lr.predict(X_test)

# Get the probability score - Scored Probabilities
y_prob = lr.predict_proba(X_test)[:, 1]

# Get Confusion matrix and the accuracy/score - Evaluate
from sklearn.metrics import confusion_matrix
cm    = confusion_matrix(y_test, y_predict)
score = lr.score(X_test, y_test)

In [65]:
score, cm, y_predict[:5], y_prob[:5].round(3)*100

(0.7621621621621621,
 array([[ 18,  40],
        [  4, 123]]),
 array([1, 0, 1, 1, 1], dtype=uint8),
 array([83. ,  7.5, 61.3, 90. , 63.8]))

In [68]:
cm_dict = {
            "schema_type": "confusion_matrix",
            "schema_version": "v1",
            "data": {"class_Labels": ["N", "Y"],
                     "matrix": cm.tolist()}
          }
cm_dict

{'schema_type': 'confusion_matrix',
 'schema_version': 'v1',
 'data': {'class_Labels': ['N', 'Y'], 'matrix': [[18, 40], [4, 123]]}}

In [70]:
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


y_prob_df = pd.DataFrame(y_prob, columns=["Scored Probabilities"])
y_predict_df = pd.DataFrame(y_predict, columns=["Scored Label"])

scored_dataset = pd.concat([X_test, y_test, y_predict_df, y_prob_df],
                           axis=1)
                        
# Upload the scored dataset
#scored_dataset.to_csv("./outputs/defaults_scored.csv", index=False)
scored_dataset

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y,Scored Label,Scored Probabilities
0,0.027767,0.000000,0.198860,0.74359,1.000000,1,1,1,0,0,0,1,1,0,1,1,0.829767
1,0.059307,0.000000,0.198860,0.74359,0.000000,1,1,1,0,0,0,0,0,0,0,0,0.074849
2,0.072356,0.000000,0.218524,0.74359,0.842199,1,1,1,0,0,0,0,0,0,1,1,0.612756
3,0.028114,0.012120,0.175109,0.74359,1.000000,0,1,0,0,0,0,0,1,0,1,1,0.900450
4,0.025838,0.060576,0.172214,0.74359,1.000000,1,1,1,0,0,1,0,0,1,1,1,0.638145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0.037390,0.072503,0.185239,0.74359,1.000000,1,1,0,0,0,0,0,0,1,1,1,0.784323
181,0.043810,0.000000,0.121563,0.74359,0.842199,0,0,0,0,0,0,0,0,0,1,1,0.644447
182,0.055238,0.000000,0.180897,0.74359,1.000000,1,1,0,1,0,0,0,0,1,0,1,0.865626
183,0.222820,0.000000,0.167873,0.74359,1.000000,0,0,0,0,0,1,1,0,1,1,1,0.639892


### Define Pipeline Steps 

In [95]:
# Import libraries
from azureml.core import Workspace, Datastore, Dataset, Experiment
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import StepSequence


# Import azureml librarie and the Workspace class
import azureml.core
from azureml.core import Workspace

# Creating the wrokplace
ws = Workspace.create(name="Azureml-SDK-WS03",
                     subscription_id="3dddd7d3-09f0-464e-8095-599d52d06724",
                     resource_group="AzuremL-SDK-RG01",
                     create_resource_group="True",
                     location="francecentral")

#Create the config.json file to local directory
ws.write_config(path="./config")



# ----------------------------------------------------------------------
# Create custom environment
from azureml.core import Environment
from azureml.core.environment import CondaDependencies

# Create the environmemnt
myenv = Environment(name="MyEnvironment")

# Create the dependencies object
myenv_dep = CondaDependencies.create(conda_packages=["scikit-learn", "pandas", "numpy"])

myenv.python.conda_dependencies = myenv_dep

# Register the environment
myenv.register(ws)
# ----------------------------------------------------------------------


# Access the Worspace, Datastore and Datasets
ws = Workspace.from_config("./config")
az_datastore = Datastore.get(ws, "azure_sdk_blob01")
az_dataset = Dataset.get_by_name(ws, "Loan Applications Using SDK")



# Define Pipeline steps
#input_ds = ws.datasets.get("Defaults")
#dataFolder = PipelineData('datafolder', datastore=ws.get_default_datastore)
input_ds = az_dataset.as_named_input('raw_data')
dataFolder = PipelineData('datafolder', datastore=az_datastore)




# Step 01 - Data Preparation
dataPrep_step = PythonScriptStep(name="01 - Data Preparation",
                                source_directory='.',
                                script_name="220 - Dataprep Pipeline.py",
                                #inputs=[input_ds.as_named_input('raw_data')],
                                inputs=[input_ds],
                                outputs=[dataFolder],
                                runconfig=run_config,
                                arguments=['--datafolder', dataFolder])


# Step 02 - Training the model 
train_step = PythonScriptStep(name="02 - Train the Model",
                                source_directory='.',
                                script_name="220 - Training Pipeline.py",
                                inputs=[dataFolder],
                                runconfig=run_config,
                                arguments=['--datafolder', dataFolder])


# Step 03 - Configure and build the pipeline
steps = [dataPrep_step, train_step]



from azureml.pipeline.core import Pipeline
new_pipeline = Pipeline(workspace=ws, steps=steps)



# Step 04 - Create the experiment and run the pipeline
from azureml.core import Experiment

new_experiment = Experiment(workspace=ws, name="PipelineExp01")
new_pipeline_run = new_experiment.submit(new_pipeline)

new_pipeline_run.wait_for_completion(show_output=True)


Deploying AppInsights with name azuremlsinsights48714371.
Deployed AppInsights with name azuremlsinsights48714371. Took 3.79 seconds.
Deploying StorageAccount with name azuremlsstorage26874a9d2.
Deploying KeyVault with name azuremlskeyvault6e22f3d3.
Deployed KeyVault with name azuremlskeyvault6e22f3d3. Took 18.26 seconds.
Deploying Workspace with name Azureml-SDK-WS03.
Deployed StorageAccount with name azuremlsstorage26874a9d2. Took 21.38 seconds.
Deployed Workspace with name Azureml-SDK-WS03. Took 18.7 seconds.


UserErrorException: UserErrorException:
	Message: (UserError) Could not find datastore: azure_sdk_blob01.
	InnerException None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "(UserError) Could not find datastore: azure_sdk_blob01."
    }
}

In [None]:

# Step 00 - Environement Setup

# create a PipelineData object to hold the installed package
installed_packages = PipelineData(name='installed_packages', datastore=az_datastore)


# install Pandas module
import subprocess
subprocess.call(['pip', 'install', 'pandas', '-t', './outputs'])


# create a PythonScriptStep to install the Pandas module
install_step = PythonScriptStep(name='Install_Pandas',
                                script_name='install_pandas.py',
                                arguments=['--output', installed_packages],
                                inputs=[],
                                outputs=[installed_packages],
                                compute_target=compute_cluster)


# create a StepSequence to run the install step before the dataprep_step
step_sequence = StepSequence(steps=[install_step])
