# Training with custom scripts

This notebook covers the scenario where JMC's data science team brings their own training scripts, potentially together with their own custom libraries. Here we use scikit-learn to stand in for such a custom library.

## Setting up AzureML Experiment

In [1]:
import azureml.core
from azureml.core import Workspace
from azureml.core import Experiment
import os
import shutil

# The experiment name is also used as the name of the local project folder.
exp_name = 'c4ts-customlib'
project_folder = './' + exp_name

# Load the workspace from the local config file and open the experiment in it.
ws = Workspace.from_config()
exp = Experiment(name=exp_name, workspace=ws)

# Create the project folder if needed and copy the historical data into it,
# so the data sits next to the scripts written into the same folder.
os.makedirs(project_folder, exist_ok=True)
shutil.copy('AssetData_Historical.csv', project_folder)

Found the config file in: C:\Users\zhpek\Desktop\C4TS-Challenge4\aml_config\config.json


'./c4ts-customlib\\AssetData_Historical.csv'

## Custom Dataprep scripts

We apply the same preparation steps as in the AutoML use case. Here, we wrap them in a $\texttt{prepare}$ function to keep the training script simple and clean.

In [2]:
%%writefile ./c4ts-customlib/utils.py

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA


def cleanLongLat(l):
    """Turn comma-decimal coordinate strings (e.g. ``'12,5'``) into floats.

    Each value is split on ``','``; the first piece is used as the integer
    part and the second piece as the fractional part. The two are joined with
    a ``'.'`` and the result is cast to float.

    Parameters
    ----------
    l : object exposing the pandas ``.str`` accessor (expected: a Series of
        strings that each contain a comma).

    Returns
    -------
    The joined values cast to ``float``.
    """
    parts = l.str.split(',', expand=True)
    whole = parts[0]
    # One-character fractional parts get a trailing '0' appended before joining.
    frac = [digits + '0' if len(digits) <= 1 else digits for digits in parts[1]]
    return (whole + '.' + frac).astype(float)
    
class scaler:
    """Standardise DataFrame columns with a stored per-column mean and variance.

    ``fit`` records the column means and variances of a DataFrame; ``scale``
    subtracts the stored means and divides by the square root of the stored
    variances.
    """

    def __init__(self, x = None):
        """Create a scaler, optionally fitting it straight away.

        Parameters
        ----------
        x : pandas.DataFrame or None
            When a DataFrame is given the scaler is fitted on it. When None,
            the scaler is left unfitted (``x``, ``mean`` and ``var`` are None).

        Raises
        ------
        Exception
            If ``x`` is neither None nor a DataFrame.
        """
        # Start unfitted; fit() overwrites these when data is supplied.
        self.x = None
        self.mean = None
        self.var = None

        # Identity check: `x == None` is element-wise for arrays/Series and
        # would raise an ambiguous-truth-value error before reaching the
        # intended exception below.
        if x is None:
            return
        if not isinstance(x, pd.DataFrame):
            raise Exception('Require pandas.DF input')
        self.fit(x)

    def fit(self, x):
        """Store ``x`` together with its column means and variances."""
        self.x = x
        self.mean = x.mean()
        self.var = x.var()

    def scale(self, new_x):
        """Return ``new_x`` centred by the stored mean and divided by the stored standard deviation."""
        result = (new_x - self.mean) / np.sqrt(self.var)
        return (result)
    
def prepare(X, fit = False, scaler_obj = None, pca_obj = None, n_components = 10):
    """Scale ``X`` and project it onto its leading principal components.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to transform.
    fit : bool
        When True, a new ``scaler`` and ``PCA`` are fitted on ``X``. When
        False, the supplied ``scaler_obj`` and ``pca_obj`` are applied instead.
    scaler_obj, pca_obj :
        Previously fitted scaler and PCA; both are required when ``fit`` is
        False.
    n_components : int
        Number of leading principal components to keep (default 10, the value
        that used to be hard-coded).

    Returns
    -------
    When ``fit`` is True: ``(X_transformed, scaler, pca)``.
    Otherwise: ``X_transformed`` only.

    Raises
    ------
    Exception
        If ``fit`` is False and either ``scaler_obj`` or ``pca_obj`` is None.
    """
    if fit:
        fitted_scaler = scaler(X)
        # PCA is fitted with all components; only the output is truncated.
        pca = PCA()
        components = pca.fit_transform(fitted_scaler.scale(X))
        return (components[:, :n_components], fitted_scaler, pca)

    # Identity checks: `== None` would dispatch to any custom __eq__.
    if scaler_obj is None or pca_obj is None:
        raise Exception('Non fitting requires scaler/pca obj')
    return pca_obj.transform(scaler_obj.scale(X))[:, :n_components]

Overwriting ./c4ts-customlib/utils.py


## Training Script

We use scikit-learn's `GradientBoostingClassifier` as a quick example, tuning the number of trees and the learning rate with a grid search scored by 5-fold stratified cross-validation.

In [13]:
%%writefile ./c4ts-customlib/train.py

import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from azureml.core.run import Run
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier

from utils import *
    
import time

# Fixed seed so the train/test split, the CV folds and the model fits are repeatable.
SEED = 42

os.makedirs('./outputs', exist_ok=True)

# Data Preparation
df = pd.read_csv('AssetData_Historical.csv')
df = df.drop(['Machine_ID', 'District'], axis=1)
df['Latitude'] = cleanLongLat(df['Latitude'])
df['Longitude'] = cleanLongLat(df['Longitude'])
# `axis` is passed by keyword: the positional form was removed in pandas 2.x.
X = df.drop('Failure_NextHour', axis=1)
y = df['Failure_NextHour']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.1, stratify = y, random_state = SEED)

X_prep, s, pca = prepare(X_train, fit = True)

run = Run.get_context()

param_grid = {'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
              'n_estimators': [100, 200, 300, 400, 500]
}

model = GradientBoostingClassifier(loss = 'exponential', random_state = SEED)
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = SEED)
gridsearch = GridSearchCV(model, param_grid,
                          scoring = 'f1_weighted',
                          n_jobs = -1,
                          cv = kf)
# Rows with label 1 get weight 4, rows with label 0 get weight 1.
weights = y_train * 3 + 1
result = gridsearch.fit(X_prep, y_train, sample_weight = weights)

# run.log takes a single scalar/string; sequences go through run.log_list.
run.log('bestScore', result.best_score_)
run.log('bestParam', str(result.best_params_))
run.log_list('valMean', result.cv_results_['mean_test_score'].tolist())
run.log_list('valStd', result.cv_results_['std_test_score'].tolist())
run.log_list('valParams', [str(p) for p in result.cv_results_['params']])
# Importances live on the refitted best estimator, not on the GridSearchCV
# object itself. They refer to the PCA components produced by prepare().
run.log_list('FeatureImportance',
             result.best_estimator_.feature_importances_.tolist())

# Score the refitted best estimator on the held-out split, transformed with
# the scaler/PCA fitted on the training split only.
X_test_prep = prepare(X_test, scaler_obj = s, pca_obj = pca)
run.log('testScore', result.score(X_test_prep, y_test))

#################
#Fit Final Model#
#################

# Refit the scaler and PCA on the full data set for the final model.
X_full, s, pca = prepare(X, fit = True)
# Build the final model from the best hyperparameters found by the search;
# `result.estimator` is the untuned base estimator with default settings.
best_model = GradientBoostingClassifier(loss = 'exponential',
                                        random_state = SEED,
                                        **result.best_params_)
# NOTE(review): the search weighted positives with y*3 + 1 while the final fit
# uses y*4 + 1; values are kept as they were - confirm which one is intended.
best_model.fit(X_full, y, sample_weight = y*4 + 1)

# Context managers make sure the pickle files are flushed and closed.
with open('./outputs/scaler.pkl', 'wb') as f:
    pickle.dump(s, f)
with open('./outputs/pca_transform.pkl', 'wb') as f:
    pickle.dump(pca, f)

# joblib.dump opens the target file itself, so no separate open() is needed
# (the previous one only left an empty file in the working directory).
model_name = 'GBT_{}'.format(time.time())
joblib.dump(value = best_model, filename = './outputs/{}.pkl'.format(model_name))

Overwriting ./c4ts-customlib/train.py


In [14]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.compute import ComputeTarget#, AmlCompute

# Look up the existing compute target by name and run the script on it,
# inside a Docker-based environment.
cpu_cluster = ComputeTarget(workspace=ws, name= "pekamlcompute")
run_config = RunConfiguration(framework="python")
run_config.target = cpu_cluster.name
run_config.environment.docker.enabled = True
# train.py imports joblib via `sklearn.externals`, which scikit-learn removed
# in 0.23; constrain the version so the remote environment can still import it.
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['scikit-learn<0.23', 'pandas', 'numpy'])

from azureml.core import Run
from azureml.core import ScriptRunConfig

# Submit train.py from the project folder as a run of the experiment.
src = ScriptRunConfig(source_directory=project_folder,
                      script='train.py',
                      run_config=run_config)
run = exp.submit(config=src)
run

Experiment,Id,Type,Status,Details Page,Docs Page
c4ts-customlib,c4ts-customlib_1548125115512,azureml.scriptrun,Queued,Link to Azure Portal,Link to Documentation


In [None]:
%%time
# Wait for the submitted run to complete, showing the output of the run on
# stdout (show_output=True).
run.wait_for_completion(show_output=True)

In [9]:
# Retrieve the metrics recorded for the run (train.py records them through
# the run's logging calls) and display them as the cell output.
run.get_metrics()

{}