# Automated ML

Import all the dependencies.

In [1]:
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Dataset
from azureml.train.automl.utilities import get_primary_metrics
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep
from azureml.widgets import RunDetails
import os
import joblib

In [2]:
# Load the workspace handle from its saved configuration.
ws = Workspace.from_config()

# All runs submitted from this notebook are grouped under this experiment name.
experiment_name = 'automl_hr_analytics'

experiment = Experiment(workspace=ws, name=experiment_name)

In [3]:
# Echo the workspace object so its repr is shown as the cell output.
ws

Workspace.create(name='quick-starts-ws-138112', subscription_id='2c48c51c-bd47-40d4-abbe-fb8eabd19c8c', resource_group='aml-quickstarts-138112')

In [4]:
# Echo the experiment object so its summary is shown as the cell output.
experiment

Name,Workspace,Report Page,Docs Page
automl_hr_analytics,quick-starts-ws-138112,Link to Azure Machine Learning studio,Link to Documentation


## Configuring the Compute Cluster

In [5]:
cpu_cluster_name = "compute-clust"
vm_size = "Standard_DS3_V2"

try:
    # Look up a compute target with this name in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # The lookup failed, so provision a new cluster of up to 4 nodes.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cpu_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print("Found already existing")

# Block until the cluster reports that provisioning has finished.
compute_target.wait_for_completion(show_output=True)

Found already existing

Running


## Dataset

### Overview
The dataset is taken from Kaggle: https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists

The task is to predict whether an employee will leave their current job, based on the following columns:

1.  enrollee_id: Unique ID for candidate
2.  city: City code
3.  city_development_index: Development index of the city (scaled)
4.  gender: Gender of candidate
5.  relevent_experience: Relevant experience of candidate
6.  enrolled_university: Type of university course enrolled in, if any
7.  education_level: Education level of candidate
8.  major_discipline: Education major discipline of candidate
9.  experience: Candidate total experience in years
10. company_size: Number of employees in current employer's company
11. company_type: Type of current employer
12. last_new_job: Difference in years between previous job and current job
13. training_hours: Training hours completed
14. target: 0 – Not looking for job change, 1 – Looking for a job change


In [6]:
# Fetch the dataset registered in the workspace under the name 'data'.
dataset = Dataset.get_by_name(workspace=ws, name='data')

# Materialise it as a pandas DataFrame; as the last expression it is shown as the cell output.
dataset.to_pandas_dataframe()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [7]:
# Remove the enrollee_id and city columns before the dataset is handed to AutoML.
columns_to_drop = ["enrollee_id", "city"]
dataset = dataset.drop_columns(columns_to_drop)

## AutoML Configuration

1. The task is binary classification, hence we use 'accuracy' as the primary metric.
2. Cross-validation with 6 folds is chosen, as it gave better accuracy than 3 or 4 folds.
3. Iterations are processed concurrently so as to speed up our training time.
4. Early stopping is enabled to prevent overfitting.
5. Experiment timeout is set to be 30 minutes.
6. Featurization parameter is set to be "auto" for auto feature scaling.

In [8]:
# Core description of the learning problem: what to predict, from which data,
# how to score candidates, and how long the whole experiment may run.
automl_settings = dict(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    training_data=dataset,
    label_column_name="target",
)

# Execution options layered on top of the problem description above.
automl_config = AutoMLConfig(
    **automl_settings,
    compute_target=compute_target,
    featurization="auto",
    n_cross_validations=6,
    enable_early_stopping=True,
    max_concurrent_iterations=4,
    max_cores_per_iteration=-1,
)

In [9]:
# Submit the AutoML configuration to the experiment and stream progress to the cell output.
remote_run = experiment.submit(config=automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on compute-clust with default configuration
Running on remote compute: compute-clust
Parent Run ID: AutoML_14a5f20c-4824-4e9b-9277-358463cc13f6

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:        

## Run Details


In [10]:
# Render the run-monitoring widget for the submitted AutoML run.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [11]:
# Block until the AutoML run finishes, streaming its progress; the value returned
# by the call is displayed as the cell output.
remote_run.wait_for_completion(show_output=True)



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values are expected, let the run complete. Otherwise cancel the current run and use a script to customize the handling of missing feature values that may be more appropriate based on the data type and business requirement.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization
DETAILS:      
+---------------------------------+---------------------------------+
|Column name                   

{'runId': 'AutoML_14a5f20c-4824-4e9b-9277-358463cc13f6',
 'target': 'compute-clust',
 'status': 'Completed',
 'startTimeUtc': '2021-02-08T06:13:13.753831Z',
 'endTimeUtc': '2021-02-08T06:33:13.104987Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '6',
  'target': 'compute-clust',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"f925630c-8053-426f-99b5-0c0757ff562d\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/02-08-2021_055728_UTC/data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-138112\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"2c48c51c-bd47-40d4-abbe-fb8eabd19c8c\\

## Best Model




In [12]:
# get_output() yields a pair: the best child run and the model fitted by that run.
automl_best_run, model= remote_run.get_output()
# Echo the best run so its summary is shown as the cell output.
automl_best_run

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


Experiment,Id,Type,Status,Details Page,Docs Page
automl_hr_analytics,AutoML_14a5f20c-4824-4e9b-9277-358463cc13f6_36,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [13]:
# Pull the logged metrics and the run details for the best child run.
metrics = automl_best_run.get_metrics()
parameters = automl_best_run.get_details()

# Report which run won and the accuracy it logged.
print(f"Best run ID:  {automl_best_run.id!s}")
print(f"Accuracy:  {metrics['accuracy']!s}")

Best run ID:  AutoML_14a5f20c-4824-4e9b-9277-358463cc13f6_36
Accuracy:  0.8017016390019834


In [14]:
# Inspect the estimator held in the fitted model's private `_final_estimator` attribute.
# NOTE(review): this relies on a private attribute — presumably `model` is a
# scikit-learn style Pipeline; confirm before depending on it.
model._final_estimator

PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('0',
                                           Pipeline(memory=None,
                                                    steps=[('maxabsscaler',
                                                            MaxAbsScaler(copy=True)),
                                                           ('lightgbmclassifier',
                                                            LightGBMClassifier(boosting_type='gbdt',
                                                                               class_weight=None,
                                                                               colsample_bytree=1.0,
                                                                               importance_type='split',
                                                                               learning_rate=0.1,
                                                                               max_

In [15]:
# Save the best model
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (it may be absent in a fresh working directory).
os.makedirs("outputs", exist_ok=True)
joblib.dump(model, filename="outputs/automl_model.joblib")

['outputs/automl_model.joblib']

In [16]:
# Read the model name recorded in the best run's properties.
model_name = automl_best_run.properties['model_name']
# Echo it as the cell output.
model_name

'AutoML14a5f20c436'

## Model Deployment



In [17]:
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice.aci import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model

In [18]:
# Keep the Environment object itself. save_to_directory() only writes the
# definition to disk and returns None, so chaining it onto get_environment()
# would leave `env` as None and the InferenceConfig built from it would not
# receive the environment the model was trained in.
env = automl_best_run.get_environment()

# Also persist the definition for inspection; overwrite=True lets the cell be
# re-run when the 'environments' directory already exists.
env.save_to_directory(path='environments', overwrite=True)

# Local file name under which the scoring script is stored.
script_file = 'score.py'

# Download the scoring script stored in the best run's outputs to use as the entry script.
automl_best_run.download_file('outputs/scoring_file_v_1_0_0.py', script_file)

In [19]:
# Register the model from the AutoML run in the workspace under `model_name`.
# NOTE: this rebinds `model` — from here on it refers to the value returned by
# register_model, no longer to the fitted model returned by get_output().
model = remote_run.register_model(model_name = model_name, description = 'AutoML best model')

# Echo the registered model's id as the cell output.
model.id

'AutoML14a5f20c436:1'

# Setting Inference Config and ACI Config

In [20]:
# Pair the scoring entry script with the environment it should run in.
inference_config = InferenceConfig(
    environment=env,
    entry_script=script_file,
)

# Resources requested for the Azure Container Instance hosting the service.
aci_config = AciWebservice.deploy_configuration(
    memory_gb=1,
    cpu_cores=1,
)

In [21]:
# Deploy the registered model as a web service named 'analytics-api'.
service = Model.deploy(
    workspace=ws,
    name='analytics-api',
    models=[model],
    inference_config=inference_config,
    deployment_config=aci_config,
)

# Block until the deployment finishes, streaming its progress.
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running.............................................
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [22]:
# Report the deployed service's state and the URI clients should post to.
for label, value in (
    ("State: ", service.state),
    ("Scoring URI: ", service.scoring_uri),
):
    print(label + value)

State: Healthy
Scoring URI: http://df7346c6-e47f-45e9-ada9-a40aeee9e464.southcentralus.azurecontainer.io/score


# Testing the Endpoint

In [29]:
# Execute the separate endpoint.py script in this kernel. The script is not part of
# this notebook — presumably it sends sample records to the scoring URI; verify.
%run endpoint.py

{"result": [0.0, 0.0]}


# Logs of webservice

In [27]:
# Retrieve the web service's logs as a single string.
logs = service.get_logs()

# Print the log text one line at a time.
print(*logs.split('\n'), sep='\n')

2021-02-08T06:51:22,089234993+00:00 - iot-server/run 
2021-02-08T06:51:22,091179316+00:00 - gunicorn/run 
2021-02-08T06:51:22,092843635+00:00 - rsyslog/run 
2021-02-08T06:51:22,099531612+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

In [30]:
# Delete the deployed web service now that testing is finished.
service.delete()