In [4]:
from azureml.core import Workspace, Experiment, Run, ScriptRunConfig
from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails
import os
import sys

In [5]:
# Local folder that the %%writefile cells in this notebook write the scripts into.
script_dir = 'playground'
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(script_dir, exist_ok=True)


In [6]:
%%writefile $script_dir/data_prep.py
import os
import argparse
from azureml.core import Run
import pandas as pd
from sklearn.processing import MinMaxScaler

parser = argparse.ArgumentParser()
parser.add_argument('--input_data', type=str, dest='raw_dataset', help='Raw dataset')
parser.add_argument('--preped_data', type=str, dest='preped_dataset', help='Prepared dataset')
args = parser.parse_args()

saved_dir = args.preped_data
run = Run.get_context()

df = run.input_datasets['raw_dataset'].to_pandas_dataframe()

num_features = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure',
                'TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']
scaler = MinMaxScaler()
df[num_features] = scaler.fit(df[num_features])

print('saving data to preped_data')
os.makedirs(saved_dir, exist_ok=True)
df.to_csv(os.path.join(saved_dir, 'preped_data.csv'), index=False)
run.complete()

Writing playground/data_prep.py


In [7]:
%%writefile $script_dir/train.py

import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import argparse
import numpy as np
from azureml.core import Run, Model
import joblib

parser = argparse.ArgumentParser()
parser.add_argument('--input_data', type=str, dest='input_data', help='Input dataset for model')
args = parser.parse_args()

run = Run.get_context()
df = run.input_datsets['input_data'].to_pandas_dataframe()

X_train, y_train, X_test, y_test = train_test_split(df, test_size=.3, random_state=0)

model = DecisionTreeClassifier().fit(X_train, y_train)

y_pred = model.predict(X_test)

# ACC
acc = np.average(y_pred==y_test)
run.log(name='acc', value=acc)

# AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
run.log(name='auc', value=auc)

# ROC
fpr, tpr, threshold = roc_curve(y_test, y_score[:,1])
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot([0,1], [0,1], 'k--')
ax.plot(fpr, tpr)
ax.title('ROC curve')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
run.log_image(name='roc_curve', plot=fig)

print('Saving model')
os.makedirs('models', exist_ok=True)

model_file_path = os.path.join('models', 'diabetes_model.pkl')
joblib.dump(model, model_file_path)

Model.register(workspace=ws,
              model_path=model_file,
              tags={"training context": "pipeline"},
              properties={'AUC': auc, "ACC": acc},
              model_Path=model_file_path)
run.complete()


Writing playground/train.py


In [9]:
# Workspace handle obtained through Workspace.from_config(); `ws` is used by
# the dataset cells that follow.
ws = Workspace.from_config()
# Default datastore of the workspace; used as the upload target when the
# dataset is registered.
datastore = ws.get_default_datastore()


If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [14]:
# Print each key of ws.datasets on its own line.
for data in ws.datasets.keys():
    print(data)


batch-data
diabetes file dataset
diabetes dataset
loan_data


In [13]:
# Quick check: is there an entry named 'diabetes dataset' in ws.datasets?
'diabetes dataset' in ws.datasets

True

In [26]:
from glob import glob
# List the CSV files in the local 'data' folder. glob returns matches in
# arbitrary order, so sort them to keep the displayed list stable across runs.
sorted(glob('data/*.csv'))

['data\\diabetes.csv', 'data\\diabetes2.csv']

In [29]:
# register dataset
# Uploads the local 'data' folder to the datastore and registers the CSV files
# as a tabular dataset; when the dataset is already registered, the existing
# registration is loaded instead so that `tab_data` is bound on every path.
from azureml.core import Dataset
from azureml.data.datapath import DataPath

dataset_name = 'diabetes_dataset2'

if dataset_name not in ws.datasets:
    print("We will upload the files to cloud datastore, retrive it and register it")
    Dataset.File.upload_directory(src_dir='data', target=DataPath(datastore, dataset_name))
    # Datastore paths are '/'-separated regardless of the local OS, so the glob
    # is built by hand; os.path.join would insert a backslash on Windows.
    tab_data = Dataset.Tabular.from_delimited_files(path=(datastore, dataset_name + '/*.csv'))
    try:
        tab_data = tab_data.register(workspace=ws,
                                    name=dataset_name,
                                    tags={"format": "csv"},
                                    description="diabetes dataset",
                                    create_new_version=True)
    except Exception as ex:
        # Best effort: report the failure and keep the unregistered dataset
        # object so the following cells can still inspect it.
        print(ex)
else:
    print("dataset already exist")
    # Without this lookup `tab_data` is undefined on a fresh kernel once the
    # dataset has been registered, and the cells below raise NameError.
    tab_data = Dataset.get_by_name(ws, name=dataset_name)
    


We will upload the files to cloud datastore, retrive it and register it
Validating arguments.
Arguments validated.
Uploading file to diabetes_dataset2
Uploading an estimated of 2 files
Target already exists. Skipping upload for diabetes_dataset2\diabetes.csv
Target already exists. Skipping upload for diabetes_dataset2\diabetes2.csv
Uploaded 0 files
Creating new dataset


In [30]:
# Display the dataset object bound by the registration cell.
tab_data

{
  "source": [
    "('workspaceblobstore', 'diabetes_dataset2\\*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "6227d552-254c-4db6-a0df-4f4db3249975",
    "name": "diabetes_dataset2",
    "version": 1,
    "description": "diabetes dataset",
    "tags": {
      "format": "csv"
    },
    "workspace": "Workspace.create(name='myworkspace', subscription_id='efaef50b-3a01-4bf1-ad06-b63c101ab300', resource_group='resource-group-1')"
  }
}

In [None]:
# Convert the dataset to a pandas DataFrame and show its first 10 rows.
tab_data.to_pandas_dataframe().head(10)

In [None]:
%%writefile $script_dir/data_prep.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
diabetes = run.input_datasets['raw_data'].to_pandas_dataframe()

# Log raw row count
row_count = (len(diabetes))
run.log('raw_rows', row_count)

# remove nulls
diabetes = diabetes.dropna()

# Normalize the numeric columns
scaler = MinMaxScaler()
num_cols = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']
diabetes[num_cols] = scaler.fit_transform(diabetes[num_cols])

# Log processed rows
row_count = len(diabetes)
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
diabetes.to_csv(save_path, index=False, header=True)

# End the run
run.complete()