# 1. Connect to your workspace

In [1]:
import azureml.core
from azureml.core import Workspace

from azureml.core import Workspace, Datastore, Dataset
from azureml.data.datapath import DataPath

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.38.0 to work with session15workspace


In [2]:
from azureml.core import Experiment
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import numpy as np
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [4]:
import joblib
import os

In [5]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://docs.microsoft.com/en-us/azure/machine-learning/media/concept-data/data-concept-diagram.svg")


# 2. Work with datastores
### View datastores

In [6]:
# Get the default datastore
default_ds = ws.get_default_datastore()

# Enumerate all datastores, indicating which is the default
for ds_name in ws.datastores:
    print(ds_name, "- Default =", ds_name == default_ds.name)

workspaceartifactstore - Default = False
workspacefilestore - Default = False
workspaceworkingdirectory - Default = False
workspaceblobstore - Default = True


### Upload data to a datastore

In [7]:
import pandas as pd
import os

In [8]:
## we dowload some files from the internet
diabetes1 = pd.read_csv('https://raw.githubusercontent.com/MicrosoftLearning/mslearn-dp100/main/data/diabetes.csv')
diabetes2 = pd.read_csv('https://raw.githubusercontent.com/MicrosoftLearning/mslearn-dp100/main/data/diabetes2.csv')

In [10]:
## we create a dir
os.mkdir('./diabetes')

FileExistsError: [Errno 17] File exists: './diabetes'

In [12]:
## we store the files locally


diabetes1.to_csv('./diabetes/diabetes1.csv')
diabetes2.to_csv('./diabetes/diabetes2.csv')

In [13]:
datastore = Datastore.get(ws, 'workspaceblobstore')

In [14]:
ds = Dataset.File.upload_directory(src_dir='./diabetes/',
           target=DataPath(datastore,  '/diabetes'),
           show_progress=True)

Validating arguments.
Arguments validated.
Uploading file to /diabetes
Uploading an estimated of 2 files
Target already exists. Skipping upload for diabetes/diabetes1.csv
Target already exists. Skipping upload for diabetes/diabetes2.csv
Uploaded 0 files
Creating new dataset


# 3. Work with datasets
### Create a tabular dataset

In [15]:
# Get the default datastore
default_ds = ws.get_default_datastore()

#Create a tabular dataset from the path on the datastore (this may take a short while)
tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes/*.csv'))

# Display the dataset as a Pandas dataframe
tab_data_set.to_pandas_dataframe()

Unnamed: 0,Column1,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,3,1883350,9,103,78,25,304,29.582192,1.282870,43,1
4,4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
...,...,...,...,...,...,...,...,...,...,...,...
14995,4995,1490300,10,65,60,46,177,33.512468,0.148327,41,1
14996,4996,1744410,2,73,66,27,168,30.132636,0.862252,38,1
14997,4997,1742742,0,93,89,43,57,18.690683,0.427049,24,0
14998,4998,1099353,0,132,98,18,161,19.791645,0.302257,23,0


### Create a file Dataset

In [16]:
#Create a file dataset from the path on the datastore (this may take a short while)
file_data_set = Dataset.File.from_files(path=(default_ds, 'diabetes/*.csv'))

# Get the files in the dataset
for file_path in file_data_set.to_path():
    print(file_path)

/diabetes1.csv
/diabetes2.csv


### Register datasets

In [17]:
# Register the tabular dataset
try:
    tab_data_set = tab_data_set.register(workspace=ws, 
                                        name='diabetes tabular dataset',
                                        description='diabetes tabular data',
                                        tags = {'format':'CSV'},
                                        create_new_version=True)
except Exception as ex:
    print(ex)

# Register the file dataset
try:
    file_data_set = file_data_set.register(workspace=ws,
                                            name='diabetes file dataset',
                                            description='diabetes files',
                                            tags = {'format':'CSV'},
                                            create_new_version=True)
except Exception as ex:
    print(ex)

print('Datasets registered')

Datasets registered


### Explore  and request datasets

In [18]:
print("Datasets:")
for dataset_name in list(ws.datasets.keys()):
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name, 'version', dataset.version)

Datasets:
	 diabetes tabular dataset version 1
	 diabetes file dataset version 1
	 diabetes dataset version 1
	 superdataset version 1


In [19]:
request = Dataset.get_by_name(ws, name='diabetes tabular dataset')
request.to_pandas_dataframe()

Unnamed: 0,Column1,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,3,1883350,9,103,78,25,304,29.582192,1.282870,43,1
4,4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
...,...,...,...,...,...,...,...,...,...,...,...
14995,4995,1490300,10,65,60,46,177,33.512468,0.148327,41,1
14996,4996,1744410,2,73,66,27,168,30.132636,0.862252,38,1
14997,4997,1742742,0,93,89,43,57,18.690683,0.427049,24,0
14998,4998,1099353,0,132,98,18,161,19.791645,0.302257,23,0


# 4. Train a model from a tabular dataset

In [20]:
# Create an Azure ML experiment in your workspace
experiment = Experiment(workspace=ws, name="session16Experiment")

In [21]:
# Start logging data from the experiment, obtaining a reference to the experiment run
run = experiment.start_logging()
print("Starting experiment:", experiment.name)

Starting experiment: session16Experiment


In [22]:
# load the data from a registered dataset
diabetesTabularDataSet = Dataset.get_by_name(ws, name='diabetes tabular dataset')
diabetes=diabetesTabularDataSet.to_pandas_dataframe()

In [23]:
# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Set regularization hyperparameter
reg = 0.01

In [24]:
# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

Training a logistic regression model with regularization rate of 0.01


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  run.log('Regularization Rate',  np.float(reg))


In [25]:
# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

Accuracy: 0.7893333333333333


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  run.log('Accuracy', np.float(acc))


In [26]:
# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

AUC: 0.8568436056949161


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  run.log('AUC', np.float(auc))


In [27]:
# Save the trained model in the outputs folder
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

['outputs/diabetes_model.pkl']

In [28]:
run.complete()

# 5. Register the model


In [29]:
# Register the model
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                   tags={'Training context':'Script'},
                   properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy']})


Model(workspace=Workspace.create(name='session15workspace', subscription_id='17ed2092-9d59-4bde-be0e-18e04a8ef316', resource_group='session16resourcegroup'), name=diabetes_model, id=diabetes_model:2, version=2, tags={'Training context': 'Script'}, properties={'AUC': '0.8568436056949161', 'Accuracy': '0.7893333333333333'})

## You can also view registered models in your workspace on the Models page in Azure Machine Learning studio.