# Azure Machine Learning Workspaces

This is a general guide to creating a workspace and running an experiment in it.

* A workspace is a virtual environment for running machine learning experiments, managing compute targets, loading data, sharing notebooks, creating pipelines/models, etc.
* Microsoft Azure --> Subscription --> Resource Group --> Workspace 
* A workspace contains all the context in which you will work

In [None]:
# Step 1: Creating a Workspace
# Create a new workspace by specifying the subscription, resource group,
# workspace name, region, and workspace edition (sku).
from azureml.core import Workspace 

# Option A - manual input: replace the bracketed placeholders with real values
ws = Workspace.create(name='[Insert name]',
                      subscription_id='[sub id]',
                      resource_group='[resource group]',
                      create_resource_group=True,
                      location='centralus',
                      sku='enterprise')

# Option B - if you already configured everything, load the workspace from
# its saved configuration instead. The result has to be bound to `ws`, because
# that is the name the print below and every later cell refer to.
ws = Workspace.from_config()
print(ws.name, "loaded")

In [None]:
# Step 2: Run an Experiment
# A sample diabetes dataset to show how to run an Azure ML experiment and record experiment outputs
from azureml.core import Experiment
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Create an Azure ML experiment in workspace
experiment = Experiment(workspace = ws, name = "diabetes-experiment")

# Start logging data from experiment 
run = experiment.start_logging()
print("Starting experiment:", experiment.name)

# Load the data from a local file in workspace
data = pd.read_csv('data/diabetes.csv')

# Count the rows and log the results
row_count = len(data)
run.log("Observation numbers:", row_count) 
print('Analyzing {} rows of data'.format(row_count))

# Plot and log the count of diabetics vs non-diabetic patients
diabetic_counts = data['Diabetic'].value_counts()
fig = plt.figure(figsize=(6,6))
ax = fig.gca()
diabetic_counts.plot.bar(ax=ax)
ax.set_title('Patients with Diabetes')
ax.set_xlabel('Diagnosis')
ax.set_ylabel("Number of Patients")
plt.show()
run.log_image(name = 'diabetes label distribution', plot = fig) # logging figure

# Log distinct pregnancy counts
# NOTE(review): assumes the CSV column is spelled 'Pregnancies' - confirm against data/diabetes.csv
pregnancies = data.Pregnancies.unique()
run.log_list('pregnancy categories', pregnancies)

# Log summary statistics for numeric columns
med_columns = ['PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness', 'SerumInsulin',
               'BMI']
# describe().to_dict() yields a nested dict: {column: {statistic name: value}}
summary_stats = data[med_columns].describe().to_dict()
for col, stats in summary_stats.items():
    # One logged row per statistic, grouped under the column's name
    for stat_name, stat_value in stats.items():
        run.log_row(col, stat=stat_name, value=stat_value)

# Save sample of the data and upload it to experimental output
data.sample(100).to_csv('sample.csv', index=False, header=True)
run.upload_file(name = 'outputs/sample.csv', path_or_stream = './sample.csv')

# Complete the experiment run
run.complete()

In [None]:
# Step 3: View Experiment Results
## Option 1: inspect the run programmatically
import json

# Full details of the run
details = run.get_details()
print(details)

# Fetch the logged metrics and the names of the output files, then
# pretty-print each of them as indented JSON (metrics first, files second)
metrics = run.get_metrics()
files = run.get_file_names()
for payload in (metrics, files):
    print(json.dumps(payload, indent=2))

## Option 2: In jupyter notebooks
from azureml.widgets import RunDetails

# Gives a link to go click and view all experiment outputs
details_widget = RunDetails(run)
details_widget.show()

In [None]:
# Step 4: Running a script as an experiment
## Often the best way of running an experiment: the same script can be run
## repeatedly, which makes it possible to compare outputs/models across runs

import os, shutil

# Folder that will hold the experiment files (script + data)
folder_name = 'diabetes-experiment-files'
experiment_folder = "./{}".format(folder_name)
os.makedirs(folder_name, exist_ok=True)

# Put a copy of the data file next to the script inside the experiment folder
data_copy_path = os.path.join(folder_name, "diabetes.csv")
shutil.copy('data/diabetes.csv', data_copy_path)

In [None]:
%%writefile $folder_name/diabetes_experiment.py
# Experiment script: saved into the experiment folder by the %%writefile cell
# magic above, which has to be the very first line of the notebook cell.
from azureml.core import Run
import pandas as pd
import os

# Get the experiment run context
run = Run.get_context()

# Load the diabetes dataset (copied next to this script in the experiment folder)
data = pd.read_csv('diabetes.csv')

# Count the rows and log the result
row_count = len(data)
run.log('observations', row_count)
print('Analyzing {} rows of data'.format(row_count))

# Count and log the label counts, one metric per distinct label value
diabetic_counts = data['Diabetic'].value_counts()
print(diabetic_counts)
for k, v in diabetic_counts.items():
    run.log('Label:' + str(k), v)

# Save a sample of the data in the outputs folder (which gets uploaded automatically)
os.makedirs('outputs', exist_ok=True)
data.sample(100).to_csv("outputs/sample.csv", index=False, header=True)

# Complete the run
run.complete()

In [None]:
# Set up the script configuration and submit the experiment
import os
import sys
from azureml.core import Experiment, Run, RunConfiguration, ScriptRunConfig
from azureml.widgets import RunDetails

# Create a new RunConfig object
experiment_run_config = RunConfiguration()

# Create a script config.
# The script name has to match the file written into the experiment folder
# exactly ('diabetes_experiment.py', all lower case): file names are
# case-sensitive on Linux.
src = ScriptRunConfig(source_directory = experiment_folder,
                      script = 'diabetes_experiment.py',
                      run_config=experiment_run_config)

# Submit the experiment, show its progress widget, and wait until it finishes
experiment = Experiment(workspace = ws, name = 'diabetes-experiment')
run = experiment.submit(config=src)
RunDetails(run).show()
run.wait_for_completion()

# Get logged metrics
metrics = run.get_metrics()
for key, value in metrics.items():
    print(key, value)
print('\n')
for file_name in run.get_file_names():
    print(file_name)

# Retrieve experiment history: print the metrics of every run recorded
# under this experiment name
diabetes_experiment = ws.experiments['diabetes-experiment']
for logged_run in diabetes_experiment.get_runs():
    print('Run ID:', logged_run.id)
    metrics = logged_run.get_metrics()
    for key, value in metrics.items():
        print('-', key, value)

# Step 5: Stop the instance!!

It is important to close and halt the notebook, close all Jupyter tabs in the browser, and stop every compute instance; otherwise costs will continue to accrue.