# Data Science and Machine Learning Prototype Work

In [1]:
from datetime import datetime
from urllib.request import urlopen

import boto3 as aws
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.metrics import f1_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## Download Data

In [2]:
# Location of the iris classification dataset (CSV) used throughout this notebook.
data_url = 'http://bodywork-ml-ops-project.s3.eu-west-2.amazonaws.com/data/iris_classification_data.csv'

# Parse the CSV inside a context manager so the HTTP response is closed as
# soon as pandas has finished reading it, instead of lingering until it is
# garbage-collected.
with urlopen(data_url) as data_response:
    data = pd.read_csv(data_response)
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


## Data Preparation

In [3]:
# Column holding the target, and the numeric code assigned to each species.
label_column = 'species'
species_to_class_map = dict(setosa=0, versicolor=1, virginica=2)

# The four measurement columns used as model inputs, in this order.
feature_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]

# Feature matrix: one row per flower, one column per entry in feature_columns.
X = data.loc[:, feature_columns].to_numpy()

# Target vector: species names translated to their class codes. A species
# missing from species_to_class_map raises KeyError.
y = data[label_column].apply(species_to_class_map.__getitem__).to_numpy()

## Split Data into Train and Test Subsets

In [4]:
# Hold out 10% of the rows for testing. The split is stratified on the class
# labels and uses a fixed seed so that it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

## Define Task Metrics

In [5]:
def log_metrics_summary(y_actual, y_predicted):
    """Print a timestamped summary of classification metrics to stdout.

    Parameters
    ----------
    y_actual
        True class labels.
    y_predicted
        Predicted class labels, aligned element-wise with ``y_actual``.

    Returns
    -------
    None
        Nothing is returned; the summary is only printed.

    Notes
    -----
    The figure printed as "accuracy" is ``balanced_accuracy_score`` with
    ``adjusted=True``; "f1" is ``f1_score`` with ``average='weighted'``.
    """
    # Timestamp is taken first, at one-second resolution in local time.
    timestamp = datetime.now().isoformat(timespec='seconds')
    adjusted_balanced_accuracy = balanced_accuracy_score(
        y_actual, y_predicted, adjusted=True
    )
    weighted_f1 = f1_score(y_actual, y_predicted, average='weighted')

    summary_lines = (
        f'iris model metrics @{timestamp}',
        f' |-- accuracy = {adjusted_balanced_accuracy:.3f}',
        f' |-- f1 = {weighted_f1:.3f}',
    )
    print('\n'.join(summary_lines))

## Train Model

In [6]:
# Build and train the decision tree in one expression; fit() hands back the
# fitted estimator, so the result can be bound directly.
iris_tree_classifier = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
).fit(X_train, y_train)

# Report metrics for predictions on the held-out test subset.
y_test_predicted = iris_tree_classifier.predict(X_test)
log_metrics_summary(y_test, y_test_predicted)

iris model metrics @2020-12-06T15:17:57
 |-- accuracy = 0.800
 |-- f1 = 0.867


## Persist Model Locally

In [10]:
# Serialise the trained classifier to a joblib file; the relative filename is
# resolved against the current working directory.
model_filename = 'iris_tree_classifier.joblib'
dump(value=iris_tree_classifier, filename=model_filename)

['iris_tree_classifier.joblib']

## Upload Model to AWS S3

In [12]:
# Bucket that receives the persisted model; the object key is
# 'models/<model_filename>'.
s3_bucket_name = 'bodywork-ml-ops-project-th'
try:
    s3_client = aws.client('s3')
    s3_client.upload_file(
        model_filename,
        s3_bucket_name,
        f'models/{model_filename}'
    )
except Exception as e:
    # Best-effort upload: keep the notebook running on failure, but report the
    # actual error as well, since not every failure is a credentials problem
    # (e.g. a missing bucket, a missing local file, or a network error).
    print('could not upload model to S3 - check AWS credentials')
    print(f' |-- {type(e).__name__}: {e}')

## Test that Model Loads and Generates Predictions

In [19]:
model_url = 'https://bodywork-ml-ops-project-th.s3-us-west-1.amazonaws.com/models/iris_tree_classifier.joblib'

# SECURITY: joblib.load unpickles the downloaded bytes, which can execute
# arbitrary code - only load models from a location you control and trust.
# NOTE(review): the last recorded run of this cell failed with HTTP 403, so
# the object is presumably not publicly readable - confirm the bucket/object
# permissions before relying on this URL.
# The context manager closes the HTTP response once the model has been read.
with urlopen(model_url) as model_response:
    model = load(model_response)

# Inverse lookup: class code -> species name.
class_species_to_map = {v: k for k, v in species_to_class_map.items()}

# A single observation, shaped (1, 4) by ndmin=2 as predict() expects 2-D input.
test_data_instance = np.array([5.1, 3.5, 1.4, 0.2], ndmin=2)
class_prediction = int(model.predict(test_data_instance)[0])
class_probabilities = model.predict_proba(test_data_instance)[0]
species_prediction = class_species_to_map.get(class_prediction)

# predict_proba columns are ordered as model.classes_, so key each probability
# by the class the model reports rather than by the insertion order of
# species_to_class_map. Values are converted to built-in types for clean output.
species_probabilities = {
    class_species_to_map[int(class_label)]: float(probability)
    for class_label, probability in zip(model.classes_, class_probabilities)
}
print(f'predicted iris species = {species_prediction}')
print(f'species probabilities = {species_probabilities}')

HTTPError: HTTP Error 403: Forbidden