# Integrating Whylogs into your Flask Flow

In [2]:
# Install the notebook's third-party dependencies into the kernel's environment
# (-q keeps pip's output quiet).
# NOTE(review): `utils` is not imported in the cells shown here — confirm it is needed.
%pip install -q pandas utils joblib scikit-learn whylogs

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import json
import random
import datetime
import numpy as np
import time
import urllib.request as urllib
import requests
import pandas as pd
from joblib import dump
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

### Step 1: Grab the Data and Prep it

In [4]:
# Download Iris dataset and save it as csv
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Make sure the target directory exists. A blanket `except Exception` around
# os.mkdir would report every failure (e.g. a permission error) as "already
# existed"; exist_ok=True only tolerates the case that is actually harmless.
if os.path.isdir("dataset/"):
    print(" 'dataset' directory already existed. Moving forward")
os.makedirs("dataset/", exist_ok=True)
# Save data as csv. Both the HTTP response and the output file are closed
# when the `with` block exits, even if the download fails part-way.
with urllib.urlopen(url) as raw_data, open('dataset/Iris.csv', 'wb') as file:
    file.write(raw_data.read())


 'dataset' directory already existed. Moving forward


In [5]:
# Read the CSV written by the download step. header=None tells pandas the file
# has no header row, so the columns are labelled by integer position.
data = pd.read_csv('dataset/Iris.csv', header=None)

In [6]:
# Features are the first four columns; the class label is the last column.
features = data.iloc[:, 0:4]
target = data.iloc[:, -1]
X, y = features.values, target.values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

### Step 2: Train the Model

In [7]:
# Fit a support-vector classifier (default hyper-parameters) on the training split.
print("Train started.")
model = SVC().fit(x_train, y_train)  # fit() returns the fitted estimator itself
print("Train finished.")

# Persist the fitted model with joblib.
dump(model, 'model.joblib')
print("Model saved as model.joblib")

Train started.
Train finished.
Model saved as model.joblib


### Step 3: Build and Run a Docker Image

In [15]:
# Build a Docker image from the current directory and tag it `whylogs-flask`;
# PYTHON_VERSION is handed to the Dockerfile as a build argument.
!docker build --build-arg PYTHON_VERSION=3.9 -t whylogs-flask .

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                                         
[?25h[1A[0G[?25l[+] Building 0.2s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 37B                                        0.0s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 34B                                           0.0s
[0m => [internal] load metadata for docker.io/library/python:3.9              0.1s
[?25h[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 37B                                        0.0s
[0m[34m => [internal] load .dockerignore                 

Open a terminal and execute the following command:

```bash
docker run --rm -p 5000:5000 whylogs-flask
```


### Step 4: Test Endpoint and Visualize 
TODO: Bring in the NotebookProfileVisualizer instead of WhyLabs

- Go to http://localhost:5000/apidocs/ (or http://0.0.0.0:5000/apidocs/ if your browser and OS accept that address).
- Open /predict endpoint green tab.
- Click Try it out.
- Click Execute green button.
- Check the response body and the status code: if the code is 200, the API is working!


### Step 5: Mess with Data to Showcase a Drift


The following functions modify the distribution of the variables so that the resulting drift can be used to test WhyLabs.

In [None]:
def modify_random_column_values(data, value: float = None) -> pd.DataFrame:
    """Return a copy of ``data`` with one randomly chosen column set to a constant.

    The column is picked uniformly among positions ``1 .. len(columns) - 2``:
    the first column and the last column are never chosen (the selection
    treats them as an index column and a label column respectively).

    Args:
        data: DataFrame to copy; it is not modified.
        value: Constant written to every row of the chosen column. When
            omitted, a fresh value is drawn from ``uniform(0.0, 10.0)`` on each
            call (a default expression in the signature would be evaluated only
            once, at definition time).

    Returns:
        The modified deep copy of ``data``.

    Raises:
        RuntimeError: If the column cannot be selected or assigned, e.g. when
            ``data`` has fewer than three columns. The original error is
            chained as ``__cause__``.
    """
    if value is None:
        value = np.random.uniform(low=0.0, high=10.0)
    random_column = None
    data_mod = data.copy(deep=True)
    try:
        number_of_columns = len(data_mod.columns) - 2  # Index and label eliminated
        random_column = data_mod.columns[np.random.randint(number_of_columns) + 1]
        data_mod[random_column] = value
    except Exception as ex:
        # Raising a bare string is itself a TypeError; raise a real exception
        # and keep the underlying cause attached.
        raise RuntimeError(
            f"Error adding fix value in random column: {str(random_column)}"
        ) from ex
    return data_mod
        
        
def add_random_column_outliers(data, number_outliers: int = 10) -> pd.DataFrame:
    """Return a copy of ``data`` with outliers injected into one random column.

    The column is picked uniformly among positions ``1 .. len(columns) - 2``:
    the first column and the last column are never chosen (the selection
    treats them as an index column and a label column respectively).
    ``number_outliers`` rows are then drawn at random, with replacement, and the
    chosen column is overwritten there with a value from
    ``uniform(20.0, 50.0)`` rounded to two decimals. Because rows are drawn
    with replacement, fewer than ``number_outliers`` distinct rows may change.

    Args:
        data: DataFrame to copy; it is not modified.
        number_outliers: Number of outlier writes to perform.

    Returns:
        The modified deep copy of ``data``, with the same shape and index.

    Raises:
        RuntimeError: If the column or a row cannot be selected or assigned,
            e.g. when ``data`` has fewer than three columns or no rows. The
            original error is chained as ``__cause__``.
    """
    random_column = None
    data_mod = data.copy(deep=True)
    try:
        number_of_columns = len(data_mod.columns) - 2  # Index and label eliminated
        number_of_rows = data_mod.shape[0]
        column_position = np.random.randint(number_of_columns) + 1
        random_column = data_mod.columns[column_position]
        for _ in range(number_outliers):
            random_row = np.random.randint(0, number_of_rows)
            # random_row is a position, not an index label: write by position so
            # a non-default index does not make pandas append a new row.
            data_mod.iloc[random_row, column_position] = round(
                np.random.uniform(low=20.0, high=50.0), 2
            )
    except Exception as ex:
        # Raising a bare string is itself a TypeError; raise a real exception
        # and keep the underlying cause attached.
        raise RuntimeError(
            f"Error adding outliers in random column: {random_column}"
        ) from ex
    return data_mod

Once it's working, you can try sending continuous requests to the endpoint:

In [None]:
# Payload field names, in the same order as the four feature columns.
labels = [
    "sepal_length_cm",
    "sepal_width_cm",
    "petal_length_cm",
    "petal_width_cm",
]

In [None]:
# modify a variable distribution
data_mod = add_random_column_outliers(data, 30)
print("Dataset distribution modified!")

In [None]:
# Base URL of the Flask API published by `docker run -p 5000:5000`.
# 0.0.0.0 is a "listen on every interface" address, not a portable destination:
# clients on some platforms (e.g. Windows) cannot connect to it, so use loopback.
url = "http://127.0.0.1:5000/api/v1"

In [None]:
# Seconds to wait for the service before giving up on a single request;
# without a timeout a stalled server would hang the cell indefinitely.
REQUEST_TIMEOUT_S = 10
# Pause after each successful prediction so the traffic looks like a steady stream.
PAUSE_BETWEEN_REQUESTS_S = 5

try:
    healthy = requests.get(f"{url}/health", timeout=REQUEST_TIMEOUT_S)
except requests.RequestException as ex:
    healthy = None
    print(f"Could not reach {url}/health: {ex}")

if healthy is not None and not healthy.ok:
    print(f"Health check failed with status {healthy.status_code}")

if healthy is not None and healthy.ok:
    # Extract the four feature columns once (not once per row); tolist() also
    # turns numpy scalars into plain Python floats for JSON encoding.
    feature_rows = data_mod.iloc[:, 0:4].values.tolist()
    for row in feature_rows:
        # Build a payload from one row of the modified dataset
        payload = dict(zip(labels, row))
        print(payload)
        response = requests.post(
            f"{url}/predict", json=payload, timeout=REQUEST_TIMEOUT_S
        )
        if response.ok:
            print(response.json())
            time.sleep(PAUSE_BETWEEN_REQUESTS_S)
        else:
            # Report the failure instead of skipping it silently.
            print(f"Prediction request failed with status {response.status_code}")
