<a href="https://colab.research.google.com/github/superwise-ai/quickstart/blob/itay%40-2147955467-opensource_repo/examples/retraining_notebook.ipynb#offline=true&sandboxMode=true" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
## 🚀 Setup

Start by preparing the requirements before generating insights:
1. Install the dependencies
2. Import libraries and declare parameters
3. Get a Superwise token
---

**🪄 Declare the Parameters for this Notebook**

In [None]:
# Superwise account and API credentials. Fill these in before running;
# do not commit real values with the notebook.
ACCOUNT_NAME = ""
CLIENT_ID = ""
SECRET = ""

# Model (and version) to analyze.
MODEL_NAME = ""
VERSION_NAME = ""

# Analysis window: sent as `from_ts` / `to_ts` to the DNA endpoint, and FROM_DATE
# is also compared against incident start times.
# NOTE(review): expected date format is not shown in this notebook — confirm against the API.
FROM_DATE = ""
TO_DATE = ""

**Install And Imports**

In [None]:
%%sh
# Notebook front-end and widget support (installed quietly).
pip install -q "jupyterlab>=3" "ipywidgets>=7.6"
# Dash packages; dash itself is pinned to 2.1.0, the other two are unpinned.
pip install jupyter-plotly-dash jupyter-dash dash==2.1.0
# Plotly pinned to 5.4.0; it is set as the pandas plotting backend in the imports cell.
pip install -q plotly==5.4.0
# Unpinned: installs whatever tornado version pip resolves at run time.
pip install tornado

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import seaborn as sns
import numpy as np
from dash import Dash, dash_table
import pandas as pd
import datetime
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from scipy.cluster import hierarchy as hc
from matplotlib import colors as mat_colors
from matplotlib.gridspec import GridSpec
import plotly.graph_objects as go
from matplotlib import pyplot as plt


# DataFrame/Series `.plot` calls in this notebook are rendered by plotly ...
pd.options.plotting.backend = "plotly"
# ... while direct matplotlib/seaborn figures default to a wide, auto-laid-out canvas.
plt.rcParams.update({
    "figure.figsize": [24, 10],
    "figure.autolayout": True,
})
import requests

In [None]:
# Base URL for every Superwise portal API call in this notebook, scoped to the account.
URL_PREFIX = f"https://portal.superwise.ai/{ACCOUNT_NAME}"

**Get Token**

In [None]:
# Exchange the client id / secret for a bearer token.
url = "https://auth.superwise.ai/identity/resources/auth/v1/api-token"
headers = {"Accept": "application/json", "Content-Type": "application/json"}
payload = {"clientId": CLIENT_ID, "secret": SECRET}

res = requests.post(url, json=payload, headers=headers)
res.raise_for_status()  # stop here on a failed authentication instead of failing later on a missing key

token = res.json()['accessToken']
# Authorization header reused by all subsequent API requests.
HEADERS = {"Authorization": f"Bearer {token}"}

**Extract model_id and version_id** - Hack, should be removed

In [None]:
# Look the model up by name. The name goes through `params` so that spaces,
# '&', '#', etc. are URL-encoded instead of corrupting the query string.
request_url = f'{URL_PREFIX}/admin/v1/models'
res = requests.get(request_url, params={'name': MODEL_NAME}, headers=HEADERS)
res.raise_for_status()
models = res.json()
if not models:
    # Fail with a readable message rather than a bare IndexError.
    raise LookupError(f"No model named {MODEL_NAME!r} was found")
# The first match is used.
MODEL_ID = models[0]['id']

In [None]:
# List the versions of the selected model.
request_url = f'{URL_PREFIX}/model/v1/versions'
res = requests.get(request_url, params={'task_id': MODEL_ID}, headers=HEADERS)
res.raise_for_status()
versions = res.json()
if not versions:
    # Fail with a readable message rather than a bare IndexError.
    raise LookupError(f"No versions found for model id {MODEL_ID!r}")

# Prefer the version named VERSION_NAME when one is configured.
# NOTE(review): assumes each version record carries a 'name' field — TODO confirm against the API.
named_versions = [v for v in versions if VERSION_NAME and v.get('name') == VERSION_NAME]
if VERSION_NAME and not named_versions:
    print(f"Version {VERSION_NAME!r} not found; falling back to the first version returned.")
# Falls back to the first returned version (the previous behaviour) when no name matches.
VERSION_ID = (named_versions or versions)[0]['id']

# 🏃 Retraining Notebook 
By analyzing model production behavior, this notebook helps you better understand your model's data and decisions, and how and when to retrain your models.  
The Notebook covers the following topics:  

**[1️⃣. Data Dna + Clustering 🧬](#dna)**

**[2️⃣. Exclude days based on incidents amount 🆘 ](#incident)**

**[3️⃣. Most drifted features (relative to baseline) 📈](#drift)**

**[4️⃣. Most drifted segments (relative to baseline) 🌎 ](#segment)**



<a id='dna'></a>
## 1️⃣. 🧬 Data DNA Insights
Data DNA gives you an understanding of how the data changes between dates and suggests potential dataset clusters based on similar distributions.

In [None]:
def plotMatrix(matrix, ax=None, title='Distance Matrix', cbar=True, show_ticks=True, figsize=(20,12)):
    """Draw ``matrix`` as a square, blue-scaled heatmap.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Values to plot. ``matrix.max().max()`` is used for the colour scale,
        so a DataFrame-like object is expected.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. A new figure of size ``figsize`` is created when omitted.
    title : str, optional
        Axes title; nothing is set when falsy.
    cbar : bool
        Whether to draw the colour bar (forwarded to ``sns.heatmap``).
    show_ticks : bool
        Truthy values let seaborn choose tick labels automatically; falsy values
        are forwarded unchanged.
    figsize : tuple
        Figure size used only when ``ax`` is not given.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    # Upper end of the colour scale is never below 0.1, even for near-zero matrices.
    scale_max_val = max(0.1, matrix.max().max())

    tick_labels = 'auto' if show_ticks else show_ticks

    # `cbar` was previously accepted but never forwarded, so cbar=False had no effect.
    sns.heatmap(matrix, square=True, annot=False, xticklabels=tick_labels, yticklabels=tick_labels,
                vmax=scale_max_val, vmin=0, cmap="Blues", cbar=cbar, ax=ax)

    if title:
        ax.set_title(title)

In [None]:
# Request the data-DNA payload for the chosen model version and date window.
DNA_URL = f"{URL_PREFIX}/dna/v1/dna?task_id={MODEL_ID}"
params = dict(version_id=VERSION_ID, from_ts=FROM_DATE, to_ts=TO_DATE)

res = requests.get(DNA_URL, headers=HEADERS, params=params)
res.raise_for_status()

# Raw payload (a dict with 'records' and 'dates'); the next cell turns it into a DataFrame.
dna_payload = json.loads(res.content)
matrix = dna_payload['results']

In [None]:
# Label both axes of the record matrix with the calendar dates parsed from epoch seconds.
# NOTE: this rebinds `matrix` from the raw response dict to a DataFrame, so the cell
# can only be re-run after re-running the request cell above.
dna_dates = pd.to_datetime(matrix['dates'], unit='s').date
matrix = pd.DataFrame(matrix['records'], index=dna_dates, columns=dna_dates)

In [None]:
# Heatmap of the date-by-date matrix built from the DNA response.
plotMatrix(matrix)

#### 👥  Create hierarchical groups

In [None]:
# Condense the square distance matrix into the 1-D form expected by `linkage`.
corr_condensed = hc.distance.squareform(matrix)
print(corr_condensed.shape)
z = hc.linkage(corr_condensed, method='complete')
# Each row of the linkage matrix is one merge step, not a group, so report the
# row count instead of labelling the raw shape as the "amount of groups".
print(f"Amount of merge steps - {z.shape[0]}")
plt.figure(figsize=(20, 5))
plt.rcParams.update({'font.size': 10})
# Labels are passed as a plain list, which scipy indexes without relying on pandas Index semantics.
# NOTE: this rebinds the name `dendrogram` (imported from scipy at the top of the notebook)
# to the value returned by the call.
dendrogram = hc.dendrogram(z, labels=list(matrix.columns), color_threshold=0.03)
plt.show()

In [None]:
# Flat cluster label per date, cutting the hierarchy at a distance of 1.0.
# NOTE(review): the dendrogram above colours at 0.03 while clusters are cut at 1.0 — confirm both are intended.
clusters = hc.fcluster(z, t=1.0, criterion='distance')
clusters.shape

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable
def plot_data_dna(matrix, clusters):
    """Plot the DNA matrix as a heatmap with a colour strip of cluster membership beneath it.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Square matrix; its index supplies the x positions of the colour strip.
    clusters : sequence of int
        One cluster id per row of ``matrix``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the heatmap (top) and the cluster strip (bottom).
    """
    palette = plt.rcParams['axes.prop_cycle'].by_key()['color']
    # Wrap around the palette so ids at or beyond its length still get a colour
    # (previously they mapped to NaN). Ids below the palette length keep their colour.
    cluster_bar = [palette[int(cluster_id) % len(palette)] for cluster_id in clusters]

    fig, (ax1, ax2) = plt.subplots(2, 1, gridspec_kw={'height_ratios':[20, 1]}, figsize=(15,12), sharex=False)

    sns.heatmap(matrix, ax=ax1, cmap='Blues')
    # Push the heatmap's x tick labels down to leave room for the strip placed just below it.
    ax1.tick_params(axis='x', direction='out', pad=40)

    # Re-position the strip directly under the heatmap, with the same left edge and width.
    p1 = ax1.get_position().bounds
    p2 = ax2.get_position().bounds
    p2_new = list(p2)
    p2_new[0] = p1[0]
    p2_new[1] = p1[1] - p2[3] - 0.01
    p2_new[2] = p1[2]
    ax2.set_position(p2_new)

    ax2.bar(x=matrix.index, height=[1]*matrix.shape[0], width=1.0, color=cluster_bar)
    ax2.margins(0.00)
    ax2.tick_params(labelbottom=False, bottom=False)

    # NOTE(review): plt.xticks acts on the current axes, presumably ax2 whose labels are
    # hidden above — confirm whether the rotation was meant for ax1.
    _ = plt.xticks(rotation=90)

    return fig


#### 👥 Find different groups of dates with similar data

In [None]:
# Heatmap with a colour strip showing the cluster assigned to each date.
plot_data_dna(matrix, clusters)

#### 👥 Retraining Groups

In [None]:
# One row per cluster, listing the dates that fall into it.
date_to_group = pd.DataFrame({"Dates": list(matrix.columns), "Group": clusters})
groups_df = date_to_group.groupby('Group').agg(list).reset_index()
groups_df

<a id='incident'></a>
## 2️⃣. 🆘 Exclude days based on incidents amount

This gives you the ability to understand whether the data from a given day should be used in the retraining of the new model.     

Parameter to Set:    

`THRESHOLD` - this parameter will color days in red with more incidents than the THRESHOLD. (Default: 5)

In [None]:
# Days with more incidents than this are drawn in red in the incidents bar chart.
# NOTE: THRESHOLD is re-assigned in later sections, so re-run this cell before
# re-running the incidents chart.
THRESHOLD = 5

In [None]:
def _fetch_incidents(status):
    """Return the model's incidents with the given status as a DataFrame indexed by incident id."""
    res = requests.get(
        f"{URL_PREFIX}/monitor/v1/incidents",
        params={"task_id": MODEL_ID, "status": status},
        headers=HEADERS,
    )
    # Surface HTTP errors here instead of building a frame from an error payload.
    res.raise_for_status()
    incidents = pd.DataFrame(res.json())
    # An empty response produces a frame without an 'id' column; leave it un-indexed
    # rather than raising KeyError.
    return incidents.set_index('id') if 'id' in incidents.columns else incidents


opened_incidents_df = _fetch_incidents("Open")
closed_incidents_df = _fetch_incidents("Closed")
incidents_df = pd.concat([opened_incidents_df, closed_incidents_df])


In [None]:
# Fetch the full record of every incident listed above.
incidents_ids = incidents_df.index.to_list()
# Collect into a separate list: `incidents_df` is only rebound once every request has
# succeeded, so a failed request leaves it intact and the cell can simply be re-run.
incident_details = []

for incidents_id in incidents_ids:
    url = f"{URL_PREFIX}/monitor/v1/incidents/{incidents_id}"
    res = requests.get(url, headers=HEADERS)
    res.raise_for_status()
    incident_details.append(res.json())

incidents_df = pd.DataFrame(incident_details)

In [None]:
# Convert epoch seconds to timestamps. Whole columns are converted at once so each
# ends up with a single datetime dtype, instead of writing datetimes into part of a
# numeric/object column through `.loc`.
incidents_df["started_at"] = pd.to_datetime(incidents_df["started_at"], unit='s')
# Incidents without an end are treated as lasting until today (UTC, midnight).
today_utc = pd.Timestamp(datetime.datetime.now(datetime.timezone.utc).date())
incidents_df["ended_at"] = pd.to_datetime(incidents_df["ended_at"], unit='s').fillna(today_utc)

In [None]:
# Keep only the incidents that started after the beginning of the analysis window.
started_in_window = incidents_df['started_at'] > FROM_DATE
incidents_df = incidents_df[started_in_window]

In [None]:
def _incident_days(row):
    """Return one midnight timestamp for each calendar day the incident was active."""
    # Normalize both ends to midnight: stepping by whole days from a non-midnight start
    # would skip the last active day (and skip incidents opened today altogether).
    first_day = pd.Timestamp(row['started_at']).normalize()
    last_day = pd.Timestamp(row['ended_at']).normalize()
    return pd.date_range(first_day, last_day, freq='D')


# Expand every incident into one row per active day. Built as a single parenthesized
# chain: no dangling line-continuation, and no column assignment on a filtered slice.
incidents_df = (
    incidents_df
    .assign(date=incidents_df.apply(_incident_days, axis=1))
    .explode('date')
    .reset_index()
    .drop(columns=['started_at', 'ended_at'])
    .rename(columns={'date': 'date_start'})
    # Guarantee a datetime dtype after explode(); later cells use the `.dt` accessor.
    .assign(date_start=lambda frame: pd.to_datetime(frame['date_start']))
)

In [None]:
# Number of active incidents per calendar day, highest first.
daily_counts = incidents_df['date_start'].dt.date.value_counts().sort_values(ascending=False)
# Build the frame with explicit column names: 'date' holds the day and 'date_start'
# holds the count, as the plotting cell expects. Relying on reset_index() naming breaks
# on pandas >= 2.0, where value_counts() names its result 'count' and no 'index' column appears.
opened_incidents_df = pd.DataFrame({'date': daily_counts.index, 'date_start': daily_counts.to_numpy()})

#### Amount of opened incidents over time

In [None]:
# 'date_start' holds the incident count for each day in 'date'.
values = opened_incidents_df['date_start'].to_numpy()
idx = opened_incidents_df['date'].to_numpy()
# Red marks days with more incidents than THRESHOLD.
clrs = ['red' if count > THRESHOLD else 'green' for count in values]
plt.bar(idx, values, color=clrs)
# Label the chart so it can be read without the surrounding markdown.
plt.xlabel("Date")
plt.ylabel("Incidents")
plt.title(f"Incidents per day (red: more than {THRESHOLD})")
plt.show()

#### Top 10 Dates with the highest amount of incidents

In [None]:
# The ten days with the most active incidents.
top_counts = incidents_df['date_start'].dt.date.value_counts().sort_values(ascending=False).head(10)
# Explicit column names ('date' = day, 'date_start' = count) keep the output the same
# on pandas >= 2.0, where reset_index() on value_counts() no longer yields an 'index' column.
top_ten_dates_df = pd.DataFrame({'date': top_counts.index, 'date_start': top_counts.to_numpy()})

In [None]:
# Display the ten dates with the highest incident counts.
top_ten_dates_df

<a id='drift'></a>
## 3️⃣. 📈 Most drifted features

Parameter to Set:    

`THRESHOLD` - this parameter will color features that are more drifted than threshold in red (Default: 5)

THRESHOLD = *5*

In [None]:
# Drift threshold for this section.
# NOTE(review): no cell in this section reads THRESHOLD; the drift bar chart is not
# coloured by it — confirm whether the colouring described above is still planned.
THRESHOLD = 5

In [None]:
# Fetch the data entities (columns) registered for the model version.
request_url = f'{URL_PREFIX}/model/v1/versions/{VERSION_ID}/data_entities'
res = requests.get(request_url, headers=HEADERS)
res.raise_for_status()

# Each record holds a nested `data_entity` dict plus its `feature_importance`.
version_entities = pd.DataFrame(res.json(), columns=["data_entity", "feature_importance"])
flatten_version_entities = pd.json_normalize(version_entities["data_entity"], max_level=0)
flatten_version_entities["feature_importance"] = version_entities["feature_importance"]

# Template listing the columns that must exist even if the API omitted them.
empty_flatten_version_entities = pd.DataFrame(
    columns=["id", "name", "role", "type", "secondary_type", "summary", "dimension_start_ts"]
)
# DataFrame.append was removed in pandas 2.0. Reindexing onto "template columns first,
# then any extra columns" yields the same column layout the append produced.
expected_columns = list(empty_flatten_version_entities.columns)
extra_columns = [col for col in flatten_version_entities.columns if col not in expected_columns]
features = flatten_version_entities.reindex(columns=expected_columns + extra_columns)

# `role` is a nested dict; keep only its description and select the model features.
features['role'] = pd.json_normalize(features['role'])['description']
features = features[features['role'] == 'feature']
features = features[["id","name","type","feature_importance"]]

features.head()

In [None]:
# Ids of the feature entities, used to query per-feature metrics.
ENTITY_IDS = features["id"].to_list()
print(len(ENTITY_IDS))

In [None]:
# Catalogue of metric functions; later cells look metric ids up by name in this frame.
request_url = f'{URL_PREFIX}/kpi/v1/metrics-functions'
res = requests.get(request_url, headers=HEADERS)
res.raise_for_status()

metrics = pd.DataFrame(res.json())
metrics.head()

In [None]:
# Id of the 'distribution_shift' metric.
metrics_by_name = metrics.set_index('name')
METRIC_ID = metrics_by_name.loc['distribution_shift']['id']
METRIC_ID

In [None]:
# Daily distribution-shift values for every feature, over the whole population (segment_id=-1).
request_url = f'{URL_PREFIX}/kpi/v1/metrics'
# `version_id` was misspelled `vesrion_id`, so the version was never sent under the
# parameter name used by the other requests in this notebook.
# NOTE(review): confirm the KPI endpoint's parameter name against the API reference.
requests_params = dict(task_id=MODEL_ID, version_id=VERSION_ID, entity_id=ENTITY_IDS, segment_id=-1, metric_id=[METRIC_ID], time_unit='D')
res = requests.get(request_url, params=requests_params, headers=HEADERS)
res.raise_for_status()
results_df = pd.DataFrame(res.json())
# Attach readable feature names and parse the timestamps.
results_df['entity_name'] = results_df['entity_id'].map(features.set_index('id')['name'].to_dict())
results_df['date_hour'] = pd.to_datetime(results_df['date_hour'])
results_df.head()

In [None]:
# Keep only the most recent day and rank features by drift value.
# NOTE: `results_df` is overwritten without its 'date_hour' column, so this cell can
# only be re-run after re-running the request cell above.
latest_day = results_df['date_hour'].max()
results_df = (
    results_df.loc[results_df['date_hour'] == latest_day, ['entity_name', 'value']]
    .sort_values('value', ascending=False)
)

#### Drift score per feature

In [None]:
# Bar chart of drift per feature, highest first.
drift_by_feature = results_df.sort_values('value', ascending=False).set_index('entity_name')
drift_by_feature.plot.bar()

<a id='segment'></a>
## 4️⃣. 🌎 Most drifted segments

Parameter to Set:    

`THRESHOLD` - segments that are more drifted than this threshold are labeled in red (Default: 12.5)

In [None]:
# Segments whose mean drift value exceeds this get a red label in the scatter plot
# at the end of this section.
THRESHOLD = 12.5

In [None]:
# Id of the 'quantity' metric, used below to compute segment sizes.
metrics_by_name = metrics.set_index('name')
METRIC_ID = metrics_by_name.loc['quantity']['id']
METRIC_ID

In [None]:
# Daily 'quantity' values per segment (entity_id=-1).
request_url = f'{URL_PREFIX}/kpi/v1/metrics'
# `version_id` was misspelled `vesrion_id`, so the version was never sent under the
# parameter name used by the other requests in this notebook.
# NOTE(review): confirm the KPI endpoint's parameter name against the API reference.
requests_params = dict(task_id=MODEL_ID, version_id=VERSION_ID, entity_id=-1, metric_id=[METRIC_ID], time_unit='D')
res = requests.get(request_url, params=requests_params, headers=HEADERS)
res.raise_for_status()
results_df = pd.DataFrame(res.json())
# Only segment_id and value are needed; the entity_name / date_hour columns that were
# computed here before were dropped on the next line without being used.
results_df = results_df[['segment_id', 'value']].groupby('segment_id').agg('sum').reset_index()
# Segment size = the segment's share of the summed quantity.
results_df['segment_size'] = results_df['value'] / float(results_df['value'].sum())
results_df = results_df[['segment_id', 'segment_size']]
results_df.head()

In [None]:
# List the model's segments. Only `task_id` is sent: the cell previously reused the
# KPI query parameters left over from the preceding cell (metric_id, time_unit, ...)
# and therefore also sent task_id twice.
# NOTE(review): confirm the segments endpoint needs no other filter.
request_url = f'{URL_PREFIX}/admin/v1/segments'
res = requests.get(request_url, params={'task_id': MODEL_ID}, headers=HEADERS)
res.raise_for_status()
segments_df = pd.DataFrame(res.json())
segments_df = segments_df[['id', 'name']].rename(columns={'id': 'segment_id'})
segments_df.head(5)

In [None]:
# Attach each segment's size; segments without a quantity row keep NaN (left join).
segments_df = segments_df.merge(
    results_df,
    how='left',
    on=['segment_id'],
    suffixes=['_segments', '_kpis'],
)
segments_df.head()

In [None]:
# Id of the 'data_drift' metric.
metrics_by_name = metrics.set_index('name')
METRIC_ID = metrics_by_name.loc['data_drift']['id']
METRIC_ID

In [None]:
# Daily 'data_drift' values per segment (entity_id=-1).
request_url = f'{URL_PREFIX}/kpi/v1/metrics'
# `version_id` was misspelled `vesrion_id`, so the version was never sent under the
# parameter name used by the other requests in this notebook.
# NOTE(review): confirm the KPI endpoint's parameter name against the API reference.
requests_params = dict(task_id=MODEL_ID, version_id=VERSION_ID, entity_id=-1, metric_id=[METRIC_ID], time_unit='D')
res = requests.get(request_url, params=requests_params, headers=HEADERS)
res.raise_for_status()
results_df = pd.DataFrame(res.json())
results_df['entity_name'] = results_df['entity_id'].map(features.set_index('id')['name'].to_dict())
results_df['date_hour'] = pd.to_datetime(results_df['date_hour'])
results_df.head()

In [None]:
# Mean drift per segment. 'value' is selected before aggregating: on pandas >= 2.0,
# mean() over the whole frame raises on non-numeric columns such as entity_name.
results_df = results_df.groupby('segment_id')['value'].mean().reset_index()
results_df.head()

In [None]:
# Attach the mean drift to each segment; missing values become 0.
segments_df = segments_df.merge(results_df, how='left').fillna(0)
segments_drift_df = segments_df[["name", "value"]]
segments_drift_df.head()

#### Mean Segment Drift 

In [None]:
from matplotlib import pyplot as plt

# Bar chart of mean drift per segment, highest first.
mean_drift_by_segment = segments_drift_df.sort_values('value', ascending=False).set_index('name')
mean_drift_by_segment.plot.bar()

#### Relationship between segment size and segment drift

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Scatter of segment size against mean drift, each point labelled with the segment name.
x = segments_df['segment_size'].to_list()
y = segments_df['value'].to_list()
labels = segments_df['name'].to_list()
# Labels of segments drifting more than THRESHOLD are red, the rest green.
colors = ["red" if drift > THRESHOLD else "green" for drift in y]

sns.scatterplot(data=segments_df, x='segment_size', y='value')
for label, size, drift, color in zip(labels, x, y, colors):
    plt.text(s=label, x=size, y=drift, fontdict=dict(color=color, size=10))
plt.xlabel("Segment Size")
plt.ylabel("Drift Value")

plt.show()