We will divide this example into two stages: the Pre-deployment Stage and the Production Stage.

In the __Pre-deployment Stage__ we will:
- train a classifier
- calculate the centroids for each topic cluster

In the __Production Stage__ we will:
- load daily batches of data
- vectorize the data
- predict the topic for each document
- log:
    - embeddings distance to the centroids
    - tokens list for each document
    - predictions and targets

In the Production Stage, we will introduce documents in another language (Spanish) to see how the model behaves, and how we can monitor this with WhyLabs.

## Installing Dependencies

In [1]:
# !pip install scikit-learn==1.0.2 whylogs==1.1.31 whylabs-client==0.4.2

## ✔️ Setting the Environment Variables


In [2]:
# import getpass
# import os

# # set your org-id here - should be something like "org-xxxx"
# print("Enter your WhyLabs Org ID") 
# os.environ["WHYLABS_DEFAULT_ORG_ID"] = input()

# # set your dataset_id (or model_id) here - should be something like "model-xxxx"
# print("Enter your WhyLabs Dataset ID")
# os.environ["WHYLABS_DEFAULT_DATASET_ID"] = input()


# # set your API key here
# print("Enter your WhyLabs API key")
# os.environ["WHYLABS_API_KEY"] = getpass.getpass()
# print("Using API Key ID: ", os.environ["WHYLABS_API_KEY"][0:10])

## Pre-deployment

### Training the model

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from whylogs.experimental.preprocess.embeddings.selectors import PCACentroidsSelector
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Newsgroups kept for this example: seven of the topics available through
# `fetch_20newsgroups`.
categories = [
    "alt.atheism",
    "soc.religion.christian",
    "comp.graphics",
    "rec.sport.baseball",
    "talk.politics.guns",
    "misc.forsale",
    "sci.med",
]

# Training split restricted to the categories above, with headers, footers and
# quoted replies stripped. Shuffling uses a fixed seed.
twenty_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    remove=("headers", "footers", "quotes"),
    shuffle=True,
    random_state=42,
)

# Text -> features: raw token counts followed by TF-IDF weighting.
vectorizer = Pipeline(steps=[("vect", CountVectorizer()), ("tfidf", TfidfTransformer())])

# Fit the vectorizer on the training documents and convert the result to a
# dense array in one go.
vectors_train = vectorizer.fit_transform(twenty_train.data).toarray()

# Naive Bayes topic classifier trained on the dense vectors.
clf = MultinomialNB(alpha=0.01)
clf.fit(vectors_train, twenty_train.target)

MultinomialNB(alpha=0.01)

### Calculating Reference Embeddings

In [5]:
# Reference embeddings and their class labels, computed from the training
# vectors and targets.
references, labels = PCACentroidsSelector(n_components=20).calculate_references(
    vectors_train,
    twenty_train.target,
)
# Short label per reference: the last dotted component of the newsgroup name
# (e.g. "sci.med" -> "med").
ref_labels = [twenty_train.target_names[label].rsplit(".", 1)[-1] for label in labels]
print(ref_labels)

['atheism', 'graphics', 'forsale', 'baseball', 'med', 'christian', 'guns']


## Production Stage

### Configuring Schema for Embeddings+Tokens+Performance logging

In [6]:
import whylogs as why
from whylogs.core.resolvers import MetricSpec, ResolverSpec
from whylogs.core.schema import DeclarativeSchema
from whylogs.experimental.extras.embedding_metric import (
    DistanceFunction,
    EmbeddingConfig,
    EmbeddingMetric,
)
from whylogs.experimental.extras.nlp_metric import BagOfWordsMetric
from whylogs.core.resolvers import STANDARD_RESOLVER


# Embedding metric configuration: the reference embeddings computed earlier,
# their short labels, and cosine as the distance function.
config = EmbeddingConfig(
    references=references,
    labels=ref_labels,
    distance_fn=DistanceFunction.cosine,
)

# One resolver per custom column: embeddings go to the embedding metric,
# token lists go to the bag-of-words metric.
embeddings_resolver = ResolverSpec(
    column_name="news_centroids",
    metrics=[MetricSpec(EmbeddingMetric, config)],
)
tokens_resolver = ResolverSpec(
    column_name="document_tokens",
    metrics=[MetricSpec(BagOfWordsMetric)],
)

# Standard resolvers plus the two custom ones defined above.
schema = DeclarativeSchema([*STANDARD_RESOLVER, embeddings_resolver, tokens_resolver])

### Loading daily batches

In [7]:
# Production documents; the `batch_id` column is used later to split them
# into daily batches.
prod_df = pd.read_csv(filepath_or_buffer="prod_df_20ng.csv")
# Preview the first rows.
prod_df.head(n=5)

Unnamed: 0,doc,target,predicted,tokens,language,batch_id,doc_id
0,Hello\n\n Just one quick question\n ...,4,4,"['Hello', 'Just', 'one', 'quick', 'question', ...",en,0,0.0
1,OFFICIAL UNITED NATIONS SOUVENIR FOLDERS\n\nEa...,2,2,"['OFFICIAL', 'UNITED', 'NATIONS', 'SOUVENIR', ...",en,0,1.0
2,I am selling Joe Montana SportsTalk Football 9...,2,2,"['I', 'selling', 'Joe', 'Montana', 'SportsTalk...",en,0,2.0
3,\n\nNonsteroid Proventil is a brand of albute...,4,4,"['Nonsteroid', 'Proventil', 'brand', 'albutero...",en,0,3.0
4,Two URGENT requests\n\n1 I need the latest upd...,6,6,"['Two', 'URGENT', 'requests', '1', 'I', 'need'...",en,0,4.0


### Language Perturbation - Spanish Documents

In [8]:
# Fraction of Spanish documents to inject into each daily batch, indexed by
# batch id: the first four batches are English-only, then the share grows to 100%.
language_perturbation_ratio = [
    0,
    0,
    0,
    0,
    0.33,
    0.66,
    1,
]

def get_docs_by_language_ratio(batch_df, ratio, random_state=None):
    """Build a batch where a given share of documents is in Spanish.

    The batch size equals the number of English documents in ``batch_df``.
    ``int(n_english * ratio)`` of them are taken from the Spanish rows and the
    rest from the English rows. Spanish rows are only drawn from ``doc_id``
    values that were not already selected in English, so no ``doc_id`` appears
    twice in the result.

    Args:
        batch_df: DataFrame with at least the ``language`` (``"en"``/``"es"``)
            and ``doc_id`` columns.
        ratio: Share of Spanish documents, between 0 and 1 inclusive.
        random_state: Optional seed (or random state) forwarded to
            ``DataFrame.sample`` so the selection can be reproduced. The
            default ``None`` keeps the sampling unseeded.

    Returns:
        DataFrame with the sampled English rows followed by the sampled
        Spanish rows.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]``, or if there are fewer
            eligible Spanish rows than requested (raised by ``sample``).
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio}")

    english_docs = batch_df[batch_df["language"] == "en"]
    n_docs = len(english_docs)
    n_es_docs = int(n_docs * ratio)
    n_en_docs = n_docs - n_es_docs

    en_df = english_docs.sample(n_en_docs, random_state=random_state)

    # Spanish candidates: filter out docs whose doc_id was already picked in en_df.
    is_spanish = batch_df["language"] == "es"
    already_picked = batch_df["doc_id"].isin(en_df["doc_id"])
    es_df = batch_df[is_spanish & ~already_picked].sample(n_es_docs, random_state=random_state)

    return pd.concat([en_df, es_df])

### Log and Upload to WhyLabs

In [16]:
from datetime import datetime,timedelta, timezone
import whylogs as why



# Midnight of the current day, taken directly in UTC. Using a naive local
# `datetime.now()` and then stamping it as UTC can shift the date by one day
# when the local date and the UTC date differ.
today_utc = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)

# Each `batch_id` is treated as one day of production data: batch 0 is today,
# batch 1 is yesterday, and so on.
for day, batch_df in prod_df.groupby("batch_id"):
    dataset_timestamp = today_utc - timedelta(days=day)

    print(f"day {day}: {dataset_timestamp}")

    # Share of Spanish documents configured for this day.
    ratio = language_perturbation_ratio[day]
    print(f"{ratio*100}% of documents with language perturbation")

    mixed_df = get_docs_by_language_ratio(batch_df, ratio)
    mixed_df = mixed_df.dropna()

    # Vectorize with the vectorizer fitted during pre-deployment, then predict.
    vectors = vectorizer.transform(mixed_df["doc"]).toarray()
    predicted = clf.predict(vectors)
    print("mean accuracy: ", np.mean(predicted == mixed_df["target"]))

    # Profile the embeddings and tokens using the custom schema.
    profile = why.log(
        row={
            "news_centroids": vectors,
            "document_tokens": mixed_df["tokens"],
        },
        schema=schema,
    )
    profile.set_dataset_timestamp(dataset_timestamp)

    # Profile classification performance from targets and predictions.
    output_df = pd.DataFrame(
        {
            "output_target": mixed_df["target"],
            "output_prediction": predicted,
        }
    )
    results = why.log_classification_metrics(
        output_df,
        target_column="output_target",
        prediction_column="output_prediction",
    )
    results.set_dataset_timestamp(dataset_timestamp)

    # Uploads are left disabled. Uncomment once the WhyLabs environment
    # variables are set.
    # NOTE(review): the original only had the `results` writer; `profile` was
    # built but never written anywhere. Presumably it should be uploaded too.
    # profile.writer("whylabs").write()
    # results.writer("whylabs").write()
    


day 0: 2023-03-30 00:00:00+00:00
0% of documents with language perturbation
mean accuracy:  0.8509485094850948
day 1: 2023-03-29 00:00:00+00:00
0% of documents with language perturbation
mean accuracy:  0.8310991957104558
day 2: 2023-03-28 00:00:00+00:00
0% of documents with language perturbation
mean accuracy:  0.8306451612903226
day 3: 2023-03-27 00:00:00+00:00
0% of documents with language perturbation
mean accuracy:  0.8118279569892473
day 4: 2023-03-26 00:00:00+00:00
33.0% of documents with language perturbation
mean accuracy:  0.6380697050938338
day 5: 2023-03-25 00:00:00+00:00
66.0% of documents with language perturbation
mean accuracy:  0.4702702702702703
day 6: 2023-03-24 00:00:00+00:00
100% of documents with language perturbation
mean accuracy:  0.2923497267759563
