# Applied Project in Big Data on Industrial Dataset

## BUSINESS EFFECT OF THE MODEL
## Part I. Demo model

### 1. Libraries

In [None]:
import os
import re
import json
import math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Pool
from sklearn.feature_extraction.text import (
    TfidfVectorizer, 
    CountVectorizer
)
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score, 
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score,
    roc_curve, 
    auc,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    PrecisionRecallDisplay
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    cross_val_score, 
    train_test_split,
    StratifiedKFold
)
pd.set_option('display.max_columns', None)
# Number of worker processes: the machine's CPU count, capped by the optional
# CPU_LIMIT environment variable (may be fractional, e.g. "3.5").
# When CPU_LIMIT is unset or empty we fall back to the CPU count instead of
# failing with KeyError, and never go below one core.
N_CORES = max(
    1,
    min(
        multiprocessing.cpu_count(),
        int(float(os.environ.get('CPU_LIMIT') or multiprocessing.cpu_count()))
    )
)
print('cores:', N_CORES)

### 2. Create a small dataset for demo

In [None]:
# Number of rows sampled from the label-0 file (data_white_en.csv).
SAMPLE_SIZE = 2500
# Size of the label-1 sample (data_black_en.csv) as a fraction of SAMPLE_SIZE,
# i.e. int(SAMPLE_SIZE * BAD_SHARE) rows are drawn from that file.
BAD_SHARE = .2

In [None]:
# Load the texts that get label 0 and keep a random sample of SAMPLE_SIZE rows.
df = pd.read_csv('../topic_4/data_white_en.csv')
# A fixed seed (same value as the train/test split) keeps the demo
# reproducible between runs; drop=True discards the shuffled source index
# instead of materialising it as a column and deleting it afterwards.
df = df.sample(SAMPLE_SIZE, random_state=2022).reset_index(drop=True)
df['label'] = 0
print(df.shape)
display(df.head())

In [None]:
# Load the texts that get label 1 and keep int(SAMPLE_SIZE * BAD_SHARE) rows.
df_tmp = pd.read_csv('../topic_4/data_black_en.csv')
# A fixed seed (same value as the train/test split) keeps the demo
# reproducible between runs; drop=True discards the shuffled source index
# instead of materialising it as a column and deleting it afterwards.
df_tmp = df_tmp.sample(
    int(SAMPLE_SIZE * BAD_SHARE),
    random_state=2022
).reset_index(drop=True)
df_tmp['label'] = 1
print(df_tmp.shape)
display(df_tmp.head())

In [None]:
# Stack the label-0 and label-1 samples into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same rows with a fresh 0..n-1 index in a single step.
df = pd.concat([df, df_tmp], ignore_index=True)
print(df.shape)
display(df.head())

### 3. Simple model

In [None]:
# Settings passed to TfidfVectorizer in the cells below.
MAX_DF = .975   # used as max_df
MIN_DF = 5      # used as min_df
NGRAM = (1, 3)  # used as ngram_range

In [None]:
def text_features(data, vectorizer, verbose=True):
    """Fit ``vectorizer`` on ``data`` and return the resulting features.

    Args:
        data: collection of texts; it must support ``len()`` and be
            accepted by ``vectorizer.fit_transform``.
        vectorizer: object exposing ``fit_transform``; it is fitted in place.
        verbose: when true, print the number of texts and the shape,
            maximum and minimum of the feature matrix.

    Returns:
        A ``(features, vectorizer)`` tuple, where ``vectorizer`` is the same
        (now fitted) object that was passed in.
    """
    if verbose:
        print('total texts:', len(data))

    matrix = vectorizer.fit_transform(data)

    if verbose:
        summary = (
            'features shape:', matrix.shape,
            'max:', np.max(matrix),
            'min:', np.min(matrix),
        )
        print(*summary)

    return matrix, vectorizer

In [None]:
%%time
# Fit a TF-IDF vectorizer on every text in df['proc'], using the notebook's
# NGRAM / MAX_DF / MIN_DF settings. text_features hands back the same
# vectorizer object it was given, now fitted.
features, vectorizer = text_features(
    df['proc'],
    vectorizer=TfidfVectorizer(ngram_range=NGRAM, max_df=MAX_DF, min_df=MIN_DF),
)

In [None]:
# Hold out 30% of the rows for testing, stratified by label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['proc'], df['label'],
    test_size=.3, stratify=df['label'], random_state=2022,
)

# Re-fit the vectorizer on the training texts only, then apply that fitted
# vocabulary to the held-out texts.
X_train, vectorizer = text_features(X_train, vectorizer=vectorizer)
X_test = vectorizer.transform(X_test)

# Logistic regression with default settings; fit() returns the estimator.
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Show the classifier's parameters (LogisticRegression was created without
# arguments, so these are its defaults).
clf.get_params()

In [None]:
# Per-class probabilities for the test set; column 1 (the probability of
# clf.classes_[1]) is what the following cells score against.
y_score = clf.predict_proba(X_test)

In [None]:
# Adapted from the scikit-learn example on the Receiver Operating
# Characteristic (ROC) metric for evaluating classifier output quality:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

# ROC points are computed from the column-1 probabilities of the test set;
# roc_auc is the area under that curve and is shown as the cell output.
fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# Precision-recall curve for class clf.classes_[1], scored with the second
# column of predict_proba and drawn on an 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
prec, recall, _ = precision_recall_curve(
    y_test, y_score[:, 1], pos_label=clf.classes_[1]
)
pr_display = PrecisionRecallDisplay(precision=prec, recall=recall)
pr_display.plot(ax=ax)

In [None]:
# Draw the ROC curve computed earlier, with its AUC in the legend.
plt.figure(figsize=(8, 6))
lw = 2
plt.plot(
    fpr, tpr,
    color='darkorange', lw=lw,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.title('Receiver operating characteristic example')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom above TPR = 1
plt.legend(loc='lower right')
plt.show()

In [None]:
# Decision cut-off: a test sample is predicted as class 1 when its column-1
# probability is strictly greater than THRESHOLD.
THRESHOLD = .1
# Confusion matrix of true test labels vs. the thresholded predictions.
cm = confusion_matrix(y_test, y_score[:, 1] > THRESHOLD)
cm_display = ConfusionMatrixDisplay(cm)
fig, ax = plt.subplots(figsize=(8, 8))
cm_display.plot(ax=ax)

In [None]:
# Show the raw confusion-matrix counts.
cm

In [None]:
# scikit-learn's confusion_matrix puts true labels on rows and predicted
# labels on columns, so cm[i, j] counts samples of true class i that were
# predicted as class j.
# NOTE(review): the names below mix two conventions. tp/tn only make sense if
# label 0 is the "positive" class, while fp/fn follow the usual
# label-1-is-positive reading (fn = cm[1, 0] counts label-1 samples predicted
# as 0). Later cells rely on the current values of tp and fn, so confirm the
# intended convention before renaming anything here.
tp = cm[0, 0]  # true label 0, predicted 0
tn = cm[1, 1]  # true label 1, predicted 1
fp = cm[0, 1]  # true label 0, predicted 1
fn = cm[1, 0]  # true label 1, predicted 0
print(
    ' true positives: ', tp, '\n',
    'true negatives: ', tn, '\n',
    'false positives:', fp, '\n',
    'false negatives:', fn, '\n',
)

### 4. Assumptions

Let's assume:
- payment for one click, POC = money we get from one user's click on a banner
- chance of a user's click, CHANCE = the probability that a shown banner is clicked (users click only a fraction of the banners)
- fine for placing a banner on a wrong site, FINE = we are ALWAYS penalized for a wrong recommendation (taken as a given for this demo)

In [None]:
# All amounts are in rubles (RUB); CHANCE is a probability, not money.
POC = 100     # payment received for one user click on a banner
CHANCE = .05  # chance that a user clicks a banner
FINE = 1000   # penalty paid for each wrong recommendation

In [None]:
# NOTE:
# The model is simplified: the formula ignores the number of banner
# impressions and any time factor. Costs are not considered either,
# so the result is only one part of net income.
#
# tp and fn are the confusion-matrix cells assigned earlier in the notebook
# (tp = cm[0, 0], fn = cm[1, 0]): each tp earns POC * CHANCE on average,
# and each fn costs FINE.

fin_effect = tp * POC * CHANCE - fn * FINE
print('financial effect:', fin_effect, 'RUR')

### 5. Optimal values

In [None]:
def model_fin_effect(df,
                     n_gram, max_df, min_df,
                     th,
                     poc, chance, fine):
    """Train a TF-IDF + logistic-regression model and price its decisions.

    The frame is split 70/30 (stratified by label, fixed seed), the
    vectorizer is fitted on the training texts only, and the classifier's
    class-1 probabilities on the test part are thresholded at ``th``.

    Args:
        df: frame with a text column ``'proc'`` and a 0/1 column ``'label'``.
        n_gram: ``ngram_range`` for ``TfidfVectorizer``.
        max_df: ``max_df`` for ``TfidfVectorizer``.
        min_df: ``min_df`` for ``TfidfVectorizer``.
        th: a sample is predicted as class 1 when its class-1 probability
            is strictly greater than this value.
        poc: payment received for one click.
        chance: probability of a click.
        fine: penalty for one label-1 sample that was predicted as 0.

    Returns:
        Tuple ``(fin_effect, fpr, tpr, ths_rocauc, prec, recall, ths_pr)``:
        the financial effect followed by the raw outputs of ``roc_curve``
        and ``precision_recall_curve`` on the test part.
    """

    # --- vectorizer ---

    # Use the n_gram argument (the original read the global NGRAM here,
    # silently ignoring whatever the caller passed).
    vectorizer = TfidfVectorizer(
        ngram_range=n_gram,
        max_df=max_df,
        min_df=min_df
    )

    # --- train test split ---

    X_train, X_test, y_train, y_test = train_test_split(
        df['proc'],
        df['label'],
        test_size=.3,
        stratify=df['label'],
        random_state=2022
    )
    # Fit the vocabulary on the training texts only, then reuse it for test.
    X_train, vectorizer = text_features(
        X_train,
        vectorizer=vectorizer,
        verbose=False
    )
    X_test = vectorizer.transform(X_test)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    # --- predictions and metrics ---

    y_score = clf.predict_proba(X_test)
    class1_proba = y_score[:, 1]
    fpr, tpr, ths_rocauc = roc_curve(y_test, class1_proba)
    prec, recall, ths_pr = precision_recall_curve(
        y_test,
        class1_proba,
        pos_label=clf.classes_[1]
    )
    # labels=[0, 1] pins the matrix to 2x2 in label order, so the cells
    # indexed below exist even if a threshold yields a single predicted class.
    cm = confusion_matrix(
        y_test,
        (class1_proba > th).astype(int),
        labels=[0, 1]
    )

    # --- fin effect ---

    # Rows are true labels, columns are predicted labels.
    accepted_label0 = cm[0, 0]  # label-0 samples predicted 0: earn money
    accepted_label1 = cm[1, 0]  # label-1 samples predicted 0: get fined
    fin_effect = accepted_label0 * poc * chance - accepted_label1 * fine

    return fin_effect, fpr, tpr, ths_rocauc, prec, recall, ths_pr

In [None]:
# All parameters for one evaluation run. These rebind the notebook-level
# constants defined earlier (MIN_DF, NGRAM and FINE get new values here).
MAX_DF = .975
MIN_DF = 15
NGRAM = (1, 2)
THRESHOLD = .1
POC = 100
CHANCE = .05
FINE = 500

# Train and evaluate once with these settings; only fin_effect is printed,
# the curve arrays are kept as notebook variables.
fin_effect, fpr, tpr, ths_rocauc, prec, recall, ths_pr = model_fin_effect(
    df=df,
    n_gram=NGRAM, max_df=MAX_DF, min_df=MIN_DF,
    th=THRESHOLD, 
    poc=POC, chance=CHANCE, fine=FINE
)
print('financial effect:', fin_effect, 'RUR')

In [None]:
%%time
# Sweep the decision threshold over 21 evenly spaced values in [0, 1],
# record the financial effect for each, and mark the best one on the plot.
plt.figure(figsize=(16, 8))
plt.xlabel('threshold')
plt.ylabel('financial effect, RUR')

ths = np.linspace(0, 1, 21)
fin_effs = []

for th in tqdm(ths):
    # Each call re-trains the model; only the threshold changes.
    fin_effect, fpr, tpr, ths_rocauc, prec, recall, ths_pr = model_fin_effect(
        df=df,
        n_gram=NGRAM, max_df=MAX_DF, min_df=MIN_DF,
        th=th,
        poc=POC, chance=CHANCE, fine=FINE
    )
    fin_effs.append(fin_effect)

# max() returns the first maximal pair, i.e. the lowest threshold among ties.
best_th, fin_eff_max = max(zip(ths, fin_effs), key=lambda pair: pair[1])

plt.title(f'Max financial effect {fin_eff_max:.1f} RUR with threshold {best_th:.2f}')
plt.plot(ths, fin_effs, color='b')
plt.scatter(best_th, fin_eff_max, c='red')
plt.annotate(f'{fin_eff_max:.1f} RUR', xy=(best_th, fin_eff_max))
plt.show()

### 6. Recommendations

Here are some:
- to speed up the process: use saved models to make predictions; do not retrain them from scratch each time
- be careful with the train-test split: evaluate on data the model has never seen
- connect metrics and financial effect to show business how metrics' improvement affects financial results