## Data Load and Pre-processing

In [None]:
# Libraries Install and Import

import sys
!{sys.executable} -m pip install scikit-uplift catboost scikit-learn seaborn matplotlib pandas numpy
from sklift.datasets import fetch_lenta
from sklift.models import ClassTransformation
from sklift.metrics import uplift_at_k
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library warnings raised while fitting models.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed, but cap the displayed rows at 20.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline



In [None]:
# Dataset Load
# Fetches the Lenta uplift dataset and converts the string-coded treatment and
# gender columns to numeric codes so that later cells can treat them as numbers.

dataset = fetch_lenta()

print(f"Dataset type: {type(dataset)}\n")
print(f"Dataset features shape: {dataset.data.shape}")
print(f"Dataset target shape: {dataset.target.shape}")
print(f"Dataset treatment shape: {dataset.treatment.shape}")

# Treatment flag: 1 for the test group, 0 for the control group.
treat_dict = {
    'test': 1,
    'control': 0
}
dataset.treatment = dataset.treatment.map(treat_dict)

# Gender flag. Any label that is not a key below becomes NaN via .map().
# NOTE(review): the male label in the raw data may be the Cyrillic letter
# U+041C rather than the visually identical Latin 'M' - confirm with
# dataset.data.gender.unique(). Both spellings are mapped so that neither
# variant is silently turned into NaN.
gender_dict = {
    'M': 1,
    '\u041c': 1,
    'Ж': 0
}
dataset.data.gender = dataset.data.gender.map(gender_dict)

Dataset type: <class 'sklearn.utils._bunch.Bunch'>

Dataset features shape: (687029, 193)
Dataset target shape: (687029,)
Dataset treatment shape: (687029,)


In [None]:
# Missing Values Imputation
# Every feature column has its NaN entries replaced by that column's mean.
# NOTE(review): the means are taken over the whole dataset, i.e. before the
# train/test split further down - confirm this is intended.

for column_name in dataset.data:
  column_values = dataset.data[column_name]
  dataset.data[column_name] = np.nan_to_num(column_values, nan=np.mean(column_values))

# Sanity check: total count of remaining NaN cells across all columns.
print(dataset.data.isna().sum().sum())

0


In [None]:
# Data Normalization
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the full feature table, then keep the scaled copy in
# `Data`; `dataset.data` itself is left as it was.
# NOTE(review): the split cell that follows reads dataset.data, not Data -
# confirm whether the scaled features are meant to be used downstream.
scaler = StandardScaler()
scaler.fit(dataset.data)
Data = scaler.transform(dataset.data)

In [None]:
# Data Split
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# Stratification key: the (treatment, target) pair of every row.
stratify_cols = pd.concat([dataset.treatment, dataset.target], axis=1)

# 70/30 split with a fixed seed; features, treatment flags and targets are
# split together so their rows stay aligned.
(
    X_UP_Train, X_UP_Test,
    T_UP_Train, T_UP_Test,
    Y_UP_Train, Y_UP_Test,
) = train_test_split(
    dataset.data, dataset.treatment, dataset.target,
    test_size=0.3,
    random_state=42,
    stratify=stratify_cols,
)

print(f"Train shape: {X_UP_Train.shape}")
print(f"Test shape: {X_UP_Test.shape}")

Train shape: (480920, 193)
Test shape: (206109, 193)


In [None]:
# Treatment pack for response
# Work on copies of the split feature tables and prepend the treatment flag as
# column 0, so the response models receive it as an ordinary feature.

X_Train = X_UP_Train.copy()
X_Train.insert(loc=0, column='treatment', value=T_UP_Train, allow_duplicates=True)

X_Test = X_UP_Test.copy()
X_Test.insert(loc=0, column='treatment', value=T_UP_Test, allow_duplicates=True)

In [None]:
# Response Model Evaluation
import matplotlib.pyplot as plt
import math

# ==== Accuracy Metrics ====
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

def normalize(value):
  """Convert raw model outputs into hard 0/1 class labels.

  Values are first clipped to the interval [0, 1] and then rounded, so a
  continuous output below 0 becomes class 0 and one above 1 becomes class 1.
  Outputs that are already 0 or 1 pass through unchanged.

  Args:
    value: scalar or array-like of model predictions.

  Returns:
    The clipped and rounded predictions, as returned by numpy.
  """
  # Clip before rounding: taking abs() instead would map e.g. -0.7 to class 1,
  # and plain rounding would map 1.7 to a non-existent class 2.
  return np.round(np.clip(value, 0, 1))

def accuracy(m, x, y):
  """Score a fitted model on the samples x against the true labels y.

  Args:
    m: fitted model exposing predict(); decision_function() or
      predict_proba() is used for ROC AUC when available.
    x: feature table passed to the model.
    y: true labels.

  Returns:
    A list [accuracy, precision, recall, f1, roc_auc]. Precision and recall
    are weighted averages; f1 is the harmonic mean of those two weighted
    values (0.0 when both are zero). roc_auc is NaN when the model offers
    neither decision_function nor predict_proba.
  """
  y_pred = normalize(m.predict(x))

  # Compute each weighted score once and reuse it for the f1 value.
  precision = precision_score(y, y_pred, average='weighted')
  recall = recall_score(y, y_pred, average='weighted')
  pr_sum = precision + recall
  f1 = (2 * precision * recall) / pr_sum if pr_sum else 0.0

  if hasattr(m, "decision_function"):
    auc = roc_auc_score(y, m.decision_function(x))
  elif hasattr(m, "predict_proba"):
    # assumes a binary target whose positive class is column 1 - TODO confirm
    auc = roc_auc_score(y, np.asarray(m.predict_proba(x))[:, 1])
  else:
    auc = math.nan

  return [accuracy_score(y, y_pred), precision, recall, f1, auc]

# ===== Model Evaluation ====
def evaluate_response(m):
  """Fit m on the training split and return its test-split scores.

  Uses the notebook-level X_Train / Y_UP_Train for fitting and
  X_Test / Y_UP_Test for scoring; the result is the list produced by accuracy().
  """
  fitted = m.fit(X_Train, Y_UP_Train)
  return accuracy(fitted, X_Test, Y_UP_Test)

# ===== Table Display ====
from IPython.display import display

def display_response(models): # [name, model]
  """Evaluate each [name, model] entry and display the scores as one table.

  The model name is printed before that model is evaluated, so the printed
  names show which model is currently running.
  """
  rows = []
  for entry in models:
    label, estimator = entry[0], entry[1]
    print(label)
    rows.append([label, *evaluate_response(estimator)])

  header = ['model', 'accuracy', 'precision', 'recall', 'f1', 'roc auc']
  display(pd.DataFrame(rows, columns=header))

In [None]:
# Response Models Definition
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# Candidate response models as [display name, unfitted estimator] pairs, in the
# order they are evaluated and listed in the results table.
# NOTE(review): ElasticNet is a regression estimator; its continuous output is
# only turned into class labels by normalize() during scoring.
ResponseModels = [
    ["Log Reg", LogisticRegression()],
    ["Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=42)],
    ["Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42)],
    ["AdaBoost", AdaBoostClassifier(algorithm="SAMME", random_state=42)],
    ["QDA", QuadraticDiscriminantAnalysis()],
    ["ElasticNet", ElasticNet(random_state=42)],
    ["SGD", SGDClassifier(loss='log_loss',max_iter=1000, tol=1e-3,random_state=42 )],
    ["GBC", GBC(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)],
    # verbose=False stops CatBoost from printing one log line per boosting
    # iteration into the notebook output; the fitted model is unaffected.
    ["CatBoost", CatBoostClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42, verbose=False)]
    ]

In [None]:
# Response Evaluation
# Fits every entry of ResponseModels on the training split, scores it on the
# test split, and displays one row of scores per model.
display_response(ResponseModels)

Log Reg
Decision Tree
Random Forest
AdaBoost
QDA
ElasticNet
SGD
GBC
CatBoost
0:	learn: 0.3100980	total: 165ms	remaining: 16.3s
1:	learn: 0.3043577	total: 318ms	remaining: 15.6s
2:	learn: 0.3022392	total: 479ms	remaining: 15.5s
3:	learn: 0.2970018	total: 647ms	remaining: 15.5s
4:	learn: 0.2951715	total: 805ms	remaining: 15.3s
5:	learn: 0.2936079	total: 972ms	remaining: 15.2s
6:	learn: 0.2925332	total: 1.13s	remaining: 15s
7:	learn: 0.2917678	total: 1.28s	remaining: 14.7s
8:	learn: 0.2912945	total: 1.49s	remaining: 15.1s
9:	learn: 0.2904116	total: 1.75s	remaining: 15.7s
10:	learn: 0.2896765	total: 2.03s	remaining: 16.4s
11:	learn: 0.2893597	total: 2.34s	remaining: 17.1s
12:	learn: 0.2890343	total: 2.57s	remaining: 17.2s
13:	learn: 0.2887105	total: 2.85s	remaining: 17.5s
14:	learn: 0.2886001	total: 3.11s	remaining: 17.6s
15:	learn: 0.2881888	total: 3.39s	remaining: 17.8s
16:	learn: 0.2880397	total: 3.63s	remaining: 17.7s
17:	learn: 0.2879026	total: 3.91s	remaining: 17.8s
18:	learn: 0.2875

Unnamed: 0,model,accuracy,precision,recall,f1,roc auc
0,Log Reg,0.891853,0.855507,0.891853,0.873302,0.625863
1,Decision Tree,0.894527,0.865824,0.894527,0.879941,
2,Random Forest,0.891815,0.903519,0.891815,0.897629,
3,AdaBoost,0.894498,0.867995,0.894498,0.881047,0.781531
4,QDA,0.876788,0.835026,0.876788,0.855398,0.480435
5,ElasticNet,0.891863,0.852962,0.891863,0.871979,
6,SGD,0.874401,0.849752,0.874401,0.8619,0.590167
7,GBC,0.896773,0.871676,0.896773,0.884046,0.789076
8,CatBoost,0.896817,0.871749,0.896817,0.884105,
