## Imports

In [1]:
import os

%matplotlib inline
import string
import sys
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sys.path.append(os.path.join(os.path.abspath("."), "code"))

from plotting_functions import *
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from utils import *
import warnings
warnings.filterwarnings('ignore')

### Data

In [138]:
train_df_nan = pd.read_csv("data/AI_train.csv")
test_df_nan = pd.read_csv("data/test.csv")
# train_df_nan = train_df_nan.assign(Overall_Usage_Frequency=train_df_nan['Overall_Usage_Frequency']/30)
# test_df_nan = test_df_nan.assign(Overall_Usage_Frequency=test_df_nan['Overall_Usage_Frequency']/30)
train_df = train_df_nan
test_df = test_df_nan

In [139]:
numeric_features = ['Age']

drop_features = ['ID']
target_column = "Customer_Churn"

passthrough_features = ['AI_Response_Time', 'AI_Personalization_Effectiveness', 'AI_Interaction_Level', 'Satisfaction_with_AI_Services',
       'Change_in_Usage_Patterns']

categorical_features = ['Overall_Usage_Frequency', 'Customer_Service_Interactions']



In [140]:
numeric_transformer = StandardScaler()

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
)

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    ("passthrough", passthrough_features),
    (categorical_transformer, categorical_features),
    ("drop", drop_features),
)

In [141]:
preprocessor

In [142]:
X_train = train_df_nan.drop(columns=[target_column])
y_train = train_df_nan[target_column]

X_test = test_df_nan.drop(columns=[target_column])
y_test = test_df_nan[target_column]

# from imblearn.over_sampling import SMOTE

# Create an instance of SMOTE
# smote = SMOTE(random_state=42)

# Apply SMOTE to generate synthetic samples
# X_train, y_train = smote.fit_resample(X_train, y_train)


from imblearn.over_sampling import RandomOverSampler

# Assuming X_train and y_train are your feature and target matrices respectively
# X_train should contain the features of your training data
# y_train should contain the corresponding labels (0 or 1 for binary classification)

# Create an instance of RandomOverSampler with sampling strategy='minority'
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)

# Apply RandomOverSampler to generate synthetic samples only for the minority class
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [143]:
train_df_nan["Customer_Churn"].value_counts(normalize=True)

Customer_Churn
0    0.5818
1    0.4182
Name: proportion, dtype: float64

In [144]:
scoring_metric = "accuracy"

We are going to use models outside sklearn. Some of them cannot handle categorical target values. So we'll convert them to integers using `LabelEncoder`. 

In [145]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_num = label_encoder.fit_transform(y_train)
y_test_num = label_encoder.transform(y_test)

In [146]:
y_train_num

array([1, 0, 1, ..., 1, 1, 1])

Let's store all the results in a dictionary called `results`. 

In [147]:
results = {}

### Baselines

####  `DummyClassifier` baseline

In [148]:
dummy = DummyClassifier()
results["Dummy"] = mean_std_cross_val_scores(
    dummy, X_train, y_train_num, return_train_score=True, scoring=scoring_metric
)

#### `DecisionTreeClassifier` baseline

In [149]:
pipe_dt = make_pipeline(preprocessor, DecisionTreeClassifier(random_state=123))
results["Decision tree"] = mean_std_cross_val_scores(
    pipe_dt, X_train, y_train_num, return_train_score=True, scoring=scoring_metric
)
pd.DataFrame(results).T
pipe_dt.fit(X_train, y_train)
feature_names = (
    numeric_features + passthrough_features + pipe_dt.named_steps["columntransformer"]
    .named_transformers_["pipeline"]
    .named_steps["onehotencoder"]
    .get_feature_names_out(categorical_features)
    .tolist()
)



data = {
    "Importance": pipe_dt.named_steps["decisiontreeclassifier"].feature_importances_,
}
pd.DataFrame(data=data, index=feature_names,).sort_values(
    by="Importance", ascending=False
)

Unnamed: 0,Importance
Age,0.151661
Satisfaction_with_AI_Services,0.126631
AI_Personalization_Effectiveness,0.083444
Change_in_Usage_Patterns,0.056431
AI_Interaction_Level,0.049232
AI_Response_Time,0.046323
Customer_Service_Interactions_4,0.020211
Customer_Service_Interactions_0,0.018988
Customer_Service_Interactions_7,0.018405
Customer_Service_Interactions_9,0.018379


### `RandomForestClassifier` 

In [150]:
from sklearn.ensemble import RandomForestClassifier

pipe_rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_jobs=-1,
        random_state=123,
        n_estimators=100
    ),
)
results["Random forests"] = mean_std_cross_val_scores(
    pipe_rf, X_train, y_train_num, return_train_score=True, scoring=scoring_metric
)
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_score,train_score
Dummy,0.002 (+/- 0.000),0.001 (+/- 0.001),0.500 (+/- 0.000),0.500 (+/- 0.000)
Decision tree,0.041 (+/- 0.004),0.006 (+/- 0.001),0.659 (+/- 0.062),1.000 (+/- 0.000)
Random forests,0.322 (+/- 0.036),0.040 (+/- 0.006),0.690 (+/- 0.048),1.000 (+/- 0.000)


In [151]:
pipe_rf_demo = make_pipeline(
    preprocessor, RandomForestClassifier(max_depth=2, n_estimators=3, random_state=123)
)
pipe_rf_demo.fit(X_train, y_train_num);

pipe_rf_demo

In [152]:
feature_names = (
    numeric_features + passthrough_features + pipe_dt.named_steps["columntransformer"]
    .named_transformers_["pipeline"]
    .named_steps["onehotencoder"]
    .get_feature_names_out(categorical_features)
    .tolist()
)
feature_names[:10]

['Age',
 'AI_Response_Time',
 'AI_Personalization_Effectiveness',
 'AI_Interaction_Level',
 'Satisfaction_with_AI_Services',
 'Change_in_Usage_Patterns',
 'Overall_Usage_Frequency_1',
 'Overall_Usage_Frequency_2',
 'Overall_Usage_Frequency_3',
 'Overall_Usage_Frequency_4']

In [153]:
test_example = X_test.iloc[[582]]
pipe_rf_demo.predict_proba(test_example)
print("Classes: ", pipe_rf_demo.classes_)
print("Prediction by random forest: ", pipe_rf_demo.predict(test_example))
transformed_example = preprocessor.transform(test_example)
pd.DataFrame(data=transformed_example.flatten(), index=feature_names)

Classes:  [0 1]
Prediction by random forest:  [0]


Unnamed: 0,0
Age,1.137215
AI_Response_Time,3.0
AI_Personalization_Effectiveness,7.0
AI_Interaction_Level,1.0
Satisfaction_with_AI_Services,7.0
Change_in_Usage_Patterns,-1.0
Overall_Usage_Frequency_1,0.0
Overall_Usage_Frequency_2,0.0
Overall_Usage_Frequency_3,0.0
Overall_Usage_Frequency_4,0.0


In [154]:
ratio = np.bincount(y_train_num)[0] / np.bincount(y_train_num)[1]
ratio

1.0

In [155]:
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

pipe_lr = make_pipeline(
    preprocessor, LogisticRegression(max_iter=2000, random_state=123)
)
pipe_dt = make_pipeline(preprocessor, DecisionTreeClassifier(random_state=123))
pipe_rf = make_pipeline(
    preprocessor, RandomForestClassifier(class_weight="balanced", random_state=123, n_estimators=100)
)
pipe_xgb = make_pipeline(
    preprocessor,
    XGBClassifier(
        random_state=123, verbosity=0
    ),
)
pipe_lgbm = make_pipeline(
    preprocessor, LGBMClassifier(random_state=123, verbose=-1)
)

pipe_catboost = make_pipeline(
    preprocessor,
    CatBoostClassifier(verbose=0, random_state=123),
)

pipe_sklearn_histGB = make_pipeline(
    preprocessor,
    HistGradientBoostingClassifier(random_state=123),
)

pipe_sklearn_GB = make_pipeline(
    preprocessor,
    GradientBoostingClassifier(random_state=123),
)

classifiers = {
    "logistic regression": pipe_lr,
    "decision tree": pipe_dt,
    "random forest": pipe_rf,
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "CatBoost": pipe_catboost,
    "sklearn_histGB": pipe_sklearn_histGB,
    "sklearn_GB": pipe_sklearn_GB,
}

In [156]:
import warnings

warnings.simplefilter(action="ignore", category=DeprecationWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

In [157]:
results = {}

In [158]:
dummy = DummyClassifier()
results["Dummy"] = mean_std_cross_val_scores(
    dummy, X_train, y_train, return_train_score=True, scoring=scoring_metric
)

In [159]:
for (name, model) in classifiers.items():
    results[name] = mean_std_cross_val_scores(
        model, X_train, y_train_num, return_train_score=True, scoring=scoring_metric
    )

In [160]:
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_score,train_score
Dummy,0.003 (+/- 0.001),0.002 (+/- 0.001),0.500 (+/- 0.000),0.500 (+/- 0.000)
logistic regression,0.067 (+/- 0.022),0.006 (+/- 0.001),0.583 (+/- 0.007),0.595 (+/- 0.004)
decision tree,0.039 (+/- 0.002),0.006 (+/- 0.001),0.659 (+/- 0.062),1.000 (+/- 0.000)
random forest,0.562 (+/- 0.010),0.030 (+/- 0.001),0.693 (+/- 0.050),1.000 (+/- 0.000)
XGBoost,0.403 (+/- 0.059),0.009 (+/- 0.000),0.658 (+/- 0.048),0.930 (+/- 0.004)
LightGBM,0.063 (+/- 0.002),0.010 (+/- 0.000),0.645 (+/- 0.026),0.801 (+/- 0.009)
CatBoost,1.548 (+/- 0.050),0.022 (+/- 0.002),0.659 (+/- 0.032),0.839 (+/- 0.008)
sklearn_histGB,0.359 (+/- 0.015),0.010 (+/- 0.001),0.645 (+/- 0.031),0.797 (+/- 0.008)
sklearn_GB,0.530 (+/- 0.024),0.008 (+/- 0.000),0.633 (+/- 0.016),0.678 (+/- 0.003)


## Averaging 

In [97]:
classifiers.keys()

dict_keys(['logistic regression', 'decision tree', 'random forest', 'XGBoost', 'LightGBM', 'CatBoost', 'sklearn_histGB', 'sklearn_GB'])

In [161]:
# del classifiers["decision tree"]
del classifiers["logistic regression"]
# del classifiers["XGBoost"]

In [162]:
classifiers.keys()

dict_keys(['decision tree', 'random forest', 'XGBoost', 'LightGBM', 'CatBoost', 'sklearn_histGB', 'sklearn_GB'])

In [163]:
from sklearn.ensemble import VotingClassifier

averaging_model = VotingClassifier(
    list(classifiers.items()), voting="soft"
)  # need the list() here for cross-validation to work!

In [164]:
from sklearn import set_config

set_config(display="diagram")  # global setting

In [165]:
averaging_model

In [166]:
averaging_model.fit(X_train, y_train_num);

In [167]:
averaging_model.classes_

array([0, 1])

In [168]:
test_1 = (
    test_df.query("Customer_Churn == 1")
    .sample(4, random_state=42)
    .drop(columns=["Customer_Churn"])
)
test_0 = (
    test_df.query("Customer_Churn == 0")
    .sample(4, random_state=2)
    .drop(columns=["Customer_Churn"])
)

In [169]:
data = {"y": 1, "Voting classifier": averaging_model.predict(test_1)}
pd.DataFrame(data)

Unnamed: 0,y,Voting classifier
0,1,0
1,1,0
2,1,0
3,1,1


In [176]:
r1 = {
    name: classifier.predict(test_1)
    for name, classifier in averaging_model.named_estimators_.items()
}
data.update(r1)
pd.DataFrame(data)

Unnamed: 0,y,Voting classifier,decision tree,random forest,XGBoost,LightGBM,CatBoost,sklearn_histGB,sklearn_GB
0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1


In [177]:
r2 = {
    name: classifier.predict_proba(test_1)[:, 1]
    for name, classifier in averaging_model.named_estimators_.items()
}

data.update(r2)
pd.DataFrame(data)

Unnamed: 0,y,Voting classifier,decision tree,random forest,XGBoost,LightGBM,CatBoost,sklearn_histGB,sklearn_GB
0,1,0,0.0,0.31,0.120289,0.314825,0.34855,0.351209,0.266905
1,1,0,0.0,0.43,0.295145,0.225388,0.268481,0.228505,0.436212
2,1,0,0.0,0.29,0.099194,0.272523,0.260185,0.247643,0.247792
3,1,1,1.0,0.58,0.673099,0.597204,0.607041,0.587166,0.545149


In [178]:
averaging_model.predict_proba(test_1)[1]

array([0.73089559, 0.26910441])

In [179]:
# Sum of probabilities for class 0 at index 1
sum_prob_ex1_class_0 = np.sum(
    [
        classifier.predict_proba(test_1)[1][0]
        for name, classifier in averaging_model.named_estimators_.items()
    ]
)
sum_prob_ex1_class_0

5.116269157303609

In [180]:
# Sum of probabilities for class 1 at index 1
sum_prob_ex1_class_1 = np.sum(
    [
        classifier.predict_proba(test_1)[1][1]
        for name, classifier in averaging_model.named_estimators_.items()
    ]
)
sum_prob_ex1_class_1

1.8837308426963915

In [181]:
n_constituents = len(averaging_model.named_estimators_)
n_constituents

7

In [182]:
sum_prob_ex1_class_0 / n_constituents, sum_prob_ex1_class_1 / n_constituents

(0.7308955939005156, 0.2691044060994845)

In [183]:
averaging_model.predict_proba(test_1)[1]

array([0.73089559, 0.26910441])

In [184]:
averaging_model.predict_proba(test_1)[2]

array([0.79752323, 0.20247677])

In [185]:
results["Voting"] = mean_std_cross_val_scores(
    averaging_model, X_train, y_train, return_train_score=True, scoring=scoring_metric
)

In [186]:
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_score,train_score
Dummy,0.003 (+/- 0.001),0.002 (+/- 0.001),0.500 (+/- 0.000),0.500 (+/- 0.000)
logistic regression,0.067 (+/- 0.022),0.006 (+/- 0.001),0.583 (+/- 0.007),0.595 (+/- 0.004)
decision tree,0.039 (+/- 0.002),0.006 (+/- 0.001),0.659 (+/- 0.062),1.000 (+/- 0.000)
random forest,0.562 (+/- 0.010),0.030 (+/- 0.001),0.693 (+/- 0.050),1.000 (+/- 0.000)
XGBoost,0.403 (+/- 0.059),0.009 (+/- 0.000),0.658 (+/- 0.048),0.930 (+/- 0.004)
LightGBM,0.063 (+/- 0.002),0.010 (+/- 0.000),0.645 (+/- 0.026),0.801 (+/- 0.009)
CatBoost,1.548 (+/- 0.050),0.022 (+/- 0.002),0.659 (+/- 0.032),0.839 (+/- 0.008)
sklearn_histGB,0.359 (+/- 0.015),0.010 (+/- 0.001),0.645 (+/- 0.031),0.797 (+/- 0.008)
sklearn_GB,0.530 (+/- 0.024),0.008 (+/- 0.000),0.633 (+/- 0.016),0.678 (+/- 0.003)
Voting,3.809 (+/- 0.224),0.088 (+/- 0.002),0.675 (+/- 0.048),0.979 (+/- 0.002)


In [187]:
classifiers_ndt = classifiers.copy()
# del classifiers_ndt["sklearn_GB"]

averaging_model_ndt = VotingClassifier(
    list(classifiers_ndt.items()), voting="soft"
)  # need the list() here for cross_val to work!

results["Voting_ndt"] = mean_std_cross_val_scores(
    averaging_model_ndt,
    X_train,
    y_train,
    return_train_score=True,
    scoring=scoring_metric,
)

In [188]:
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_score,train_score
Dummy,0.003 (+/- 0.001),0.002 (+/- 0.001),0.500 (+/- 0.000),0.500 (+/- 0.000)
logistic regression,0.067 (+/- 0.022),0.006 (+/- 0.001),0.583 (+/- 0.007),0.595 (+/- 0.004)
decision tree,0.039 (+/- 0.002),0.006 (+/- 0.001),0.659 (+/- 0.062),1.000 (+/- 0.000)
random forest,0.562 (+/- 0.010),0.030 (+/- 0.001),0.693 (+/- 0.050),1.000 (+/- 0.000)
XGBoost,0.403 (+/- 0.059),0.009 (+/- 0.000),0.658 (+/- 0.048),0.930 (+/- 0.004)
LightGBM,0.063 (+/- 0.002),0.010 (+/- 0.000),0.645 (+/- 0.026),0.801 (+/- 0.009)
CatBoost,1.548 (+/- 0.050),0.022 (+/- 0.002),0.659 (+/- 0.032),0.839 (+/- 0.008)
sklearn_histGB,0.359 (+/- 0.015),0.010 (+/- 0.001),0.645 (+/- 0.031),0.797 (+/- 0.008)
sklearn_GB,0.530 (+/- 0.024),0.008 (+/- 0.000),0.633 (+/- 0.016),0.678 (+/- 0.003)
Voting,3.809 (+/- 0.224),0.088 (+/- 0.002),0.675 (+/- 0.048),0.979 (+/- 0.002)


In [189]:
from sklearn.ensemble import StackingClassifier

In [190]:
stacking_model = StackingClassifier(list(classifiers.items()))

In [191]:
stacking_model.fit(X_train, y_train);

In [192]:
valid_sample_df = train_df.sample(10, random_state=12)
valid_sample_X = valid_sample_df.drop(columns=["Customer_Churn"])
valid_sample_y = valid_sample_df['Customer_Churn']

In [193]:
pd.DataFrame(
    data=stacking_model.final_estimator_.coef_.flatten(),
    index=classifiers.keys(),
    columns=["Coefficient"],
).sort_values("Coefficient", ascending=False)

Unnamed: 0,Coefficient
random forest,6.943723
sklearn_GB,0.496538
decision tree,0.112022
LightGBM,-0.070699
XGBoost,-0.466336
CatBoost,-0.563412
sklearn_histGB,-1.399237


In [194]:
stacking_model.final_estimator_.intercept_

array([-2.79909722])

In [195]:
stacking_model.predict(test_0)

array([0, 0, 0, 0])

In [196]:
stacking_model.predict_proba(test_1)

array([[0.78247854, 0.21752146],
       [0.55509291, 0.44490709],
       [0.77224076, 0.22775924],
       [0.47720071, 0.52279929]])

In [197]:
results["Stacking"] = mean_std_cross_val_scores(
    stacking_model, X_train, y_train, return_train_score=True, scoring=scoring_metric
)

In [198]:
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_score,train_score
Dummy,0.003 (+/- 0.001),0.002 (+/- 0.001),0.500 (+/- 0.000),0.500 (+/- 0.000)
logistic regression,0.067 (+/- 0.022),0.006 (+/- 0.001),0.583 (+/- 0.007),0.595 (+/- 0.004)
decision tree,0.039 (+/- 0.002),0.006 (+/- 0.001),0.659 (+/- 0.062),1.000 (+/- 0.000)
random forest,0.562 (+/- 0.010),0.030 (+/- 0.001),0.693 (+/- 0.050),1.000 (+/- 0.000)
XGBoost,0.403 (+/- 0.059),0.009 (+/- 0.000),0.658 (+/- 0.048),0.930 (+/- 0.004)
LightGBM,0.063 (+/- 0.002),0.010 (+/- 0.000),0.645 (+/- 0.026),0.801 (+/- 0.009)
CatBoost,1.548 (+/- 0.050),0.022 (+/- 0.002),0.659 (+/- 0.032),0.839 (+/- 0.008)
sklearn_histGB,0.359 (+/- 0.015),0.010 (+/- 0.001),0.645 (+/- 0.031),0.797 (+/- 0.008)
sklearn_GB,0.530 (+/- 0.024),0.008 (+/- 0.000),0.633 (+/- 0.016),0.678 (+/- 0.003)
Voting,3.809 (+/- 0.224),0.088 (+/- 0.002),0.675 (+/- 0.048),0.979 (+/- 0.002)


In [199]:
# stacking_model_tree = StackingClassifier(
#     list(classifiers.items()), final_estimator=DecisionTreeClassifier(max_depth=3)
# )

Because LightGBM performing on the best we will use this model on the test data.

In [200]:
models = [pipe_rf, pipe_catboost, pipe_dt, pipe_lgbm, pipe_sklearn_histGB, pipe_sklearn_GB, pipe_xgb, averaging_model, averaging_model_ndt, stacking_model]


for model in models:
    model.fit(X_train, y_train)
    print(model.score(test_df.drop(columns = "Customer_Churn"), test_df["Customer_Churn"]))
    

# pipe_rf.fit(X_train, y_train)

# print("Accuracy:") 

# print(pipe_rf.score(test_df.drop(columns = "Customer_Churn"), test_df["Customer_Churn"]))

0.6784
0.6264
0.5752
0.6008
0.6152
0.5928
0.6432
0.64
0.64
0.6952


In [201]:
submission = pd.DataFrame(stacking_model.predict(test_df.drop(columns = "Customer_Churn"))).rename_axis('id').rename(columns={0: 'Prediction'})
submission.index += 1
display(submission)

pd.DataFrame(submission).to_csv("submission.csv")

Unnamed: 0_level_0,Prediction
id,Unnamed: 1_level_1
1,0
2,0
3,0
4,1
5,0
...,...
1246,1
1247,1
1248,0
1249,0


Unnamed: 0_level_0,Prediction
id,Unnamed: 1_level_1
2,0
3,0
5,0
6,0
7,0
...,...
1242,0
1245,0
1248,0
1249,0
