In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, confusion_matrix, make_scorer, roc_auc_score, roc_curve, balanced_accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold


from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectFdr, SelectKBest, SelectPercentile


from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE as SMOTEup
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline as Pipeline_Imb

import xgboost as xgb

In [15]:
df_data = pd.read_csv('ArbuzTrain.csv')
df_test = pd.read_csv('ArbuzTest.csv')

In [17]:
df_data = df_data.drop(columns=[
    'client_id', 
    ])

df_test_drop = df_test.drop(columns=[
    'client_id', 
    ])


In [19]:
df_test_drop.isna().sum()

average_check                      0
number_orders                      0
average_score                    462
items_per_order                    0
is_friend                          0
last_check                         4
last_score                      1797
last_scored_after_order_days    1797
last_late_by_minutes               0
is_last_order_fullfilled           0
items_share_category_1             0
items_share_category_2             0
items_share_category_3             0
items_share_category_4             0
items_share_category_5             0
items_share_category_6             0
items_share_category_7             0
items_share_category_8             0
items_share_category_9             0
items_share_category_10            0
items_share_category_11            0
items_share_category_12            0
items_share_category_13            0
items_share_category_14            0
items_share_category_15            0
items_share_category_16            0
items_share_category_17            0
i

In [20]:
df_data.isna().sum()

average_check                      0
number_orders                      0
average_score                   1797
items_per_order                    0
is_friend                          0
last_check                         5
last_score                      7196
last_scored_after_order_days    7196
last_late_by_minutes               0
is_last_order_fullfilled           0
items_share_category_1             0
items_share_category_2             0
items_share_category_3             0
items_share_category_4             0
items_share_category_5             0
items_share_category_6             0
items_share_category_7             0
items_share_category_8             0
items_share_category_9             0
items_share_category_10            0
items_share_category_11            0
items_share_category_12            0
items_share_category_13            0
items_share_category_14            0
items_share_category_15            0
items_share_category_16            0
items_share_category_17            0
i

In [21]:
df_data.loc[df_data['last_late_by_minutes'] <= 5, 'last_late_by_minutes'] = 0
df_data.loc[df_data['last_late_by_minutes'] > 5, 'last_late_by_minutes'] = 1

df_data["last_score"] = df_data["last_score"].fillna(100)
df_data['last_score'] = pd.cut(df_data['last_score'], bins=[0, 5, 8, 10, 100], labels=[0, 1, 2, 3])
one_hot1 = pd.get_dummies(df_data['last_score'], prefix="last_score")
df_data = df_data.drop(columns=['last_score'])
df_data = df_data.join(one_hot1)


df_data["average_score"] = df_data["average_score"].fillna(100)
df_data['average_score'] = pd.cut(df_data['average_score'], bins=[0, 5, 8, 10, 100], labels=[0, 1, 2, 3])
one_hot2 = pd.get_dummies(df_data['average_score'], prefix="average_score")
df_data = df_data.drop(columns=['average_score'])
df_data = df_data.join(one_hot2)

df_data["last_scored_after_order_days"] = df_data["last_scored_after_order_days"].fillna(999)
df_data['last_scored_after_order_days'] = pd.cut(df_data['last_scored_after_order_days'], bins=[-0.1, 0.9, 7, 500, 999], labels=[0, 1, 2, 3])
one_hot3 = pd.get_dummies(df_data['last_scored_after_order_days'], prefix="last_scored_after_order_days")
df_data = df_data.drop(columns=['last_scored_after_order_days'])
df_data = df_data.join(one_hot3)

df_data["last_check"] = (df_data["average_check"] / df_data["number_orders"]).where(cond=df_data['last_check'].isna(), other=df_data['last_check'])

In [23]:
df_test_drop.loc[df_test_drop['last_late_by_minutes'] <= 5, 'last_late_by_minutes'] = 0
df_test_drop.loc[df_test_drop['last_late_by_minutes'] > 5, 'last_late_by_minutes'] = 1

df_test_drop["last_score"] = df_test_drop["last_score"].fillna(100)
df_test_drop['last_score'] = pd.cut(df_test_drop['last_score'], bins=[0, 5, 8, 10, 100], labels=[0, 1, 2, 3])
one_hot1 = pd.get_dummies(df_test_drop['last_score'], prefix="last_score")
df_test_drop = df_test_drop.drop(columns=['last_score'])
df_test_drop = df_test_drop.join(one_hot1)


df_test_drop["average_score"] = df_test_drop["average_score"].fillna(100)
df_test_drop['average_score'] = pd.cut(df_test_drop['average_score'], bins=[0, 5, 8, 10, 100], labels=[0, 1, 2, 3])
one_hot2 = pd.get_dummies(df_test_drop['average_score'], prefix="average_score")
df_test_drop = df_test_drop.drop(columns=['average_score'])
df_test_drop = df_test_drop.join(one_hot2)

df_test_drop["last_scored_after_order_days"] = df_test_drop["last_scored_after_order_days"].fillna(999)
df_test_drop['last_scored_after_order_days'] = pd.cut(df_test_drop['last_scored_after_order_days'], bins=[-0.1, 0.9, 7, 500, 999], labels=[0, 1, 2, 3])
one_hot3 = pd.get_dummies(df_test_drop['last_scored_after_order_days'], prefix="last_scored_after_order_days")
df_test_drop = df_test_drop.drop(columns=['last_scored_after_order_days'])
df_test_drop = df_test_drop.join(one_hot3)

df_test_drop["last_check"] = (df_test_drop["average_check"] / df_test_drop["number_orders"]).where(cond=df_test_drop['last_check'].isna(), other=df_test_drop['last_check'])

In [28]:
df_data.isna().sum().sum()

0

In [29]:
df_test_drop.isna().sum().sum()

0

In [30]:
df_data_x = df_data.drop(columns=['is_churn'])
df_data_y = df_data['is_churn'].values

In [31]:
X_train, X_test, y_train, y_test= train_test_split(df_data_x, df_data_y, random_state=42, test_size=0.2, stratify=df_data_y)

print('X_train_shape: ' + str(X_train.shape) + '\nX_val_shape: ' + str(X_test.shape)\
       + '\ny_train_shape: ' + str(y_train.shape) + '\ny_val_shape: ' + str(y_test.shape))

X_train_shape: (8787, 46)
X_val_shape: (2197, 46)
y_train_shape: (8787,)
y_val_shape: (2197,)


In [32]:
df_data_x

Unnamed: 0,average_check,number_orders,items_per_order,is_friend,last_check,last_late_by_minutes,is_last_order_fullfilled,items_share_category_1,items_share_category_2,items_share_category_3,...,last_score_2,last_score_3,average_score_0,average_score_1,average_score_2,average_score_3,last_scored_after_order_days_0,last_scored_after_order_days_1,last_scored_after_order_days_2,last_scored_after_order_days_3
0,49300,7,23.4286,0,9250.0,0,1,0.00,0.00,0.00,...,0,1,0,0,1,0,0,0,0,1
1,58900,7,5.4286,0,24547.0,0,1,0.00,0.00,0.00,...,0,1,0,0,1,0,0,0,0,1
2,43100,6,15.3333,0,6017.0,0,0,0.02,0.00,0.00,...,0,1,0,0,1,0,0,0,0,1
3,33900,8,9.6250,0,7316.0,0,1,0.00,0.00,0.00,...,0,1,1,0,0,0,0,0,0,1
4,37900,9,14.2222,0,2175.0,0,1,0.00,0.02,0.01,...,1,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10979,22600,7,7.0000,0,10924.0,0,1,0.00,0.00,0.00,...,1,0,0,0,1,0,1,0,0,0
10980,68600,18,18.3333,1,28262.0,0,0,0.00,0.00,0.00,...,0,0,0,0,1,0,0,1,0,0
10981,96900,7,25.1429,0,31532.0,1,1,0.00,0.00,0.00,...,1,0,0,0,1,0,0,1,0,0
10982,11400,18,1.8333,0,0.0,0,1,0.00,0.00,0.00,...,0,1,0,0,1,0,0,0,0,1


In [33]:
df_test_drop

Unnamed: 0,average_check,number_orders,items_per_order,is_friend,last_check,last_late_by_minutes,is_last_order_fullfilled,items_share_category_1,items_share_category_2,items_share_category_3,...,last_score_2,last_score_3,average_score_0,average_score_1,average_score_2,average_score_3,last_scored_after_order_days_0,last_scored_after_order_days_1,last_scored_after_order_days_2,last_scored_after_order_days_3
0,53500,9,18.5556,1,9590.0,0,1,0.05,0.00,0.00,...,0,1,0,0,1,0,0,0,0,1
1,41800,31,11.3548,0,4839.0,0,1,0.01,0.00,0.00,...,0,0,0,1,0,0,0,0,1,0
2,137900,12,31.7500,0,33861.0,0,1,0.00,0.01,0.01,...,1,0,0,0,1,0,0,1,0,0
3,26700,27,10.7407,0,8130.0,0,1,0.04,0.00,0.00,...,1,0,0,0,1,0,1,0,0,0
4,31900,16,8.8750,0,11428.0,0,1,0.01,0.01,0.00,...,1,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2742,43500,6,15.8333,0,12945.0,0,0,0.00,0.00,0.00,...,0,1,0,1,0,0,0,0,0,1
2743,17200,15,8.2667,0,3425.0,0,1,0.00,0.00,0.00,...,0,1,0,0,1,0,0,0,0,1
2744,47600,12,17.0000,0,11856.0,0,1,0.00,0.00,0.00,...,0,1,0,0,0,1,0,0,0,1
2745,35100,12,6.8333,0,8536.0,0,1,0.06,0.00,0.00,...,0,1,0,0,1,0,0,0,0,1


In [41]:
scaler = StandardScaler()

kselect = SelectPercentile()

downsample=SMOTETomek(tomek=TomekLinks(sampling_strategy="majority"), random_state=42)
upsample = SMOTEup(sampling_strategy="minority", random_state=42)

In [41]:

xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
ada = AdaBoostClassifier(random_state=42)

pipe = Pipeline_Imb(steps=[
    ("scaler", scaler), 
    
    # ("downsample", downsample), 
    # ("upsample", upsample), 

    ('kperc', kselect),

    ("rf", rf), 
    # ("ada", ada), 
    # ("xgb_cls", xgb_model)
    
    ])


param_grid = {
    
    "kperc__percentile": [10, 30, 50, 70, 100],


    'rf__n_estimators': [10, 100, 1000, 10000], 
    'rf__max_features': ['sqrt', 'log2'], 
    'rf__max_depth': [2, 5, 10, 20, 50, 100],


    # 'ada__n_estimators': [10, 100, 1000], 
    # 'ada__learning_rate': [0.001, 0.01, 0.1, 1],

    # "xgb_cls__max_depth": [2, 5, 10, 20, 50, 100],
    # "xgb_cls__n_estimators": [10, 100, 1000, 10000],
    # "xgb_cls__max_features": ['auto', 'log2'],
}



skf = StratifiedKFold(n_splits=3)
auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)
search = GridSearchCV(pipe, param_grid, cv=skf, n_jobs=-1, refit=True, scoring='f1_macro', verbose=20)
# search = GridSearchCV(pipe, param_grid, cv=skf, n_jobs=-1, refit=True, scoring=auc, verbose=20)
search.fit(X_train, y_train)


Fitting 3 folds for each of 240 candidates, totalling 720 fits


In [537]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

y_pred_train = search.predict(X_train)
f1_train = f1_score(y_train, y_pred_train)

y_pred = search.predict(X_test)
f1_test = f1_score(y_test, y_pred, average='macro')
print(f'f1 on train is {f1_train}')
print(f'f1 on test is {f1_test}')

Best parameter (CV score=0.640):
{'kperc__percentile': 100, 'rf__max_depth': 10, 'rf__max_features': 'log2', 'rf__n_estimators': 100}
f1 on train is 0.6756107328794553
f1 on test is 0.6204438857277286


In [485]:
test_pred = search.predict(df_test_drop)

subm = pd.DataFrame({'client_id':df_test['client_id'], 'is_churn':test_pred})

subm.to_csv('subm6.csv', index=False)

In [56]:


rf = RandomForestClassifier(max_depth=10, n_estimators=100, max_features='log2', random_state=42, class_weight='balanced') # 100 perc no ups/dws
ada = AdaBoostClassifier(learning_rate=0.01, n_estimators=10000, random_state=42) # 30 perc
xgb_c = xgb.XGBClassifier(max_depth=5, n_estimators=5, max_features='auto', objective="binary:logistic", random_state=42) #10 perc




voting = VotingClassifier(estimators=[

    ('xgb', xgb_c),
    ('rf',rf),
    ('ada',ada),

    ], verbose=True)

# pipe_vote = Pipeline(steps=[
pipe_vote = Pipeline_Imb(steps=[
    ("scaler", scaler), 
    # ("downsample", downsample), 
    ("upsample", upsample), 

    ('kperc', kselect),


    ("voting_c", voting)
    
    ])

param_grid_vote = {
    "kperc__percentile": [10, 30, 50, 70, 100],

    'voting_c__voting': ['hard','soft'],

}



skf = StratifiedKFold(n_splits=3)
auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)
votes = GridSearchCV(pipe_vote, param_grid_vote, cv=skf, n_jobs=-1, refit=True, scoring='f1_macro', verbose=20)
votes.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Parameters: { "max_features" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[Voting] ...................... (1 of 3) Processing xgb, total=   0.0s
[Voting] ....................... (2 of 3) Processing rf, total=   0.8s
[Voting] ...................... (3 of 3) Processing ada, total=  55.4s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('upsample',
                                        SMOTE(random_state=42,
                                              sampling_strategy='minority')),
                                       ('kperc', SelectPercentile()),
                                       ('voting_c',
                                        VotingClassifier(estimators=[('xgb',
                                                                      XGBClassifier(base_score=None,
                                                                                    booster=None,
                                                                                    callbacks=None,
                                                                                    colsample_bylevel=No...
                                                     

In [57]:
print("Best parameter (CV score=%0.3f):" % votes.best_score_)
print(votes.best_params_)

y_pred_train = votes.predict(X_train)
f1_train = f1_score(y_train, y_pred_train, average='macro')

y_pred = votes.predict(X_test)
f1_test = f1_score(y_test, y_pred, average='macro')
print(f'f1 on train is {f1_train}')
print(f'f1 on test is {f1_test}')

Best parameter (CV score=0.634):
{'kperc__percentile': 10, 'voting_c__voting': 'soft'}
f1 on train is 0.6651441659993165
f1 on test is 0.6196361665403721


In [58]:
test_pred = votes.predict(df_test_drop)
subm = pd.DataFrame({'client_id':df_test['client_id'], 'is_churn':test_pred})

subm.to_csv('subm12.csv', index=False)