In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import optuna
from sklearn.metrics import f1_score,accuracy_score,recall_score,roc_auc_score,precision_score
from catboost import Pool, CatBoostClassifier
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from imblearn.over_sampling import SMOTE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier


from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,classification_report,f1_score

In [8]:
# Read the train and test CSV files, then preview the first training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/avjune/train.csv",
        "../input/avjune/test_Wf7sxXF.csv",
    )
)
train.head()

### Converting the columns into datetime

In [9]:
# Parse both timestamp columns so the `.dt` accessor can be used on them below.
train = train.assign(
    created_at=pd.to_datetime(train['created_at']),
    signup_date=pd.to_datetime(train['signup_date']),
)
train.dtypes

### Extracting the datetime features from datetime columns

In [10]:
# Split each timestamp into day / month / year columns
# (c_* comes from created_at, s_* from signup_date), then replace every
# remaining missing value in the frame with 0.
created = train["created_at"].dt
signed_up = train["signup_date"].dt
train = train.assign(
    c_day=created.day,
    c_month=created.month,
    c_year=created.year,
    s_day=signed_up.day,
    s_month=signed_up.month,
    s_year=signed_up.year,
).fillna(0)
train.head()

### Keeping only the required features

In [13]:
# Model inputs: campaign variables, purchase count, the twelve user-activity
# variables and the six derived date parts.
feat_cols = (
    ['campaign_var_1', 'campaign_var_2', 'products_purchased']
    + [f'user_activity_var_{idx}' for idx in range(1, 13)]
    + ['c_day', 'c_month', 'c_year', 's_day', 's_month', 's_year']
)
df = train[feat_cols]
df.head()

In [14]:
# Relative frequency of each target class.
train["buy"].value_counts(normalize=True)

In [15]:
# Bar chart of the number of rows per target class.
sns.catplot(data=train, x="buy", kind="count")

### Preprocessing test data

In [None]:
# Keep the test ids in a list, then apply the same datetime parsing,
# date-part extraction and zero-filling that was applied to `train`.
test_ids = test["id"].tolist()
test = test.assign(
    created_at=pd.to_datetime(test["created_at"]),
    signup_date=pd.to_datetime(test["signup_date"]),
)
test = (
    test.assign(
        c_day=test["created_at"].dt.day,
        c_month=test["created_at"].dt.month,
        c_year=test["created_at"].dt.year,
        s_day=test["signup_date"].dt.day,
        s_month=test["signup_date"].dt.month,
        s_year=test["signup_date"].dt.year,
    )
    .fillna(0)
    # The id and the raw timestamps are not model features.
    .drop(columns=["id", "created_at", "signup_date"])
)
test.head()

#### The class distribution clearly shows that the classes are imbalanced

### Data Splitting into train and valid

In [16]:
# Feature matrix and binary target used by every model below.
X, y = df, train["buy"]

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 2% of the rows for validation. The target is imbalanced, so the
# split is stratified on `y`: with such a small hold-out set an unstratified
# draw can end up with a positive-class share that differs noticeably from
# the training data, which makes the F1 scores computed on it unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.02,
    random_state=42,
    stratify=y,
)

In [None]:
# Predict the test set with the XGBoost model and write a submission file.
# NOTE(review): `xgb_model` is only created further down in this notebook
# (section "XGBOOST (GPU) with Optuna"), so this cell raises NameError on a
# fresh top-to-bottom run unless that section has been executed first.
# NOTE(review): the sample submission is read from '../input/jobathon-june-22/'
# while the other cells read from '../input/avjune/' - confirm both inputs exist.
preds = xgb_model.predict(test)
sub = pd.read_csv('../input/jobathon-june-22/sample_submission_2zvVjBu.csv')
sub['buy'] = preds
sub.to_csv('xgb_max_first.csv',index = False)

## Model Selection and Training

### a. Baseline Model

In [18]:
# Baseline: a classifier that always predicts the positive class (1),
# scored with F1 on the validation split.
model = DummyClassifier(strategy='constant', constant=1)
pipe = make_pipeline(model).fit(X_train, y_train)
y_pred = pipe.predict(X_test)

baseline_f1 = round(f1_score(y_test, y_pred), 4)
print (f'model : {model} and  f1 score is : {baseline_f1}')

f1 = [baseline_f1]
model_names = ['DummyClassifier']
dummy_result_df = pd.DataFrame({'F1 Score': f1}, index=model_names)
dummy_result_df

### b. Logistic Regression and Linear Discriminant Analysis

In [19]:
# Fit the two linear classifiers with default-style settings and collect
# their validation F1 scores in one table.
lr = LogisticRegression(solver='liblinear')
lda = LinearDiscriminantAnalysis()
models = [lr, lda]

f1 = []
for model in models:
    pipe = make_pipeline(model).fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    score = round(f1_score(y_test, y_pred), 4)
    f1.append(score)
    print (f'model : {model} and  F1 score is : {score}')

model_names = ['Logistic', 'LinearDiscriminant']
result_df = pd.DataFrame({'F1 Score': f1}, index=model_names)
result_df

### c. AdaBoost, GBClassifier, RandomForest and ExtraTreesClassifier

In [20]:
# Fit the four scikit-learn ensemble classifiers (all seeded with 0) and
# collect their validation F1 scores in one table.
ada = AdaBoostClassifier(random_state=0)
gb = GradientBoostingClassifier(random_state=0)
rf = RandomForestClassifier(random_state=0)
et = ExtraTreesClassifier(random_state=0)
models = [ada, gb, rf, et]

f1 = []
for model in models:
    pipe = make_pipeline(model).fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    score = round(f1_score(y_test, y_pred), 4)
    f1.append(score)
    print (f'model : {model} and  F1 score is : {score}')

model_names = ['Ada', 'Gradient', 'Random', 'ExtraTree']
result_df_1 = pd.DataFrame({'F1 Score': f1}, index=model_names)
result_df_1

### d. XGB, LGBM and CATBOOST

In [21]:
# Fit the three gradient-boosting libraries with default settings (seeded
# with 0) and collect their validation F1 scores in one table.
xgbc = XGBClassifier(random_state=0)
lgbmc = LGBMClassifier(random_state=0)
cboost = CatBoostClassifier(verbose=False, random_state=0)
models = [xgbc, lgbmc, cboost]

f1 = []
for model in models:
    pipe = make_pipeline(model).fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    score = round(f1_score(y_test, y_pred), 4)
    f1.append(score)
    print (f'model : {model} and  F1 score is : {score}')

model_names = ['XGBoost', 'LightGBM', 'CatBoost']
# Note: this rebinds `result_df_1`, replacing the table built for the
# scikit-learn ensembles in the previous section.
result_df_1 = pd.DataFrame({'F1 Score': f1}, index=model_names)
result_df_1

### XGBOOST (GPU) with Optuna

In [22]:
def run(trial):
    """Optuna objective for the GPU XGBoost classifier.

    Samples one set of hyperparameters from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train`` / ``y_train`` with
    early stopping monitored on ``X_test`` / ``y_test``, and scores the
    predictions on that same validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        F1 score of the fitted model on the validation split.
    """
    # All log-scaled parameters go through `suggest_float(..., log=True)`;
    # `suggest_loguniform` is the deprecated spelling of the same sampling.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    model = XGBClassifier(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=0,
        predictor="gpu_predictor",
        # Upper bound on boosting rounds; early stopping below ends training sooner.
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(
        X_train,
        y_train,
        early_stopping_rounds=300,
        eval_set=[(X_test, y_test)],
        verbose=1000,
    )
    preds_valid = model.predict(X_test)
    return f1_score(y_test, preds_valid)

In [23]:
# Search for the XGBoost hyperparameters that maximize the value returned by
# `run`. The sampler is seeded so the sequence of suggested hyperparameters
# is the same on every run of the notebook.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(run, n_trials=50)

In [24]:
# Hyperparameters of the best trial found by the study above.
study.best_params

In [25]:
# Final XGBoost model with hard-coded hyperparameters (values copied from a
# tuning run so the cell does not depend on re-running the study).
xgb_model = XGBClassifier(random_state=42,
                          tree_method="gpu_hist",
                          gpu_id=0,
                          predictor="gpu_predictor",
                          n_estimators=7000,
                          learning_rate = 0.06151255452467186,
                          reg_lambda= 0.004432996186543039,
                          reg_alpha= 0.006413325449779519,
                          subsample= 0.3042296834861047,
                          colsample_bytree= 0.5983540700738519,
                          max_depth= 7)
# Fit with the same early-stopping setup used inside `run`: the
# hyperparameters were scored on early-stopped models, so training all 7000
# rounds here would produce a different (much larger) model than the one
# that was actually tuned.
xgb_model.fit(
    X_train,
    y_train,
    early_stopping_rounds=300,
    eval_set=[(X_test, y_test)],
    verbose=1000,
)

In [26]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-class precision / recall / F1 of the tuned XGBoost model on the
# validation split.
y_pred = xgb_model.predict(X_test)
xgb_report = classification_report(y_test, y_pred)
print(xgb_report)

### CatBoost with Optuna

In [31]:
def objective(trial):
    """Optuna objective for the CatBoost classifier.

    Samples one CatBoost configuration from ``trial``, fits it on the
    notebook-level ``X_train`` / ``y_train`` with early stopping monitored on
    ``X_test`` / ``y_test``, and scores the predictions on that same
    validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        F1 score of the fitted model on the validation split.
    """
    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }

    # These two parameters are only sampled for the bootstrap type they are
    # paired with here.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    cat_cls = CatBoostClassifier(**param)

    cat_cls.fit(X_train, y_train, eval_set=[(X_test, y_test)],verbose=0, early_stopping_rounds=100)

    preds = cat_cls.predict(X_test)
    pred_labels = np.rint(preds)
    # Score with F1 rather than accuracy: the target is heavily imbalanced, so
    # accuracy is dominated by the majority class and barely separates good
    # configurations from bad ones. F1 is also what the rest of the notebook
    # (model comparison and the XGBoost objective) uses.
    return f1_score(y_test, pred_labels)


if __name__ == "__main__":
    # Search for the CatBoost configuration that maximizes the value returned
    # by `objective`. The sampler is seeded so the sequence of suggested
    # hyperparameters is repeatable; the search stops after 50 trials or
    # 600 seconds, whichever comes first.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=50, timeout=600)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

In [32]:
# Earlier candidate configuration, kept for reference only:
#   objective='Logloss', colsample_bylevel=0.08201062838206506, depth=12,
#   boosting_type='Ordered', bootstrap_type='MVS'

# Tuned CatBoost configuration that is actually fitted and evaluated.
cat_params = {
    'objective': 'Logloss',
    'colsample_bylevel': 0.0694666313113051,
    'depth': 6,
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bayesian',
}
model = CatBoostClassifier(verbose=False, random_state=0, **cat_params)
model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))



## BEST PERFORMING TILL NOW

In [34]:
# Fill the sample submission with the tuned CatBoost predictions and save it.
sub = pd.read_csv('../input/avjune/sample_submission_2zvVjBu.csv')
preds = model.predict(test)
sub['buy'] = preds
sub.to_csv('cboost_tuned.csv', index=False)

## scale_pos_weight parameter for handling imbalanced dataset

#### scale_pos_weight is the ratio of the number of majority-class samples to the number of minority-class samples

In [36]:
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE

In [35]:
# Ratio of class-0 rows to class-1 rows, truncated to an integer.
class_counts = train['buy'].value_counts()
spw = int(class_counts[0] / class_counts[1])  # It is coming around 18 in this case
spw

* Using such a high value for scale_pos_weight is problematic
* It encourages overfitting
* It applies a higher penalty to errors on the minority class, hence giving us the wrong results

### Default Catboost w/o scale_pos_weight

In [37]:
# Default CatBoost (no class weighting): fit, then report accuracy, recall,
# ROC AUC and precision on the validation split.
model_names = ['Catboost_default']
catboost_base = CatBoostClassifier(verbose=False, random_state=0)

catboost_base.fit(X_train, y_train, eval_set=(X_test, y_test))
y_pred = catboost_base.predict(X_test)
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from the positive-class probability. Feeding it the hard
# 0/1 predictions collapses the curve to a single threshold and understates
# the score.
y_score = catboost_base.predict_proba(X_test)[:, 1]

accuracy = [round(accuracy_score(y_test, y_pred), 4)]
recall = [round(recall_score(y_test, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_test, y_score), 4)]
precision = [round(precision_score(y_test, y_pred), 4)]

result_df1 = pd.DataFrame({'Accuracy':accuracy,'Recall':recall, 'Roc_Auc':roc_auc, 'Precision':precision}, index=model_names)
result_df1

### catboost with scale_pos_weight

In [38]:
# CatBoost with `scale_pos_weight=spw`: fit, then report accuracy, recall,
# ROC AUC and precision on the validation split.
# The row label names the weighted variant; it previously reused
# 'Catboost_default', which made this table indistinguishable from the
# unweighted one.
model_names = ['Catboost_scale_pos_weight']
catboost_base = CatBoostClassifier(verbose=False, random_state=0, scale_pos_weight=spw)

catboost_base.fit(X_train, y_train, eval_set=(X_test, y_test))
y_pred = catboost_base.predict(X_test)
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from the positive-class probability. Feeding it the hard
# 0/1 predictions collapses the curve to a single threshold and understates
# the score.
y_score = catboost_base.predict_proba(X_test)[:, 1]

accuracy = [round(accuracy_score(y_test, y_pred), 4)]
recall = [round(recall_score(y_test, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_test, y_score), 4)]
precision = [round(precision_score(y_test, y_pred), 4)]

result_df2 = pd.DataFrame({'Accuracy':accuracy,'Recall':recall, 'Roc_Auc':roc_auc, 'Precision':precision}, index=model_names)
result_df2

### LGBM w/o scale_pos_weight

In [39]:
# Default LightGBM (no class weighting): fit, then report accuracy, recall,
# ROC AUC and precision on the validation split.
model_names = ['LightGBM_default']
lgbmc_base = LGBMClassifier(random_state=0)

lgbmc_base.fit(X_train, y_train, eval_set=(X_test, y_test))
y_pred = lgbmc_base.predict(X_test)
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from the positive-class probability. Feeding it the hard
# 0/1 predictions collapses the curve to a single threshold and understates
# the score.
y_score = lgbmc_base.predict_proba(X_test)[:, 1]

accuracy = [round(accuracy_score(y_test, y_pred), 4)]
recall = [round(recall_score(y_test, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_test, y_score), 4)]
precision = [round(precision_score(y_test, y_pred), 4)]

result_df3 = pd.DataFrame({'Accuracy':accuracy,'Recall':recall, 'Roc_Auc':roc_auc, 'Precision':precision}, index=model_names)
result_df3

### LGBM with scale_pos_weight

In [40]:
# LightGBM with `scale_pos_weight=spw`: fit, then report accuracy, recall,
# ROC AUC and precision on the validation split.
# The row label names the weighted variant; it previously reused
# 'LightGBM_default', which made this table indistinguishable from the
# unweighted one.
model_names = ['LightGBM_scale_pos_weight']
lgbmc_base = LGBMClassifier(random_state=0, scale_pos_weight=spw)

lgbmc_base.fit(X_train, y_train, eval_set=(X_test, y_test))
y_pred = lgbmc_base.predict(X_test)
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from the positive-class probability. Feeding it the hard
# 0/1 predictions collapses the curve to a single threshold and understates
# the score.
y_score = lgbmc_base.predict_proba(X_test)[:, 1]

accuracy = [round(accuracy_score(y_test, y_pred), 4)]
recall = [round(recall_score(y_test, y_pred), 4)]
roc_auc = [round(roc_auc_score(y_test, y_score), 4)]
precision = [round(precision_score(y_test, y_pred), 4)]

result_df4 = pd.DataFrame({'Accuracy':accuracy,'Recall':recall, 'Roc_Auc':roc_auc, 'Precision':precision}, index=model_names)
result_df4

* We see that recall and precision are balanced w/o scale_pos_weight
* But when we introduce scale_pos_weight, there is an improvement in recall, but precision goes down drastically

In [41]:
# Predict the test set with `model` and write the submission; this targets
# the same output file name ('cboost_tuned.csv') as the earlier submission cell.
preds = model.predict(test)
sub = pd.read_csv('../input/avjune/sample_submission_2zvVjBu.csv').assign(buy=preds)
sub.to_csv('cboost_tuned.csv', index=False)

In [42]:
import shap
# Build a SHAP explainer around `model` and compute SHAP values for every
# row of the full feature matrix `X`.
explainer = shap.Explainer(model)
shap_values = explainer(X)

In [43]:
# Beeswarm plot of the SHAP values computed above.
shap.plots.beeswarm(shap_values)