In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
from datetime import datetime

from helpers import *

# Silence FutureWarning and UserWarning for the whole notebook session.
# NOTE(review): scikit-learn's ConvergenceWarning derives from UserWarning, so the second
# filter also hides solver non-convergence messages from the models fitted later in this
# notebook — confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve, precision_score, jaccard_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, validation_curve, GridSearchCV
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Notebook-wide pandas display settings: three-decimal floats, no column or row
# truncation, and a 500-character console width.
_DISPLAY_OPTIONS = {
    'display.float_format': lambda x: '%.3f' % x,
    'display.max_columns': None,
    'display.max_rows': None,
    'display.width': 500,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [2]:
# Locations of the two parquet splits. NOTE(review): these are absolute paths on one
# machine; the notebook will not run elsewhere without editing them.
TRAIN_PATH = "/Users/furkan/development/ML_DS/İş Bankası/train_final.parquet"
TEST_PATH = "/Users/furkan/development/ML_DS/İş Bankası/test_final.parquet"

train = pd.read_parquet(TRAIN_PATH)
test = pd.read_parquet(TEST_PATH)

# Work on a copy so the loaded training frame stays untouched by the preprocessing below.
df = train.copy()

## EDA

In [3]:
# Structural overview of the working frame via check_df (brought in by `from helpers import *`);
# the captured output below shows it printing the frame's shape and per-column dtypes.
check_df(df)

##################### Shape #####################
(94049, 58)
##################### Types #####################
id              object
month            int64
n_seconds_1    float64
n_seconds_2    float64
n_seconds_3    float64
carrier         object
devicebrand     object
feature_0      float64
feature_1      float64
feature_2      float64
feature_3      float64
feature_4      float64
feature_5      float64
feature_6      float64
feature_7      float64
feature_8      float64
feature_9      float64
feature_10     float64
feature_11     float64
feature_12     float64
feature_13     float64
feature_14     float64
feature_15     float64
feature_16     float64
feature_17     float64
feature_18     float64
feature_19     float64
feature_20     float64
feature_21     float64
feature_22     float64
feature_23     float64
feature_24     float64
feature_25     float64
feature_26     float64
feature_27     float64
feature_28     float64
feature_29     float64
feature_30     float64
feature_31    

In [4]:
# Split the column names into three lists with grab_col_names from helpers.
# NOTE(review): judging by the variable names and the printed summary, these are categorical,
# numerical and high-cardinality categorical columns; the exact rules live in helpers and
# are not visible here — confirm before relying on them.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 94049
Variables: 58
cat_cols: 1
num_cols: 53
cat_but_car: 4
num_but_cat: 1


In [5]:
def categorize_carrier(x):
    """Collapse a carrier name into one of three kept operators or "others".

    The comparison is an exact, case-sensitive equality check against
    "VODAFONE TR", "TURKCELL" and "TURK TELEKOM" (in that order). Any other
    value, including missing values, maps to "others".
    """
    for kept_name in ("VODAFONE TR", "TURKCELL", "TURK TELEKOM"):
        if x == kept_name:
            return kept_name
    return "others"
    
# Replace every carrier value with its bucket (one of the three kept operators or "others").
carrier_bucketed = df["carrier"].map(categorize_carrier)
df["carrier"] = carrier_bucketed

In [6]:
def categorize_device_brand(x):
    """Collapse a device brand into one of six kept brands or "others".

    The comparison is an exact, case-sensitive equality check against
    "Apple", "samsung", "xiaomi", "HUAWEI", "OPPO" and "Redmi" (in that
    order). Any other value, including missing values, maps to "others".
    """
    for kept_brand in ("Apple", "samsung", "xiaomi", "HUAWEI", "OPPO", "Redmi"):
        if x == kept_brand:
            return kept_brand
    return "others"
    
# Replace every devicebrand value with its bucket (one of the six kept brands or "others").
devicebrand_bucketed = df["devicebrand"].map(categorize_device_brand)
df["devicebrand"] = devicebrand_bucketed

In [7]:
# Preview the first rows after carrier and devicebrand were bucketed; target is still the
# raw comma-separated menu string at this point.
df.head()

Unnamed: 0,id,month,n_seconds_1,n_seconds_2,n_seconds_3,carrier,devicebrand,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target
0,5beefd4d2bf4a4767e0df8108,10,5245.571,981.182,205.948,VODAFONE TR,Apple,-1.198,1.113,-1.123,-0.264,2.161,2.651,0.81,1.516,2.351,-4.608,0.678,-0.555,3.502,-0.765,1.543,-1.458,-1.017,-1.82,0.726,-1.444,1.087,2.102,-0.249,-0.583,-1.33,1.407,1.054,-1.532,1.667,0.251,-0.091,2.676,-0.621,0.163,1.72,-6.139,-0.303,2.784,-3.135,-3.794,0.93,-2.613,-2.033,2.646,-1.023,1.659,-1.559,-2.161,30,58,"menu2, menu4, menu5"
1,867285b116c063d5a8482f5be,10,5184.876,557.65,487.587,TURKCELL,samsung,-2.336,2.568,-0.495,0.949,3.568,3.358,0.434,0.886,1.452,-3.184,1.024,1.166,2.916,-1.281,3.019,-3.163,-1.12,-2.798,0.287,-2.029,0.26,0.962,-0.534,-1.311,-0.144,-1.562,1.931,-0.942,-0.0,-1.372,1.225,2.795,-0.961,0.114,1.048,-5.697,0.902,-0.073,-2.366,-0.6,0.815,-0.984,-1.454,-0.022,-0.196,2.776,-0.319,-4.291,21,45,"menu7, menu8, menu4"
2,c82a7cbd2e00d9b66c06bcadc,10,3835.618,3275.128,43.806,TURK TELEKOM,Redmi,-2.561,2.062,-0.185,1.062,4.198,1.551,-0.596,-0.619,2.017,-4.631,-0.663,-0.838,1.821,-2.171,3.695,-0.922,-2.873,-2.042,0.533,-0.128,0.45,2.716,-0.178,0.052,-0.518,-0.043,1.157,-0.527,-0.088,-0.497,1.203,2.742,-0.623,1.757,2.059,-5.359,0.929,1.133,-3.1,-1.253,1.057,-1.669,-3.599,1.674,0.632,1.293,-2.231,-2.384,19,61,"menu2, menu8, menu4"
3,f2d2b25073ccc298eced86897,10,3532.544,154.509,64.724,TURKCELL,samsung,-2.53,3.358,-0.851,1.644,2.849,3.887,1.855,0.988,2.066,-5.804,1.931,0.151,3.145,-2.673,0.378,-2.657,-0.378,-3.276,0.003,-0.674,-0.227,1.817,0.068,-0.787,-1.708,-2.0,1.77,0.049,0.23,-0.304,1.659,1.692,-1.105,1.242,2.043,-3.854,0.649,0.944,-3.027,-1.824,0.204,-1.861,-1.22,1.863,0.213,1.03,-1.142,-4.466,2,41,"menu6, menu2, menu1"
4,7818c92a58af0f2cb7c361738,10,3344.192,787.896,715.115,VODAFONE TR,samsung,-2.922,2.096,0.061,-1.488,3.225,2.092,-0.993,0.686,2.09,-2.974,-0.797,-1.459,2.398,-1.308,3.381,-2.664,-2.219,-1.714,1.288,-2.25,2.345,0.403,-0.077,0.289,1.847,1.424,1.888,-1.022,1.021,-0.843,-0.859,2.565,0.412,0.068,0.391,-5.996,2.675,0.203,-3.272,-1.418,2.188,-0.143,-1.876,1.024,-0.186,-0.062,-1.462,-2.371,23,85,"menu6, menu2, menu8"


In [8]:
# Recompute the column groups now that carrier and devicebrand have few distinct values:
# the printed summary moves from cat_cols 1 / cat_but_car 4 to cat_cols 3 / cat_but_car 2.
# These refreshed lists (cat_cols, num_cols) are the ones used by the encoding and scaling cells.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 94049
Variables: 58
cat_cols: 3
num_cols: 53
cat_but_car: 2
num_but_cat: 1


In [9]:
# One-hot encode the columns in cat_cols with one_hot_encoder from helpers, passing
# drop_first=True. The preview below shows the result: carrier_*, devicebrand_* and month_*
# boolean columns, with one level of each original column left out as the baseline.
df = one_hot_encoder(df, cat_cols, drop_first=True)

In [10]:
# Preview the frame after one-hot encoding; the dummy columns are still True/False here.
df.head()

Unnamed: 0,id,n_seconds_1,n_seconds_2,n_seconds_3,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target,carrier_TURKCELL,carrier_VODAFONE TR,carrier_others,devicebrand_HUAWEI,devicebrand_OPPO,devicebrand_Redmi,devicebrand_others,devicebrand_samsung,devicebrand_xiaomi,month_11,month_12
0,5beefd4d2bf4a4767e0df8108,5245.571,981.182,205.948,-1.198,1.113,-1.123,-0.264,2.161,2.651,0.81,1.516,2.351,-4.608,0.678,-0.555,3.502,-0.765,1.543,-1.458,-1.017,-1.82,0.726,-1.444,1.087,2.102,-0.249,-0.583,-1.33,1.407,1.054,-1.532,1.667,0.251,-0.091,2.676,-0.621,0.163,1.72,-6.139,-0.303,2.784,-3.135,-3.794,0.93,-2.613,-2.033,2.646,-1.023,1.659,-1.559,-2.161,30,58,"menu2, menu4, menu5",False,True,False,False,False,False,False,False,False,False,False
1,867285b116c063d5a8482f5be,5184.876,557.65,487.587,-2.336,2.568,-0.495,0.949,3.568,3.358,0.434,0.886,1.452,-3.184,1.024,1.166,2.916,-1.281,3.019,-3.163,-1.12,-2.798,0.287,-2.029,0.26,0.962,-0.534,-1.311,-0.144,-1.562,1.931,-0.942,-0.0,-1.372,1.225,2.795,-0.961,0.114,1.048,-5.697,0.902,-0.073,-2.366,-0.6,0.815,-0.984,-1.454,-0.022,-0.196,2.776,-0.319,-4.291,21,45,"menu7, menu8, menu4",True,False,False,False,False,False,False,True,False,False,False
2,c82a7cbd2e00d9b66c06bcadc,3835.618,3275.128,43.806,-2.561,2.062,-0.185,1.062,4.198,1.551,-0.596,-0.619,2.017,-4.631,-0.663,-0.838,1.821,-2.171,3.695,-0.922,-2.873,-2.042,0.533,-0.128,0.45,2.716,-0.178,0.052,-0.518,-0.043,1.157,-0.527,-0.088,-0.497,1.203,2.742,-0.623,1.757,2.059,-5.359,0.929,1.133,-3.1,-1.253,1.057,-1.669,-3.599,1.674,0.632,1.293,-2.231,-2.384,19,61,"menu2, menu8, menu4",False,False,False,False,False,True,False,False,False,False,False
3,f2d2b25073ccc298eced86897,3532.544,154.509,64.724,-2.53,3.358,-0.851,1.644,2.849,3.887,1.855,0.988,2.066,-5.804,1.931,0.151,3.145,-2.673,0.378,-2.657,-0.378,-3.276,0.003,-0.674,-0.227,1.817,0.068,-0.787,-1.708,-2.0,1.77,0.049,0.23,-0.304,1.659,1.692,-1.105,1.242,2.043,-3.854,0.649,0.944,-3.027,-1.824,0.204,-1.861,-1.22,1.863,0.213,1.03,-1.142,-4.466,2,41,"menu6, menu2, menu1",True,False,False,False,False,False,False,True,False,False,False
4,7818c92a58af0f2cb7c361738,3344.192,787.896,715.115,-2.922,2.096,0.061,-1.488,3.225,2.092,-0.993,0.686,2.09,-2.974,-0.797,-1.459,2.398,-1.308,3.381,-2.664,-2.219,-1.714,1.288,-2.25,2.345,0.403,-0.077,0.289,1.847,1.424,1.888,-1.022,1.021,-0.843,-0.859,2.565,0.412,0.068,0.391,-5.996,2.675,0.203,-3.272,-1.418,2.188,-0.143,-1.876,1.024,-0.186,-0.062,-1.462,-2.371,23,85,"menu6, menu2, menu8",False,True,False,False,False,False,False,True,False,False,False


In [11]:
# Collect the columns whose dtype is neither int nor float and that hold exactly two
# distinct values; per the output below these are the boolean dummy columns.
binary_cols = []
for col in df.columns:
    if df[col].dtype in [int, float]:
        continue
    if df[col].nunique() == 2:
        binary_cols.append(col)
binary_cols

['carrier_TURKCELL',
 'carrier_VODAFONE TR',
 'carrier_others',
 'devicebrand_HUAWEI',
 'devicebrand_OPPO',
 'devicebrand_Redmi',
 'devicebrand_others',
 'devicebrand_samsung',
 'devicebrand_xiaomi',
 'month_11',
 'month_12']

In [12]:
# Run label_encoder (from helpers) on every two-valued dummy column.
# The return value is discarded, so this relies on label_encoder modifying df in place;
# the 0/1 values in the later preview are consistent with that.
# NOTE(review): confirm the in-place behavior against helpers.
for col in binary_cols:
    label_encoder(df, col)

In [13]:
# Turn the raw target (a comma-separated string such as "menu2, menu4, menu5") into a
# fixed-width string of '0'/'1' flags, one position per distinct menu name.
#
# str.findall yields one list of menu tokens per row. Unlike extractall + groupby, a row
# with no match becomes an empty list instead of a missing value. `\d+` keeps multi-digit
# menu ids intact (a single `\d` would read "menu10" as "menu1").
menu_lists = df['target'].str.findall(r'menu\d+')

# Sorted vocabulary of every menu name seen; position i of the encoded target refers to
# all_menus[i]. The sort is lexicographic on the name string.
all_menus = sorted({menu for menus in menu_lists for menu in menus})

# If no menu name is found anywhere, the column was most likely already encoded by a
# previous run of this cell; fail loudly instead of overwriting every target with ''.
if not all_menus:
    raise ValueError("No 'menu<N>' labels found in df['target']; has this cell already been run?")

df['target'] = menu_lists.apply(
    lambda chosen: ''.join('1' if menu in chosen else '0' for menu in all_menus)
)

# NOTE(review): the month dummies are discarded right after being created by the one-hot
# step — presumably because month is not meant to be a model feature; confirm.
df = df.drop(columns=['month_11', 'month_12'])

In [14]:
# Preview the frame after target encoding. target is a string of 0/1 flags; the rendered
# table below shows it without leading zeros.
df.head()

Unnamed: 0,id,n_seconds_1,n_seconds_2,n_seconds_3,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target,carrier_TURKCELL,carrier_VODAFONE TR,carrier_others,devicebrand_HUAWEI,devicebrand_OPPO,devicebrand_Redmi,devicebrand_others,devicebrand_samsung,devicebrand_xiaomi
0,5beefd4d2bf4a4767e0df8108,5245.571,981.182,205.948,-1.198,1.113,-1.123,-0.264,2.161,2.651,0.81,1.516,2.351,-4.608,0.678,-0.555,3.502,-0.765,1.543,-1.458,-1.017,-1.82,0.726,-1.444,1.087,2.102,-0.249,-0.583,-1.33,1.407,1.054,-1.532,1.667,0.251,-0.091,2.676,-0.621,0.163,1.72,-6.139,-0.303,2.784,-3.135,-3.794,0.93,-2.613,-2.033,2.646,-1.023,1.659,-1.559,-2.161,30,58,10110000,0,1,0,0,0,0,0,0,0
1,867285b116c063d5a8482f5be,5184.876,557.65,487.587,-2.336,2.568,-0.495,0.949,3.568,3.358,0.434,0.886,1.452,-3.184,1.024,1.166,2.916,-1.281,3.019,-3.163,-1.12,-2.798,0.287,-2.029,0.26,0.962,-0.534,-1.311,-0.144,-1.562,1.931,-0.942,-0.0,-1.372,1.225,2.795,-0.961,0.114,1.048,-5.697,0.902,-0.073,-2.366,-0.6,0.815,-0.984,-1.454,-0.022,-0.196,2.776,-0.319,-4.291,21,45,100110,1,0,0,0,0,0,0,1,0
2,c82a7cbd2e00d9b66c06bcadc,3835.618,3275.128,43.806,-2.561,2.062,-0.185,1.062,4.198,1.551,-0.596,-0.619,2.017,-4.631,-0.663,-0.838,1.821,-2.171,3.695,-0.922,-2.873,-2.042,0.533,-0.128,0.45,2.716,-0.178,0.052,-0.518,-0.043,1.157,-0.527,-0.088,-0.497,1.203,2.742,-0.623,1.757,2.059,-5.359,0.929,1.133,-3.1,-1.253,1.057,-1.669,-3.599,1.674,0.632,1.293,-2.231,-2.384,19,61,10100010,0,0,0,0,0,1,0,0,0
3,f2d2b25073ccc298eced86897,3532.544,154.509,64.724,-2.53,3.358,-0.851,1.644,2.849,3.887,1.855,0.988,2.066,-5.804,1.931,0.151,3.145,-2.673,0.378,-2.657,-0.378,-3.276,0.003,-0.674,-0.227,1.817,0.068,-0.787,-1.708,-2.0,1.77,0.049,0.23,-0.304,1.659,1.692,-1.105,1.242,2.043,-3.854,0.649,0.944,-3.027,-1.824,0.204,-1.861,-1.22,1.863,0.213,1.03,-1.142,-4.466,2,41,110001000,1,0,0,0,0,0,0,1,0
4,7818c92a58af0f2cb7c361738,3344.192,787.896,715.115,-2.922,2.096,0.061,-1.488,3.225,2.092,-0.993,0.686,2.09,-2.974,-0.797,-1.459,2.398,-1.308,3.381,-2.664,-2.219,-1.714,1.288,-2.25,2.345,0.403,-0.077,0.289,1.847,1.424,1.888,-1.022,1.021,-0.843,-0.859,2.565,0.412,0.068,0.391,-5.996,2.675,0.203,-3.272,-1.418,2.188,-0.143,-1.876,1.024,-0.186,-0.062,-1.462,-2.371,23,85,10001010,0,1,0,0,0,0,0,1,0


In [15]:
# Scale the numerical columns (num_cols) with scikit-learn's RobustScaler and write the
# scaled values back into df.
# NOTE(review): the scaler is fit on the full frame before the train/test split made later
# in the notebook, so statistics of the hold-out rows leak into the training features.
# Fitting on the training portion only and transforming both parts would avoid this.
rs = RobustScaler()
df[num_cols] = rs.fit_transform(df[num_cols])

In [16]:
# Preview the frame after scaling the numerical columns.
df.head()

Unnamed: 0,id,n_seconds_1,n_seconds_2,n_seconds_3,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target,carrier_TURKCELL,carrier_VODAFONE TR,carrier_others,devicebrand_HUAWEI,devicebrand_OPPO,devicebrand_Redmi,devicebrand_others,devicebrand_samsung,devicebrand_xiaomi
0,5beefd4d2bf4a4767e0df8108,13.891,5.59,1.999,0.706,-0.831,-1.009,0.058,-1.158,0.329,0.023,1.166,1.496,-0.747,1.049,0.071,0.634,0.142,-1.716,0.581,0.85,0.368,0.582,0.203,-0.488,0.679,0.362,-0.452,-1.865,1.14,-0.224,-0.14,1.008,1.308,-0.332,-0.41,-0.486,-0.643,1.38,-0.228,-1.582,2.073,-0.266,-2.121,-0.166,-1.389,-0.158,1.322,-1.244,0.857,-0.733,0.197,0.818,0.421,10110000,0,1,0,0,0,0,0,0,0
1,867285b116c063d5a8482f5be,13.72,2.759,6.313,-0.544,0.642,-0.447,1.004,0.055,1.021,-0.396,0.426,0.641,0.439,1.392,1.23,0.087,-0.4,-0.207,-1.198,0.746,-0.839,0.216,-0.325,-1.245,-0.095,0.109,-1.355,-0.745,-1.213,0.553,0.401,-0.662,-0.286,0.703,-0.287,-0.867,-0.697,0.816,0.357,-0.341,-0.656,0.532,0.502,-0.284,0.235,0.387,-0.87,-0.457,1.859,0.33,-2.327,0.0,-0.263,100110,1,0,0,0,0,0,0,1,0
2,c82a7cbd2e00d9b66c06bcadc,9.925,20.922,-0.484,-0.791,0.129,-0.17,1.093,0.599,-0.75,-1.547,-1.339,1.179,-0.767,-0.28,-0.119,-0.936,-1.335,0.484,1.141,-1.022,0.093,0.421,1.388,-1.071,1.096,0.425,0.335,-1.098,-0.009,-0.132,0.781,-0.75,0.574,0.685,-0.342,-0.488,1.12,1.665,0.804,-0.313,0.496,-0.229,-0.035,-0.037,-0.448,-1.632,0.523,0.33,0.529,-1.309,-0.066,-0.182,0.579,10100010,0,0,0,0,0,1,0,0,0
3,f2d2b25073ccc298eced86897,9.073,0.064,-0.164,-0.757,1.442,-0.766,1.547,-0.564,1.541,1.189,0.546,1.225,-1.744,2.291,0.547,0.301,-1.862,-2.907,-0.67,1.494,-1.427,-0.022,0.896,-1.691,0.486,0.644,-0.705,-2.221,-1.56,0.41,1.31,-0.432,0.764,1.044,-1.429,-1.029,0.551,1.652,2.794,-0.602,0.316,-0.153,-0.503,-0.909,-0.64,0.608,0.679,-0.068,0.293,-0.376,-2.534,-1.727,-0.474,110001000,1,0,0,0,0,0,0,1,0
4,7818c92a58af0f2cb7c361738,8.543,4.298,9.798,-1.188,0.164,0.049,-0.897,-0.24,-0.22,-1.989,0.192,1.247,0.614,-0.413,-0.538,-0.397,-0.429,0.164,-0.677,-0.363,0.498,1.051,-0.524,0.664,-0.474,0.515,0.629,1.133,1.153,0.516,0.327,0.361,0.234,-0.935,-0.525,0.675,-0.748,0.264,-0.039,1.487,-0.392,-0.408,-0.17,1.119,1.073,-0.01,-0.011,-0.448,-0.686,-0.65,-0.052,0.182,1.842,10001010,0,1,0,0,0,0,0,1,0


In [20]:
# Label vector: the bit-string encoded target.
y = df["target"]
# Feature matrix: every remaining column except the label and the row identifier.
X = df.drop(columns=["target", "id"])

In [21]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=17,
)

In [22]:
# Fit a logistic regression with default settings on the training split. y_train holds the
# bit-string targets, so every distinct combination of menus is treated as its own class.
# NOTE(review): UserWarning is filtered out in the import cell, which would also hide a
# ConvergenceWarning from this fit — confirm the solver actually converges.
log_model = LogisticRegression().fit(X_train, y_train)

In [23]:
# Predicted class label (one bit-string per row) for the hold-out features.
y_pred = log_model.predict(X_test)

In [24]:
# Keep only column 1 of the predicted probability matrix, i.e. the probability of the
# second entry in log_model.classes_.
# NOTE(review): the target has more than two classes (one per menu combination), so this
# is not a positive-class score; y_prob is not used in the cells visible here.
y_prob = log_model.predict_proba(X_test)[:, 1]

In [29]:
# Micro-averaged Jaccard score between true and predicted labels on the hold-out split.
# NOTE(review): y_test and y_pred carry one bit-string label per row, so a prediction only
# counts when the whole menu combination matches exactly. If per-menu overlap is the
# intended metric, both would need decoding into 0/1 indicator matrices first — confirm.
jaccard_score(y_test, y_pred, average='micro')

0.12942447987030534