# best kaggle scores:
### 86.6% AUC - part 1, random forest and part 4, gradient boosting
### 86.1% AUC - part 3, logistic regression with L1 regularization
### 85.4% AUC - part 1, gradient boosting

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt       
%matplotlib inline 
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

In [2]:
train=pd.read_csv('train.csv')
train.shape

(32769, 10)

In [3]:
test=pd.read_csv('test.csv')
test.shape

(58921, 10)

In [4]:
# Normalise the column names of both frames to lower case, in place.
for frame in (train, test):
    frame.columns = frame.columns.str.lower()

In [5]:
train.head()

Unnamed: 0,action,resource,mgr_id,role_rollup_1,role_rollup_2,role_deptname,role_title,role_family_desc,role_family,role_code
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


In [6]:
# Show every row that is an exact repeat of an earlier row.
# The result is empty: the training set has no duplicate rows.
train.loc[train.duplicated()]

Unnamed: 0,action,resource,mgr_id,role_rollup_1,role_rollup_2,role_deptname,role_title,role_family_desc,role_family,role_code


In [7]:
train.info()
#all integer columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32769 entries, 0 to 32768
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   action            32769 non-null  int64
 1   resource          32769 non-null  int64
 2   mgr_id            32769 non-null  int64
 3   role_rollup_1     32769 non-null  int64
 4   role_rollup_2     32769 non-null  int64
 5   role_deptname     32769 non-null  int64
 6   role_title        32769 non-null  int64
 7   role_family_desc  32769 non-null  int64
 8   role_family       32769 non-null  int64
 9   role_code         32769 non-null  int64
dtypes: int64(10)
memory usage: 2.5 MB


In [8]:
train.isnull().sum()
#no null values

action              0
resource            0
mgr_id              0
role_rollup_1       0
role_rollup_2       0
role_deptname       0
role_title          0
role_family_desc    0
role_family         0
role_code           0
dtype: int64

**highly imbalanced dataset**

In [9]:
# Class balance of the target: absolute counts first, then proportions.
for as_fraction in (False, True):
    print(train["action"].value_counts(normalize=as_fraction))

1    30872
0     1897
Name: action, dtype: int64
1    0.94211
0    0.05789
Name: action, dtype: float64


**we can see all the features are categorical with high cardinality**

In [10]:
# Cardinality of every training column after the first one (the target).
for col in train.columns[1:]:
    n_labels = train[col].nunique()
    print(f"{col} has {n_labels} unique labels")

resource has 7518 unique labels
mgr_id has 4243 unique labels
role_rollup_1 has 128 unique labels
role_rollup_2 has 177 unique labels
role_deptname has 449 unique labels
role_title has 343 unique labels
role_family_desc has 2358 unique labels
role_family has 67 unique labels
role_code has 343 unique labels


In [11]:
# Cardinality of every test column after the first one.
for col in test.columns[1:]:
    n_labels = test[col].nunique()
    print(f"{col} has {n_labels} unique labels")

resource has 4971 unique labels
mgr_id has 4689 unique labels
role_rollup_1 has 126 unique labels
role_rollup_2 has 177 unique labels
role_deptname has 466 unique labels
role_title has 351 unique labels
role_family_desc has 2749 unique labels
role_family has 68 unique labels
role_code has 351 unique labels


**Another thing to notice is that most of the categorical features contain labels that are present in the test set but not in the training set. In other words, for each feature individually, we will encounter labels in the test set that the model never saw during training, so it cannot learn anything about them.<br>Let's see how many such labels there are:**

In [12]:
# For each feature, count the labels that occur in the test set but never in the training set.
for col in train.columns[1:]:
    test_only = set(test[col]).difference(train[col])
    print(f"feature: {col} | no. of labels present in test set only: {len(test_only)}")

feature: resource | no. of labels present in test set only: 0
feature: mgr_id | no. of labels present in test set only: 670
feature: role_rollup_1 | no. of labels present in test set only: 2
feature: role_rollup_2 | no. of labels present in test set only: 6
feature: role_deptname | no. of labels present in test set only: 27
feature: role_title | no. of labels present in test set only: 18
feature: role_family_desc | no. of labels present in test set only: 593
feature: role_family | no. of labels present in test set only: 1
feature: role_code | no. of labels present in test set only: 18


**Except for "resource", every feature has labels that appear only in the test set (referred to below as "label bias").**<br><br>

# PART 1
To combat label bias, all the labels(for each feature individually) that are newly encountered in test set will be made 0 representing "unknown".
Moreover, we can see the cardinality of the features is very high hence one-hot encoding the features would create a lot of dimensionality. It will expand the feature set to 15,617 features. To combat this we will find the normalized value counts of each feature and replace the label with their respective proportion in that feature. Eg: our training set is of size 32769, if a categorical feature has labels A,B,C, where frequency/count of A=30000, B=2000, C=769, then these value will be replaced by A=30000/32769=0.915, B=2000/32769=0.061 and C=0.023.<br>This methodology seems apt to convert categorical labels to numeric values, however it comes at an expense of information loss. There will be some labels that have same no. of counts, we will end up replacing those labels with the same numeric value. Eg: A=30000, B=1000, C=769, D=1000. In such a case we will end up labelling B and D with same proportion i.e 0.030, as a result, B and D will be treated as same label for that feature causing information loss in that feature.

In [6]:
# Frequency encoding: add a "<col>_new" column holding, for every row, the share of
# training rows that carry the same label in <col>.
# `train.columns[1:]` is evaluated once, so the columns added inside the loop are not revisited.
for col in train.columns[1:]:
    label_share = train[col].value_counts(normalize=True)

    # Assign the mapped result instead of calling replace(inplace=True) on a column
    # selection: the chained in-place form is not guaranteed to write back to the frame
    # (it does nothing under pandas Copy-on-Write) and dict-replace is very slow here.
    train[col+"_new"] = train[col].map(label_share)

    # The test set is encoded with the TRAINING shares. Labels never seen in training
    # have no share; they keep their raw id so the next cell can detect them (> 1).
    test[col+"_new"] = test[col].map(label_share).fillna(test[col])

In [7]:
for col in train.columns[10:]:
    print(train[col].value_counts())

0.000031    3766
0.000061    2788
0.000092    2280
0.000122    1632
0.000153    1240
            ... 
0.001587      52
0.001556      51
0.001495      49
0.001434      47
0.001404      46
Name: resource_new, Length: 101, dtype: int64
0.000122    1460
0.000092    1356
0.000153    1335
0.000183    1260
0.000244    1216
            ... 
0.002167      71
0.002045      67
0.002014      66
0.001831      60
0.001465      48
Name: mgr_id_new, Length: 68, dtype: int64
0.653270    21407
0.022643      742
0.022003      721
0.015197      498
0.012207      400
            ...  
0.000031       10
0.000275        9
0.000244        8
0.000061        4
0.000092        3
Name: role_rollup_1_new, Length: 79, dtype: int64
0.135006    4424
0.120388    3945
0.080594    2641
0.077726    2547
0.054808    1796
            ... 
0.000397      13
0.000122      12
0.000092      12
0.000336      11
0.000061       6
Name: role_rollup_2_new, Length: 105, dtype: int64
0.034636    1135
0.023284     763
0.020110     659


**As mentioned above we will replace all the newly encountered values in test set to 0 representing "unknown". We have encoded the features as explained above as a result all the values are between 0 and 1. Therefore, in the test set, any label greater than 1 will be the label that was not found in the training set and hence was not converted.**

In [8]:
# Encoded values are proportions (<= 1); anything larger is a raw label id that was
# not present in the training set. Mark those as 0.0 ("unknown").
# A single .loc assignment replaces the chained `test[col][mask] = ...`, which may
# write to a temporary copy and leave `test` unchanged.
for col in test.columns[10:]:
    test.loc[test[col] > 1, col] = 0.0

In [9]:
# Cardinality of the raw features and of their frequency-encoded "_new" counterparts.
for col in train.columns[1:]:
    n_labels = train[col].nunique()
    print(f"{col} has {n_labels} unique labels")

resource has 7518 unique labels
mgr_id has 4243 unique labels
role_rollup_1 has 128 unique labels
role_rollup_2 has 177 unique labels
role_deptname has 449 unique labels
role_title has 343 unique labels
role_family_desc has 2358 unique labels
role_family has 67 unique labels
role_code has 343 unique labels
resource_new has 101 unique labels
mgr_id_new has 68 unique labels
role_rollup_1_new has 79 unique labels
role_rollup_2_new has 105 unique labels
role_deptname_new has 159 unique labels
role_title_new has 126 unique labels
role_family_desc_new has 108 unique labels
role_family_new has 54 unique labels
role_code_new has 126 unique labels


**If you compare every feature with its "new" alternative you will see that the unique labels are reduced a lot. eg: resource having 7518 unique labels was reduced to 101 unique labels only. This shows that the count of a lot of labels was same as a result we ended up labelling them with the same numeric value, thereby grouping them together. Hence this is a case of information loss but lets see the model performance:**

In [10]:
# Model inputs are the encoded columns (position 10 onwards); the target is "action".
x = train.iloc[:, 10:]
y = train["action"]

# Stratified 80/20 split so both parts keep the class ratio of the target.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=100
)

# Encoded features of the Kaggle test set, used for the submissions.
f_test = test.iloc[:, 10:]

**Logistic Regression**

In [69]:
# Randomised search (20 draws, 10-fold CV, ROC AUC) for an L1-penalised logistic regression.
model = LogisticRegression(penalty="l1", solver="liblinear", random_state=7)

param_dist = {
    "C": sp_randint(1, 1000),
    "class_weight": [None, "balanced"],
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="roc_auc",
    random_state=7,
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# Rebuild the classifier with the winning settings; `best` holds exactly C and class_weight.
cl = LogisticRegression(penalty="l1", solver="liblinear", random_state=7, **best)

# NOTE(review): the model is cross-validated on the hold-out split itself rather than
# fitted on x_train and scored on x_test — confirm this is the intended evaluation.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()}, SD: {cl_roc.std()}")

# Confusion matrix with class 1 listed first: rows are the truth, columns the prediction.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'C': 45, 'class_weight': 'balanced'}

auc after 10-fold cv: 0.5988258332657548, SD: 0.037285469265843654


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),2868,3307
Truth(not given),122,257


**randomforest**

In [36]:
# Randomised search (20 draws, 10-fold CV, ROC AUC) for a random forest.
model = RandomForestClassifier(random_state=7)

param_dist = {
    "max_features": list(np.arange(1,10))+[None,"sqrt"],
    "max_depth": list(np.arange(3,30))+[None],
    "class_weight": [None,"balanced"],
    "n_estimators": np.arange(10,150,5),
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="roc_auc",
    random_state=7,
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# `best` holds exactly the four searched settings, so it can be unpacked directly.
cl = RandomForestClassifier(random_state=7, **best)

# NOTE(review): the model is cross-validated on the hold-out split itself rather than
# fitted on x_train and scored on x_test — confirm this is the intended evaluation.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()}, SD: {cl_roc.std()}")

# Confusion matrix with class 1 listed first: rows are the truth, columns the prediction.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'n_estimators': 90, 'max_features': 5, 'max_depth': 14, 'class_weight': None}

auc after 10-fold cv: 0.73448555109605, SD: 0.052003034904018065


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),6144,31
Truth(not given),350,29


**gradient boosting**

In [37]:
# Randomised search (20 draws, 10-fold CV, ROC AUC) for a gradient-boosting classifier.
# Fix: the constructor call read `random_state=om_state=7`, which is a SyntaxError;
# the seed is 7, as in every other model of this notebook.
model = GradientBoostingClassifier(random_state=7)

param_dist = {
    "max_features": list(np.arange(1,10))+[None,"sqrt"],
    "max_depth": list(np.arange(3,30))+[None],
    "n_estimators": np.arange(10,150,5),
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="roc_auc",
    random_state=7,
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best)
print("")

# Rebuild the classifier with the winning settings.
cl = GradientBoostingClassifier(
    n_estimators=best["n_estimators"],
    max_depth=best["max_depth"],
    max_features=best["max_features"],
    random_state=7,
)

# NOTE(review): the model is cross-validated on the hold-out split itself rather than
# fitted on x_train and scored on x_test — confirm this is the intended evaluation.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print("auc after 10-fold cv: "+str(cl_roc.mean())+", SD: "+str(cl_roc.std()))

# Confusion matrix with class 1 listed first: rows are the truth, columns the prediction.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'n_estimators': 105, 'max_features': 6, 'max_depth': 13}

auc after 10-fold cv: 0.7344397212025491, SD: 0.04791514453750168


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),6090,85
Truth(not given),315,64


**Lets see the auc on the kaggle test set**

In [13]:
# Kaggle AUC: 59.2%

# Refit the tuned logistic regression on the whole training set.
final_model = LogisticRegression(
    penalty="l1", C=45, class_weight="balanced", solver="liblinear", random_state=7
).fit(x, y)
# Second probability column, i.e. the probability of the larger class label (action = 1).
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Submission frame: test row id plus predicted probability.
submit = pd.DataFrame({"Id": test["id"], "Action": final_predictions})
#submit.to_csv('PT1-lr.csv',index=False)

In [14]:
# Kaggle AUC: 86.5%

# Refit the tuned random forest on the whole training set.
final_model = RandomForestClassifier(
    n_estimators=90, max_depth=14, max_features=5, random_state=7
).fit(x, y)
# Second probability column, i.e. the probability of the larger class label (action = 1).
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Submission frame: test row id plus predicted probability.
submit = pd.DataFrame({"Id": test["id"], "Action": final_predictions})
#submit.to_csv('PT1-rf.csv',index=False)

In [15]:
# Kaggle AUC: 85.4%

# Refit the tuned gradient-boosting model on the whole training set.
final_model = GradientBoostingClassifier(
    n_estimators=105, max_depth=13, max_features=6, random_state=7
).fit(x, y)
# Second probability column, i.e. the probability of the larger class label (action = 1).
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Submission frame: test row id plus predicted probability.
submit = pd.DataFrame({"Id": test["id"], "Action": final_predictions})
#submit.to_csv('PT1-gb.csv',index=False)

**Let's look at other methodologies for encoding. Before that, let's restore our original train and test sets.**

In [16]:
# Discard the encoded columns appended in part 1, keeping only the first ten columns.
train = train.drop(columns=train.columns[10:])
test = test.drop(columns=test.columns[10:])

# PART 2
This is another way to encode features with high cardinality, known as mean target encoding.<br>Here we group each feature by its labels and, since the target is binary (1, 0), compute for every label the probability of the target being 1. We then replace each label with this probability, and we do this for all features. E.g. suppose a feature has the following labels: A, B, C, and the size of the dataset is 1000. Let's suppose A appears 600 times (400 times with target 1 and 200 times with target 0), B appears 200 times (150 times 1 and 50 times 0) and C appears 200 times (130 times 1 and 70 times 0). A will be replaced by 400/600 = 0.67, B by 150/200 = 0.75 and C by 130/200 = 0.65. With the part 1 methodology we saw that two labels with the same count receive the same numeric value, resulting in information loss in that feature. Here, however, even though B and C have the same count they get different numeric values, because the encoding also depends on the target: label B has target 1 in 150 rows while label C has target 1 in 130 rows, so the probabilities differ. There can still be cases where two labels have both the same total count and the same count of target 1, which results in the same numeric value for both labels, but the likelihood of that happening is lower than with the part 1 methodology. Let's see if this can improve the model performance:

In [17]:
def mean_enc(col):
    """Mean-target encode `col` into a new "<col>--new" column of `train` and `test`.

    Each label is replaced by the mean of "action" over the training rows carrying
    that label. Test labels that never occur in `train` are left as NaN.
    Both global frames are modified in place; nothing is returned.
    """
    target_mean_by_label = train.groupby(col)["action"].mean()
    encoded_name = col+"--new"
    train[encoded_name] = train[col].map(target_mean_by_label)
    test[encoded_name] = test[col].map(target_mean_by_label)

In [18]:
for cols in train.columns[1:]:
    mean_enc(cols)

In [19]:
for col in train.columns[10:]:
    print(train[col].value_counts())

1.000000    15888
0.996424      839
0.966942      484
0.900000      460
0.953545      409
            ...  
0.545455       11
0.250000        8
0.125000        8
0.166667        6
0.200000        5
Name: resource--new, Length: 139, dtype: int64
1.000000    22011
0.750000      328
0.666667      306
0.937500      288
0.857143      287
            ...  
0.100000       10
0.125000        8
0.375000        8
0.142857        7
0.166667        6
Name: mgr_id--new, Length: 156, dtype: int64
0.949222    21407
1.000000      888
0.962264      742
0.963939      721
0.951807      498
            ...  
0.920000       25
0.761905       21
0.941176       17
0.937500       16
0.000000        1
Name: role_rollup_1--new, Length: 72, dtype: int64
0.956148    4424
0.969075    3945
0.954563    2641
0.957205    2547
0.888889    2124
            ... 
0.761905      21
0.850000      20
0.937500      16
0.933333      15
0.000000       2
Name: role_rollup_2--new, Length: 91, dtype: int64
1.000000    3517
0.937445

**Here we will replace all the newly encountered values in test set by global target mean of the training set. Since the newly encountered values in test set will not be mapped therefore they will be replaced by nan. We will replace all the nan values in the test set by global target mean.**

In [20]:
# Test labels that never occur in the training set were mapped to NaN; give them
# the global target mean of the training set instead.
# fillna + assignment replaces the chained `test[col][mask] = ...`, which may write
# to a temporary copy and leave `test` unchanged.
global_target_mean = train["action"].mean()
for col in test.columns[10:]:
    test[col] = test[col].fillna(global_target_mean)

In [21]:
# Cardinality of the raw features and of their mean-target-encoded "--new" counterparts.
for col in train.columns[1:]:
    n_labels = train[col].nunique()
    print(f"{col} has {n_labels} unique labels")

resource has 7518 unique labels
mgr_id has 4243 unique labels
role_rollup_1 has 128 unique labels
role_rollup_2 has 177 unique labels
role_deptname has 449 unique labels
role_title has 343 unique labels
role_family_desc has 2358 unique labels
role_family has 67 unique labels
role_code has 343 unique labels
resource--new has 139 unique labels
mgr_id--new has 156 unique labels
role_rollup_1--new has 72 unique labels
role_rollup_2--new has 91 unique labels
role_deptname--new has 201 unique labels
role_title--new has 126 unique labels
role_family_desc--new has 138 unique labels
role_family--new has 47 unique labels
role_code--new has 126 unique labels


**Again the count of a lot of labels was same as a result we ended up labelling them with the same numeric value, thereby grouping them together. Again we see some information loss but here the loss will be less than part 1. lets see the model performance:**

In [22]:
# Model inputs are the encoded columns (position 10 onwards); the target is "action".
x = train.iloc[:, 10:]
y = train["action"]

# Stratified 80/20 split so both parts keep the class ratio of the target.
# NOTE(review): the encodings were computed from all training rows before this split,
# so the hold-out rows' targets already influenced their own features.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=100
)

# Encoded features of the Kaggle test set, used for the submissions.
f_test = test.iloc[:, 10:]

**Logistic regression**

In [20]:
# Randomised search (20 draws, 10-fold CV, ROC AUC) for an L1-penalised logistic regression.
model = LogisticRegression(penalty="l1", solver="liblinear", random_state=7)

param_dist = {
    "C": sp_randint(1, 1000),
    "class_weight": [None, "balanced"],
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="roc_auc",
    random_state=7,
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# Rebuild the classifier with the winning settings; `best` holds exactly C and class_weight.
cl = LogisticRegression(penalty="l1", solver="liblinear", random_state=7, **best)

# NOTE(review): the model is cross-validated on the hold-out split itself rather than
# fitted on x_train and scored on x_test — confirm this is the intended evaluation.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()}, SD: {cl_roc.std()}")

# Confusion matrix with class 1 listed first: rows are the truth, columns the prediction.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'C': 45, 'class_weight': 'balanced'}

auc after 10-fold cv: 0.9749136224630212, SD: 0.007805963143871724


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),5727,448
Truth(not given),45,334


**Randomforest**

In [21]:
# Randomised search (20 draws, 10-fold CV, ROC AUC) for a random forest.
model = RandomForestClassifier(random_state=7)

param_dist = {
    "max_features": list(np.arange(1,10))+[None,"sqrt"],
    "max_depth": list(np.arange(3,30))+[None],
    "class_weight": [None,"balanced"],
    "n_estimators": np.arange(10,150,5),
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="roc_auc",
    random_state=7,
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# `best` holds exactly the four searched settings, so it can be unpacked directly.
cl = RandomForestClassifier(random_state=7, **best)

# NOTE(review): the model is cross-validated on the hold-out split itself rather than
# fitted on x_train and scored on x_test — confirm this is the intended evaluation.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()}, SD: {cl_roc.std()}")

# Confusion matrix with class 1 listed first: rows are the truth, columns the prediction.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'n_estimators': 45, 'max_features': 5, 'max_depth': 11, 'class_weight': None}

auc after 10-fold cv: 0.9825065932157294, SD: 0.00797892676323939


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),6105,70
Truth(not given),125,254


**Gradient boosting**

In [22]:
# Randomised search (20 draws, 10-fold CV, ROC AUC) for a gradient-boosting classifier.
model = GradientBoostingClassifier(random_state=7)

param_dist = {
    "max_features": list(np.arange(1,10))+[None,"sqrt"],
    "max_depth": list(np.arange(3,30))+[None],
    "n_estimators": np.arange(10,150,5),
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="roc_auc",
    random_state=7,
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# `best` holds exactly the three searched settings, so it can be unpacked directly.
cl = GradientBoostingClassifier(random_state=7, **best)

# NOTE(review): the model is cross-validated on the hold-out split itself rather than
# fitted on x_train and scored on x_test — confirm this is the intended evaluation.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()}, SD: {cl_roc.std()}")

# Confusion matrix with class 1 listed first: rows are the truth, columns the prediction.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'n_estimators': 125, 'max_features': 'sqrt', 'max_depth': 5}

auc after 10-fold cv: 0.98632094850416, SD: 0.004090922485176993


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),6103,72
Truth(not given),122,257


**Looking at the above performance, all the 3 models give good AUC. Lets see the auc on the kaggle test set**

In [23]:
# Kaggle AUC: 85.45% (massive improvement for logistic regression, up from 59% in the previous part)

# Refit the tuned logistic regression on the whole training set.
final_model = LogisticRegression(
    penalty="l1", C=45, class_weight="balanced", solver="liblinear", random_state=7
).fit(x, y)
# Second probability column, i.e. the probability of the larger class label (action = 1).
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Submission frame: test row id plus predicted probability.
submit = pd.DataFrame({"Id": test["id"], "Action": final_predictions})
#submit.to_csv('PT2-lr.csv',index=False)

In [24]:
# Kaggle AUC: 77.6% (decreased from 86.5% in part 1)

# Refit the tuned random forest on the whole training set.
final_model = RandomForestClassifier(
    n_estimators=45, max_depth=11, max_features=5, random_state=7
).fit(x, y)
# Second probability column, i.e. the probability of the larger class label (action = 1).
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Submission frame: test row id plus predicted probability.
submit = pd.DataFrame({"Id": test["id"], "Action": final_predictions})
#submit.to_csv('PT2-rf.csv',index=False)

In [25]:
# Kaggle AUC: 83.46% (almost the same as part 1)

# Refit the tuned gradient-boosting model on the whole training set.
final_model = GradientBoostingClassifier(
    n_estimators=125, max_depth=5, max_features="sqrt", random_state=7
).fit(x, y)
# Second probability column, i.e. the probability of the larger class label (action = 1).
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Submission frame: test row id plus predicted probability.
submit = pd.DataFrame({"Id": test["id"], "Action": final_predictions})
#submit.to_csv('PT2-gb.csv',index=False)

**Surprisingly, the Kaggle AUC scores did not improve as expected, even though the hold-out performance was remarkably better than in part 1. The logistic regression model that completely bombed in part 1 showed a massive improvement with this kind of encoding, while at the same time random forest went down to 77.6% from 86.5% in part 1. Gradient boosting gave a fairly consistent performance. This methodology is prone to overfitting. The main reason is that it relies on a strong assumption: that the distribution of the categorical variables is the same in both the train and test sets. Clearly, this is not the case. We have seen above that there are a lot of labels in the test set that are not present in the train set (we replaced those labels with the global target mean). Beyond this, the labels that are present in both sets still have somewhat different distributions. A label may occur many times in the train set but only a few times in the test set, or a label may have a high probability of target=1 in the train set but a lower probability of target=1 in the test set.**

In [26]:
# Discard the encoded columns appended in part 2, keeping only the first ten columns.
train = train.drop(columns=train.columns[10:])
test = test.drop(columns=test.columns[10:])

# PART 3
**kfold mean target encoding. In an attempt to combat the overfitting in the above methodology we will use k-fold target encoding in this part.**<br><br>
https://medium.com/@pouryaayria/k-fold-target-encoding-dfe9a594874b#:~:text=In%20the%20mean%2Dtarget%20encoding,the%20target%20corresponding%20to%20them.&text=However%2C%20this%20approach%20might%20have,test%20dataset%20are%20considerably%20different.<br><br>
https://necromuralist.github.io/kaggle-competitions/posts/mean-encoding/

In [28]:
x=train[train.columns[1:]]
y=train["action"]

In [29]:
# Out-of-fold mean-target encoding: every validation fold is encoded with target means
# computed on the other four folds only, so no row contributes to its own encoding.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
encoded_folds = []

for tr_idx, val_idx in folds.split(x,y):

    train_fold = train.iloc[tr_idx]
    # Explicit copy: new columns are added to this fold, not to a slice of `train`.
    val_fold = train.iloc[val_idx].copy()
    # Fallback for labels of the validation fold that are absent from the training folds.
    fold_target_mean = train_fold["action"].mean()

    for cols in train.columns[1:]:
        mappings = train_fold.groupby(cols)["action"].mean()
        # map + fillna in one assignment replaces the chained `df[col][mask] = ...`,
        # which may write to a temporary copy.
        val_fold[cols+"--new"] = val_fold[cols].map(mappings).fillna(fold_target_mean)

    encoded_folds.append(val_fold)

# Stack all validation folds; DataFrame.append was removed in pandas 2.0, and a single
# concat avoids re-copying the accumulated frame on every iteration.
tt = pd.concat(encoded_folds)
    

In [30]:
def kfold_mean_enc(col):
    """Encode `col` in `train` and `test` with the averaged out-of-fold encoding from `tt`.

    For every label of `col`, the "<col>--new" values stored in `tt` are averaged and
    the result is written to a new "<col>--new" column of both global frames.
    Test labels that do not occur in `tt` are left as NaN. Nothing is returned.
    """
    encoded_name = col+"--new"
    mapp = tt.groupby(col)[encoded_name].mean()
    train[encoded_name] = train[col].map(mapp)
    test[encoded_name] = test[col].map(mapp)

In [31]:
for cols in train.columns[1:]:
    kfold_mean_enc(cols)

**Again replace all the newly encountered values in test set by global target mean of the training set. Since the newly encountered values in test set will not be mapped therefore they will be replaced by nan. We will replace all the nan values in the test set by global target mean.**

In [32]:
# Test labels that never occur in the training set were mapped to NaN; give them
# the global target mean of the training set instead.
# fillna + assignment replaces the chained `test[col][mask] = ...`, which may write
# to a temporary copy and leave `test` unchanged.
global_target_mean = train["action"].mean()
for col in test.columns[10:]:
    test[col] = test[col].fillna(global_target_mean)

In [33]:
# Model inputs are the encoded columns (position 10 onwards); the target is "action".
x = train.iloc[:, 10:]
y = train["action"]

# Stratified 80/20 split so both parts keep the class ratio of the target.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=100
)

# Encoded features of the Kaggle test set, used for the submissions.
f_test = test.iloc[:, 10:]

**logistic regression**

In [241]:
# Randomised search (20 draws, 10-fold CV, ROC AUC) for an L1-penalised logistic regression.
model = LogisticRegression(penalty="l1", solver="liblinear", random_state=7)

param_dist = {
    "C": sp_randint(1, 1000),
    "class_weight": [None, "balanced"],
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="roc_auc",
    random_state=7,
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# Rebuild the classifier with the winning settings; `best` holds exactly C and class_weight.
cl = LogisticRegression(penalty="l1", solver="liblinear", random_state=7, **best)

# NOTE(review): the model is cross-validated on the hold-out split itself rather than
# fitted on x_train and scored on x_test — confirm this is the intended evaluation.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()}, SD: {cl_roc.std()}")

# Confusion matrix with class 1 listed first: rows are the truth, columns the prediction.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'C': 45, 'class_weight': 'balanced'}

auc after 10-fold cv: 0.9590192393464465, SD: 0.010572248692860282


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),5619,556
Truth(not given),56,323


**randomforest**

In [243]:
# Randomised search (20 draws, 10-fold CV, ROC AUC) for a random forest.
model = RandomForestClassifier(random_state=7)

param_dist = {
    "max_features": list(np.arange(1,10))+[None,"sqrt"],
    "max_depth": list(np.arange(3,30))+[None],
    "class_weight": [None,"balanced"],
    "n_estimators": np.arange(10,150,5),
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="roc_auc",
    random_state=7,
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# `best` holds exactly the four searched settings, so it can be unpacked directly.
cl = RandomForestClassifier(random_state=7, **best)

# NOTE(review): the model is cross-validated on the hold-out split itself rather than
# fitted on x_train and scored on x_test — confirm this is the intended evaluation.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()}, SD: {cl_roc.std()}")

# Confusion matrix with class 1 listed first: rows are the truth, columns the prediction.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'n_estimators': 45, 'max_features': 5, 'max_depth': 11, 'class_weight': None}

auc after 10-fold cv: 0.9716647650641195, SD: 0.007982734046739


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),6090,85
Truth(not given),170,209


**gradient boost**

In [244]:
# Randomised search (20 draws, 10-fold CV, ROC AUC) for a gradient-boosting classifier.
model = GradientBoostingClassifier(random_state=7)

param_dist = {
    "max_features": list(np.arange(1,10))+[None,"sqrt"],
    "max_depth": list(np.arange(3,30))+[None],
    "n_estimators": np.arange(10,150,5),
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring="roc_auc",
    random_state=7,
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# `best` holds exactly the three searched settings, so it can be unpacked directly.
cl = GradientBoostingClassifier(random_state=7, **best)

# NOTE(review): the model is cross-validated on the hold-out split itself rather than
# fitted on x_train and scored on x_test — confirm this is the intended evaluation.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()}, SD: {cl_roc.std()}")

# Confusion matrix with class 1 listed first: rows are the truth, columns the prediction.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'n_estimators': 125, 'max_features': 'sqrt', 'max_depth': 5}

auc after 10-fold cv: 0.9730005021709509, SD: 0.006517650044158913


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),6083,92
Truth(not given),174,205


**lets evaluate the model on the final kaggle test set**

In [34]:
# Kaggle AUC: 86.1%

# Refit the tuned logistic regression on the whole training set.
final_model = LogisticRegression(
    penalty="l1", C=45, class_weight="balanced", solver="liblinear", random_state=7
).fit(x, y)
# Second probability column, i.e. the probability of the larger class label (action = 1).
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Submission frame: test row id plus predicted probability.
submit = pd.DataFrame({"Id": test["id"], "Action": final_predictions})
# submit.to_csv('PT3-lr.csv',index=False)

In [37]:
# Kaggle AUC: 77.2% (almost the same as part 2)

# Refit the tuned random forest on the whole training set.
final_model = RandomForestClassifier(
    n_estimators=45, max_depth=11, max_features=5, random_state=7
).fit(x, y)
# Second probability column, i.e. the probability of the larger class label (action = 1).
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Submission frame: test row id plus predicted probability.
submit = pd.DataFrame({"Id": test["id"], "Action": final_predictions})
# submit.to_csv('PT3-rf.csv',index=False)

In [36]:
# Kaggle AUC: 83.2% (almost the same as part 2)

# Refit the tuned gradient-boosting model on the whole training set.
final_model = GradientBoostingClassifier(
    n_estimators=125, max_depth=5, max_features="sqrt", random_state=7
).fit(x, y)
# Second probability column, i.e. the probability of the larger class label (action = 1).
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Submission frame: test row id plus predicted probability.
submit = pd.DataFrame({"Id": test["id"], "Action": final_predictions})
# submit.to_csv('PT3-gb.csv',index=False)

**we get almost similar results. The distribution in test set is very different from train set**

In [38]:
# Discard the encoded columns appended in part 3, keeping only the first ten columns.
train = train.drop(columns=train.columns[10:])
test = test.drop(columns=test.columns[10:])

# PART 4
**weight of evidence. WOE encodes each feature using the following formula:**<br><br>Lets suppose we have a categorical feature and we are encoding say label 5 (l5) of that feature<br>  
vi = log((pi / p) / (ni / n))   where<br>
pi = number of l5's with target class 1<br> 
ni = number of l5's with target class 0<br>
p = total number of records with class 1<br>
n = total number of records with class 0

In [40]:
import category_encoders as ce

# Every column except the target is treated as a categorical feature to encode.
feature_mask = train.columns != 'action'
cols = list(train.columns[feature_mask])

# Weight-of-evidence encoder restricted to those feature columns.
woe_encoder = ce.WOEEncoder(cols=cols)

In [41]:
# Features / target for PART 4. The feature columns are cast to strings before
# they are handed to the WOE encoder.
# `astype` returns new frames, so nothing is assigned column-by-column into a
# frame that was sliced out of `train` / `test` (the chained-assignment pattern
# that pandas warns about; the warning is hidden by the global warnings filter).
x = train[cols].astype("str")
y = train["action"]

# All test columns except the first one (the submission id); only the columns
# listed in `cols` are converted, exactly as the original loop did.
f_test = test[test.columns[1:]].astype({c: "str" for c in cols})

In [42]:
# Fit the WOE encoder on the full labelled feature frame and overwrite x with
# its encoded version. Re-running this cell would encode the already-encoded x
# again, so it is not idempotent.
# NOTE(review): the encoder is fitted with every row's target before the
# train / hold-out split that follows, so hold-out metrics may be optimistic —
# consider fitting on the training split only.
x = woe_encoder.fit_transform(x, y)
# Encode the Kaggle test features with the already-fitted encoder (no refit).
f_test = woe_encoder.transform(f_test)

In [43]:
# Hold out 20% of the encoded rows, stratified on the target; the fixed seed
# keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=100,
)

**logistic regression**

In [73]:
# Tune an L1-penalised logistic regression with a randomised search:
# 20 sampled settings, 10-fold CV on the training split, ranked by ROC AUC.
model = LogisticRegression(penalty="l1", random_state=7, solver="liblinear")

param_dist = {
    "C": sp_randint(1, 1000),
    "class_weight": [None, "balanced"],
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    random_state=7,
    scoring='roc_auc',
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# Rebuild the estimator with the winning settings; `best` carries exactly the
# keys searched above (C and class_weight).
cl = LogisticRegression(penalty="l1", random_state=7, solver="liblinear", **best)

# NOTE(review): both calls run a fresh 10-fold CV inside the held-out split
# rather than scoring a model fitted on x_train — confirm this is intended.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()!s}, SD: {cl_roc.std()!s}")

# Confusion matrix with class 1 ("given") listed first.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'C': 580, 'class_weight': 'balanced'}

auc after 10-fold cv: 0.9202257815971088, SD: 0.018717648855262038


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),5258,917
Truth(not given),61,318


**random forest**

In [75]:
# Tune a random forest with a randomised search:
# 20 sampled settings, 10-fold CV on the training split, ranked by ROC AUC.
model = RandomForestClassifier(random_state=7)

param_dist = {
    "max_features": [*np.arange(1, 10), None, "sqrt"],
    "max_depth": [*np.arange(3, 30), None],
    "class_weight": [None, "balanced"],
    "n_estimators": np.arange(10, 150, 5),
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    random_state=7,
    scoring='roc_auc',
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# Rebuild the estimator with the winning settings; `best` carries exactly the
# keys searched above (n_estimators, max_depth, max_features, class_weight).
cl = RandomForestClassifier(random_state=7, **best)

# NOTE(review): both calls run a fresh 10-fold CV inside the held-out split
# rather than scoring a model fitted on x_train — confirm this is intended.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()!s}, SD: {cl_roc.std()!s}")

# Confusion matrix with class 1 ("given") listed first.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'n_estimators': 45, 'max_features': 5, 'max_depth': 11, 'class_weight': None}

auc after 10-fold cv: 0.9258807537774327, SD: 0.019822241922205268


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),6092,83
Truth(not given),208,171


**gradient boost**

In [76]:
# Tune a gradient boosting classifier with a randomised search:
# 20 sampled settings, 10-fold CV on the training split, ranked by ROC AUC.
model = GradientBoostingClassifier(random_state=7)

param_dist = {
    "max_features": [*np.arange(1, 10), None, "sqrt"],
    "max_depth": [*np.arange(3, 30), None],
    "n_estimators": np.arange(10, 150, 5),
}

randomCV = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    random_state=7,
    scoring='roc_auc',
)
randomCV.fit(x_train, y_train)

best = randomCV.best_params_
print(best, end="\n\n")

# Rebuild the estimator with the winning settings; `best` carries exactly the
# keys searched above (n_estimators, max_depth, max_features).
cl = GradientBoostingClassifier(random_state=7, **best)

# NOTE(review): both calls run a fresh 10-fold CV inside the held-out split
# rather than scoring a model fitted on x_train — confirm this is intended.
cl_roc = cross_val_score(cl, x_test, y_test, cv=10, scoring="roc_auc")
cl_pred = cross_val_predict(cl, x_test, y_test, cv=10)

print(f"auc after 10-fold cv: {cl_roc.mean()!s}, SD: {cl_roc.std()!s}")

# Confusion matrix with class 1 ("given") listed first.
mat_train = confusion_matrix(y_test, cl_pred, labels=[1, 0])
cl_cm = pd.DataFrame(
    mat_train,
    index=["Truth(given)", "Truth(not given)"],
    columns=["Predict(given)", "Predict(not given)"],
)
cl_cm

{'n_estimators': 125, 'max_features': 'sqrt', 'max_depth': 5}

auc after 10-fold cv: 0.9309077636290798, SD: 0.018294911886672552


Unnamed: 0,Predict(given),Predict(not given)
Truth(given),6081,94
Truth(not given),205,174


**Let's evaluate the models on the final Kaggle test set**

In [44]:
# 83.2%

# Refit the L1-regularised logistic regression (C=580, balanced class weights)
# on all WOE-encoded rows, then take the class-1 probability (column 1 of
# predict_proba) for every Kaggle test row.
final_model = LogisticRegression(
    penalty="l1",
    C=580,
    class_weight="balanced",
    random_state=7,
    solver="liblinear",
)
final_model.fit(x, y)
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Pair ids and predictions by position in a single constructor call instead of
# assigning into an empty frame (which index-aligns `Id` and can silently yield
# NaN ids if `test` does not carry a default 0..n-1 index). A length mismatch
# now raises instead of being padded.
# assumes f_test rows are in the same order as `test` — TODO confirm
submit = pd.DataFrame({"Id": test["id"].to_numpy(), "Action": final_predictions})
#submit.to_csv('PT4-lr.csv',index=False)

In [45]:
# 86%

# Refit the random forest (45 trees, depth 11, 5 features per split) on all
# WOE-encoded rows, then take the class-1 probability (column 1 of
# predict_proba) for every Kaggle test row.
final_model = RandomForestClassifier(
    n_estimators=45,
    max_depth=11,
    max_features=5,
    random_state=7,
)
final_model.fit(x, y)
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Pair ids and predictions by position in a single constructor call instead of
# assigning into an empty frame (which index-aligns `Id` and can silently yield
# NaN ids if `test` does not carry a default 0..n-1 index). A length mismatch
# now raises instead of being padded.
# assumes f_test rows are in the same order as `test` — TODO confirm
submit = pd.DataFrame({"Id": test["id"].to_numpy(), "Action": final_predictions})
#submit.to_csv('PT4-rf.csv',index=False)

In [46]:
# 86.65%

# Refit the gradient boosting model (125 trees, depth 5, sqrt features) on all
# WOE-encoded rows, then take the class-1 probability (column 1 of
# predict_proba) for every Kaggle test row.
final_model = GradientBoostingClassifier(
    n_estimators=125,
    max_depth=5,
    max_features="sqrt",
    random_state=7,
)
final_model.fit(x, y)
final_predictions = final_model.predict_proba(f_test)[:, 1]

# Pair ids and predictions by position in a single constructor call instead of
# assigning into an empty frame (which index-aligns `Id` and can silently yield
# NaN ids if `test` does not carry a default 0..n-1 index). A length mismatch
# now raises instead of being padded.
# assumes f_test rows are in the same order as `test` — TODO confirm
submit = pd.DataFrame({"Id": test["id"].to_numpy(), "Action": final_predictions})
#submit.to_csv('PT4-gb.csv',index=False)

**Weight of evidence gave the most consistent results, followed by k-fold target encoding. With k-fold target encoding the random forest did not perform well, but with WOE all the models performed well.**