In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("data/train.csv")

In [7]:
df = df.replace(r'^\s*$', np.nan, regex=True)

In [8]:
df.isnull().sum()

ID                     0
STATE               1842
AGE                    0
OCCUPATION         42777
INCOME_GROUP        3437
CUSTOMER_SINCE         0
LOYALTY_PROGRAM    17824
PAST_PURCHASE          0
PURCHASE               0
dtype: int64

As we can see here, there are way too many NaN values in the occupation column. Most would recommend to drop that column, let's see what we get when we drop the column and fill the NaNs in other columns with NaN. 

In [10]:
# Saving the original df as og
og = df.copy()

In [17]:
#Dropping the Occupation and ID columns. 
df = df.drop(['ID','OCCUPATION'], axis = 1)

In [18]:
#Filling with mode for the other columns
df['STATE'].fillna(df['STATE'].mode()[0], inplace=True)
df['INCOME_GROUP'].fillna(df['INCOME_GROUP'].mode()[0], inplace=True)
df['LOYALTY_PROGRAM'].fillna(df['LOYALTY_PROGRAM'].mode()[0], inplace=True)

In [19]:
df

Unnamed: 0,STATE,AGE,INCOME_GROUP,CUSTOMER_SINCE,LOYALTY_PROGRAM,PAST_PURCHASE,PURCHASE
0,Uttar Pradesh,28,Low,2018,No,4937,No
1,Madhya Pradesh,27,Low,2018,No,10834,No
2,West Bengal,62,High,2013,No,8610,Yes
3,Tamil Nadu,27,Low,2018,No,14374,Yes
4,Uttar Pradesh,40,High,2015,No,13158,No
...,...,...,...,...,...,...,...
149995,Assam,29,Low,2017,No,6241,No
149996,Andhra Pradesh,53,Medium,2010,No,29447,No
149997,Odisha,31,Low,2018,No,4856,No
149998,Andhra Pradesh,54,High,2010,Yes,10031,Yes


In [20]:
df['LOYALTY_PROGRAM'] = df['LOYALTY_PROGRAM'].map({'Yes': 1, 'No': 0})
df['PURCHASE'] = df['PURCHASE'].map({'Yes': 1, 'No': 0})

In [21]:
df

Unnamed: 0,STATE,AGE,INCOME_GROUP,CUSTOMER_SINCE,LOYALTY_PROGRAM,PAST_PURCHASE,PURCHASE
0,Uttar Pradesh,28,Low,2018,0,4937,0
1,Madhya Pradesh,27,Low,2018,0,10834,0
2,West Bengal,62,High,2013,0,8610,1
3,Tamil Nadu,27,Low,2018,0,14374,1
4,Uttar Pradesh,40,High,2015,0,13158,0
...,...,...,...,...,...,...,...
149995,Assam,29,Low,2017,0,6241,0
149996,Andhra Pradesh,53,Medium,2010,0,29447,0
149997,Odisha,31,Low,2018,0,4856,0
149998,Andhra Pradesh,54,High,2010,1,10031,1


In [27]:
from sklearn.model_selection import train_test_split

In [31]:
#Split the dataset into 70(train) - 30(test)
X = df.drop(['PURCHASE'], axis=1)
y = df['PURCHASE']

In [30]:
import category_encoders as ce

In [32]:
# Define catboost encoder
cbe_encoder = ce.cat_boost.CatBoostEncoder()
  
# Fit encoder and transform the features
cbe_encoder.fit(X, y)
train_cbe = cbe_encoder.transform(X)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(train_cbe, y, test_size = 0.3, random_state = 0)

Now let's try all three state of the art models on this data and see which performs better. 

In [36]:
#xgboost
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [54]:
from xgboost import XGBClassifier
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [55]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [56]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
f1_score(y_test, predictions)

Accuracy: 76.49%


0.2663800873604659

In [57]:
#catboost
from catboost import CatBoostClassifier

In [58]:
model = CatBoostClassifier()

In [59]:
model.fit(X_train, y_train)

Learning rate set to 0.075159
0:	learn: 0.6563283	total: 28.5ms	remaining: 28.5s
1:	learn: 0.6266789	total: 53.3ms	remaining: 26.6s
2:	learn: 0.6021772	total: 81ms	remaining: 26.9s
3:	learn: 0.5810379	total: 111ms	remaining: 27.7s
4:	learn: 0.5641390	total: 141ms	remaining: 28s
5:	learn: 0.5496715	total: 170ms	remaining: 28.1s
6:	learn: 0.5379880	total: 197ms	remaining: 28s
7:	learn: 0.5278740	total: 226ms	remaining: 28.1s
8:	learn: 0.5194011	total: 253ms	remaining: 27.8s
9:	learn: 0.5128717	total: 281ms	remaining: 27.8s
10:	learn: 0.5073795	total: 302ms	remaining: 27.1s
11:	learn: 0.5025656	total: 330ms	remaining: 27.2s
12:	learn: 0.4984490	total: 357ms	remaining: 27.1s
13:	learn: 0.4949049	total: 386ms	remaining: 27.2s
14:	learn: 0.4917829	total: 417ms	remaining: 27.4s
15:	learn: 0.4896244	total: 445ms	remaining: 27.4s
16:	learn: 0.4873339	total: 470ms	remaining: 27.2s
17:	learn: 0.4851889	total: 497ms	remaining: 27.1s
18:	learn: 0.4835376	total: 524ms	remaining: 27s
19:	learn: 0.482

<catboost.core.CatBoostClassifier at 0x21def4d7850>

In [60]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [61]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
f1_score(y_test, predictions)

Accuracy: 76.59%


0.27409908358023843

In [45]:
from lightgbm import LGBMClassifier

In [47]:
model = LGBMClassifier()

In [48]:
model.fit(X_train, y_train)

LGBMClassifier()

In [49]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [50]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 76.62%


In [51]:
from sklearn.metrics import f1_score

In [53]:
f1_score(y_test, predictions)

0.2506233525682126

In this method, where we dropped the OCCUPATION column, we got a maximum accuracy of 76.62%. While the accuracy is not very bad, the f1 is bad. We need to look for other means to solve this problem. 