In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

In [2]:
train = pd.read_csv("dataset/train.csv", low_memory=False)
test = pd.read_csv("dataset/test.csv", low_memory=False)

In [3]:
train.set_index('UCIC_ID', inplace=True)

In [4]:
features = train.iloc[:, :-1]
labels = train.iloc[:,-1].values
test_ucic_id = test['UCIC_ID']

In [5]:
train.shape, test.shape

((300000, 376), (200000, 376))

In [38]:
features.dropna(axis=1, how='all', inplace=True)
features.dropna(axis=0, how='all', inplace=True)
#features.drop('city', axis=1, inplace=True)
features.drop('zip', axis=1, inplace=True)

In [7]:
features.shape

(300000, 371)

In [19]:
frac = len(features) * 0.7
features.dropna(thresh=frac, axis=1, inplace=True)
features.shape

(300000, 214)

In [20]:
null_columns = features.columns[features.isnull().any()]
features[null_columns].isnull().sum()

OCCUP_ALL_NEW               96
city                      5701
dependents               26382
zip                       1050
FINAL_WORTH_prev1          186
ENGAGEMENT_TAG_prev1       186
Recency_of_CR_TXN        33194
Recency_of_DR_TXN        23252
Recency_of_BRANCH_TXN    18368
Recency_of_Activity        477
dtype: int64

In [21]:
obj = features.select_dtypes(include=[object]).columns

feature_indices = []
for col in obj:
    feature_indices.append(features.columns.get_loc(col))

In [22]:
for col in features.columns.values:
    if features[col].dtypes != 'object':
        features[col].fillna(features[col].median(), inplace=True)

In [36]:
for col in features.columns.values:
    if features[col].dtypes == 'object':
        features[col].fillna(features[col].value_counts().index[0], inplace=True)

In [39]:
all_columns = train.columns.tolist()
new_columns = features.columns.tolist()
len(all_columns), len(new_columns)

(376, 213)

In [40]:
labelencoder = LabelEncoder()

for col in features.columns.values:
    if features[col].dtypes == 'object':
        features[col] = labelencoder.fit_transform(features[col])

In [14]:
std_scalar = StandardScaler()
features = std_scalar.fit_transform(features)

In [42]:
train_X, validate_X, train_y, validate_y = train_test_split(features, labels, test_size=0.33, random_state=42)

In [43]:
# Trying different classifiers
ada_clf = AdaBoostClassifier()
gbc_clf = GradientBoostingClassifier()
ran_clf = RandomForestClassifier()

In [44]:
ada_clf.fit(train_X, train_y)
test_pred = ada_clf.predict(validate_X)
print("Ada boost accuracy: {}".format(accuracy_score(validate_y, test_pred)))

Ada boost accuracy: 0.8683131313131313


In [45]:
gbc_clf.fit(train_X, train_y)
test_pred = gbc_clf.predict(validate_X)
print("Gradient boost accuracy: {}".format(accuracy_score(validate_y, test_pred)))

Gradient boost accuracy: 0.8785858585858586


In [46]:
ran_clf.fit(train_X, train_y)
test_pred = ran_clf.predict(validate_X)
print("Random Forest accuracy: {}".format(accuracy_score(validate_y, test_pred)))

Random Forest accuracy: 0.8663131313131314


In [21]:
# keep only the columns that were present in the training set
to_be_removed = [x for x in all_columns if x not in new_columns]
to_be_removed.remove('Responders')

In [22]:
test.drop(to_be_removed, axis=1, inplace=True)

In [23]:
obj_test = test.select_dtypes(include=[object]).columns

feature_indices_test = []
for col in obj_test:
    feature_indices_test.append(test.columns.get_loc(col))

In [24]:
for col in test.columns.values:
    if test[col].dtypes != 'object':
        test[col].fillna(test[col].mean(), inplace=True)

In [25]:
for col in test.columns.values:
    if test[col].dtypes == 'object':
        test[col].fillna(test[col].value_counts().index[0], inplace=True)

In [26]:
labelencoder = LabelEncoder()

for col in test.columns.values:
    if test[col].dtypes == 'object':
        test[col] = labelencoder.fit_transform(test[col])

In [30]:
predicted = gbc_clf.predict(test)

In [32]:
submission = pd.DataFrame()

In [35]:
submission['UCIC_ID'] = test_ucic_id
submission['Responders'] = predicted

In [39]:
len(submission[submission.Responders == 0])

118041

In [40]:
len(submission[submission.Responders == 1])

81959

In [41]:
len(submission)

200000

In [42]:
submission.to_csv("submission1.csv", index=False)

In [48]:
new_columns

['NO_OF_Accs',
 'HNW_CATEGORY',
 'vintage',
 'OCCUP_ALL_NEW',
 'city',
 'dependents',
 'FINAL_WORTH_prev1',
 'ENGAGEMENT_TAG_prev1',
 'C_prev1',
 'D_prev1',
 'ATM_C_prev1',
 'ATM_D_prev1',
 'BRANCH_C_prev1',
 'BRANCH_D_prev1',
 'IB_C_prev1',
 'IB_D_prev1',
 'MB_C_prev1',
 'MB_D_prev1',
 'POS_C_prev1',
 'POS_D_prev1',
 'count_C_prev1',
 'count_D_prev1',
 'COUNT_ATM_C_prev1',
 'COUNT_ATM_D_prev1',
 'COUNT_BRANCH_C_prev1',
 'COUNT_BRANCH_D_prev1',
 'COUNT_IB_C_prev1',
 'COUNT_IB_D_prev1',
 'COUNT_MB_C_prev1',
 'COUNT_MB_D_prev1',
 'COUNT_POS_C_prev1',
 'COUNT_POS_D_prev1',
 'CNR_prev1',
 'BAL_prev1',
 'EOP_prev1',
 'CR_AMB_Prev1',
 'C_prev2',
 'D_prev2',
 'ATM_C_prev2',
 'ATM_D_prev2',
 'BRANCH_C_prev2',
 'BRANCH_D_prev2',
 'IB_C_prev2',
 'IB_D_prev2',
 'MB_C_prev2',
 'MB_D_prev2',
 'POS_C_prev2',
 'POS_D_prev2',
 'count_C_prev2',
 'count_D_prev2',
 'COUNT_ATM_C_prev2',
 'COUNT_ATM_D_prev2',
 'COUNT_BRANCH_C_prev2',
 'COUNT_BRANCH_D_prev2',
 'COUNT_IB_C_prev2',
 'COUNT_IB_D_prev2',
 'COUN