In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

In [33]:
train = pd.read_csv("dataset/train.csv", low_memory=False)
tester = pd.read_csv("dataset/test.csv", low_memory=False)

In [3]:
features = train.iloc[:, :-1]
labels = train.iloc[:,-1].values

In [4]:
features.dropna(axis=1, how='all', inplace=True)
features.dropna(axis=0, how='all', inplace=True)
features.drop('city', axis=1, inplace=True)
features.drop('zip', axis=1, inplace=True)

In [5]:
features.shape

(300000, 370)

In [6]:
frac = len(features) * 0.7
features.dropna(thresh=frac, axis=1, inplace=True)
features.shape

(300000, 213)

In [7]:
null_columns = features.columns[features.isnull().any()]
features[null_columns].isnull().sum()

OCCUP_ALL_NEW               96
dependents               26382
FINAL_WORTH_prev1          186
ENGAGEMENT_TAG_prev1       186
Recency_of_CR_TXN        33194
Recency_of_DR_TXN        23252
Recency_of_BRANCH_TXN    18368
Recency_of_Activity        477
dtype: int64

In [8]:
obj = features.select_dtypes(include=[object]).columns

feature_indices = []
for col in obj:
    feature_indices.append(features.columns.get_loc(col))
feature_indices

[2, 4, 6, 7, 176, 188, 189, 190, 191, 193, 195]

In [9]:
for col in features.columns.values:
    if features[col].dtypes != 'object':
        features[col].fillna(features[col].mean(), inplace=True)

In [10]:
for col in features.columns.values:
    if features[col].dtypes == 'object':
        features[col].fillna(features[col].value_counts().index[0], inplace=True)

In [11]:
all_columns = train.columns.tolist()
del(train)
new_columns = features.columns.tolist()
len(new_columns)

213

In [12]:
labelencoder = LabelEncoder()

for col in features.columns.values:
    if features[col].dtypes == 'object':
        features[col] = labelencoder.fit_transform(features[col])

In [13]:
onehotencoder = OneHotEncoder(categorical_features = feature_indices)
features = onehotencoder.fit_transform(features).toarray()

In [14]:
std_scalar = StandardScaler()
features = std_scalar.fit_transform(features)

In [15]:
train_X, test_X, train_y, test_y = train_test_split(features, labels, test_size=0.33, random_state=42)

In [16]:
# Trying different classifiers
ada_clf = AdaBoostClassifier()
gbc_clf = GradientBoostingClassifier()
ran_clf = RandomForestClassifier()

In [17]:
ada_clf.fit(train_X, train_y)
test_pred = ada_clf.predict(test_X)
print("Ada boost accuracy: {}".format(accuracy_score(test_y, test_pred)))

Ada boost accuracy: 0.867989898989899


In [18]:
gbc_clf.fit(train_X, train_y)
test_pred = gbc_clf.predict(test_X)
print("Gradient boost accuracy: {}".format(accuracy_score(test_y, test_pred)))

Gradient boost accuracy: 0.8772323232323233


In [19]:
ran_clf.fit(train_X, train_y)
test_pred = ran_clf.predict(test_X)
print("Random Forest accuracy: {}".format(accuracy_score(test_y, test_pred)))

Random Forest accuracy: 0.8665050505050506


In [21]:
# keep only the columns that were present in the training set
to_be_removed = [x for x in all_columns if x not in new_columns]
to_be_removed.remove('Responders')

In [22]:
tester.drop(to_be_removed, axis=1, inplace=True)

In [23]:
obj_test = tester.select_dtypes(include=[object]).columns

feature_indices_test = []
for col in obj_test:
    feature_indices_test.append(tester.columns.get_loc(col))

In [24]:
for col in tester.columns.values:
    if tester[col].dtypes != 'object':
        tester[col].fillna(tester[col].mean(), inplace=True)

In [25]:
for col in tester.columns.values:
    if tester[col].dtypes == 'object':
        tester[col].fillna(tester[col].value_counts().index[0], inplace=True)

In [26]:
labelencoder = LabelEncoder()

for col in tester.columns.values:
    if tester[col].dtypes == 'object':
        tester[col] = labelencoder.fit_transform(tester[col])

In [27]:
onehotencoder = OneHotEncoder(categorical_features=feature_indices_test)
tester = onehotencoder.fit_transform(tester).toarray()

In [28]:
tester = std_scalar.fit_transform(tester)

In [30]:
predicted = gbc_clf.predict(tester)

In [32]:
submission = pd.DataFrame()

In [35]:
submission['UCIC_ID'] = tester['UCIC_ID']
submission['Responders'] = predicted

In [39]:
len(submission[submission.Responders == 0])

118041

In [40]:
len(submission[submission.Responders == 1])

81959

In [41]:
len(submission)

200000

In [42]:
submission.to_csv("submission1.csv", index=False)