In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(27)

In [2]:
train = pd.read_csv('criminal_train.csv')
test = pd.read_csv('criminal_test.csv')

train.head()

Unnamed: 0,PERID,IFATHER,NRCH17_2,IRHHSIZ2,IIHHSIZ2,IRKI17_2,IIKI17_2,IRHH65_2,IIHH65_2,PRXRETRY,...,TOOLONG,TROUBUND,PDEN10,COUTYP2,MAIIN102,AIIND102,ANALWT_C,VESTR,VEREP,Criminal
0,25095143,4,2,4,1,3,1,1,1,99,...,1,2,1,1,2,2,3884.805998,40026,1,0
1,13005143,4,1,3,1,2,1,1,1,99,...,2,2,2,3,2,2,1627.108106,40015,2,1
2,67415143,4,1,2,1,2,1,1,1,99,...,2,2,2,3,2,2,4344.95798,40024,1,0
3,70925143,4,0,2,1,1,1,1,1,99,...,2,2,1,1,2,2,792.521931,40027,1,0
4,75235143,1,0,6,1,4,1,1,1,99,...,2,2,2,2,2,2,1518.118526,40001,2,0


In [3]:
print("Training samples =", train.shape[0])
print("Testing samples =", test.shape[0])

Training samples = 45718
Testing samples = 11430


In [4]:
labels = train.pop('Criminal')
data = pd.concat([train, test])

test_id = test.PERID
train_rows = train.shape[0]
data.drop('PERID', axis=1, inplace=True)
data.shape

(57148, 70)

## Data Preprocessing

In [5]:
for col in data.columns:
    data[col] = data[col].replace(-1, np.nan)

data.isnull().sum()

IFATHER        2
NRCH17_2      92
IRHHSIZ2       2
IIHHSIZ2       2
IRKI17_2       2
IIKI17_2       2
IRHH65_2       2
IIHH65_2       2
PRXRETRY       2
PRXYDATA       2
MEDICARE       2
CAIDCHIP       2
CHAMPUS        2
PRVHLTIN       2
GRPHLTIN       2
HLTINNOS       2
HLCNOTYR       2
HLCNOTMO       2
HLCLAST        2
HLLOSRSN       2
HLNVCOST       2
HLNVOFFR       2
HLNVREF        2
HLNVNEED       2
HLNVSOR        2
IRMCDCHP       2
IIMCDCHP       2
IRMEDICR       2
IIMEDICR       2
IRCHMPUS       2
            ... 
OTHINS         2
CELLNOTCL      2
CELLWRKNG      2
IRFAMSOC       2
IIFAMSOC       2
IRFAMSSI       2
IIFAMSSI       2
IRFSTAMP       2
IIFSTAMP       2
IRFAMPMT       2
IIFAMPMT       2
IRFAMSVC       2
IIFAMSVC       2
IRWELMOS       2
IIWELMOS       2
IRPINC3        2
IRFAMIN3       2
IIPINC3        2
IIFAMIN3       2
GOVTPROG       2
POVERTY3     419
TOOLONG        2
TROUBUND       2
PDEN10         2
COUTYP2        2
MAIIN102       2
AIIND102       2
ANALWT_C      

In [6]:
from sklearn.preprocessing import Imputer

im = Imputer(strategy='most_frequent')

for col in data.columns:
    data[col] = im.fit_transform(data[[col]])

In [7]:
tmp = np.log1p(data['ANALWT_C'])
data = data.drop(['ANALWT_C'], axis=1)

cols = []
for col in data.columns:
    if col[:2] != 'HL':
        cols.append(col)
print("New Number of Columns =",len(cols))

data = data[cols]

New Number of Columns = 57


In [8]:
data = pd.get_dummies(data=data, columns=data.columns)

In [9]:
data['ANALWT_C'] = tmp

In [10]:
train = data[:train_rows]
test = data[train_rows:]

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split, cross_val_score


X_train, X_val, y_train, y_val = train_test_split(train, labels, test_size=0.20)
clf1 = RandomForestClassifier(n_estimators=100)
clf2 = XGBClassifier()
clf3 = GaussianNB()
clf = VotingClassifier(estimators=[('rf', clf1), ('xg', clf2), ('gnb', clf3)], voting='hard')

In [12]:
clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we... scale_pos_weight=1, seed=None,
       silent=True, subsample=1)), ('gnb', GaussianNB(priors=None))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [13]:
from sklearn.metrics import make_scorer, matthews_corrcoef

scorer = make_scorer(matthews_corrcoef)
score = np.mean(cross_val_score(clf, X_val, y_val, scoring=scorer, cv=3))

In [14]:
score

0.61891125541624359

In [15]:
clf.fit(train, labels)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we... scale_pos_weight=1, seed=None,
       silent=True, subsample=1)), ('gnb', GaussianNB(priors=None))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [16]:
score = np.mean(cross_val_score(clf, X_val, y_val, scoring=scorer, cv=3))
score

0.6190838192271173

In [17]:
d = {}
for k, v in zip(train.columns, clf.feature_importances_):
    d[k] = v

AttributeError: 'VotingClassifier' object has no attribute 'feature_importances_'

In [None]:
import operator
d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
for k, v in d:
    print("%15s   =   %0.10f" %(k, v))

In [18]:
predictions = clf.predict(test)

In [19]:
submission = pd.DataFrame({'PERID': test_id, 'Criminal':predictions})
submission = submission[['PERID', 'Criminal']]
submission.head()

Unnamed: 0,PERID,Criminal
0,66583679,0
1,35494679,0
2,79424679,0
3,11744679,0
4,31554679,0


In [20]:
submission.to_csv('predictions_vc.csv', index=False)