In [2]:
# from __future__ import print_function

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import datasets, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.learning_curve import learning_curve, validation_curve
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix

pd.options.display.max_rows=60 #60
pd.options.display.max_columns=20 #20

In [3]:
# load data 

trainDF = pd.read_csv("~/Documents/kaggle/bnp/data/train.csv")
testDF = pd.read_csv("~/Documents/kaggle/bnp/data/test.csv")

In [4]:
# eliminate na value more than 120 columns
trainDF = trainDF.dropna(thresh=120)
testDF = testDF.dropna(thresh=120)

In [5]:
testDF.head()

Unnamed: 0,ID,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,0,1.375465,11.361141,C,4.200778,6.577,2.081784,1.784386,0.011094,9.52381,...,7.619048,1.815241,1.11227e-07,AF,1.292368,3.903345,1.485925,0,2.333334,1.428572
2,2,-4.903407e-07,8.201529,C,4.544371,6.5501,1.558442,2.467532,0.007164,7.142858,...,5.714286,1.970928,0.01412265,AV,1.128724,5.844156,1.475892,0,1.263157,-6.380022e-07
3,7,2.66187,3.041241,C,1.657216,9.77308,2.078337,1.430855,1.252157,7.959596,...,4.40404,8.163614,1.100329,B,1.988688,1.558753,2.448814,0,5.385474,1.493777
4,10,1.252822,11.283352,C,4.638388,8.52051,2.302484,3.510159,0.074263,7.612904,...,6.580644,1.325654,0.2584588,A,1.863796,2.666478,2.374275,0,0.681672,2.264151
5,11,1.733601,7.525109,C,3.263905,5.608608,2.35609,2.369477,0.065481,9.935483,...,5.032257,2.551374,-9.391195e-07,BM,1.541607,3.463855,1.196959,0,2.598869,1.217392


In [6]:
trainDF.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
6,12,0,0.899806,7.312995,C,3.494148,9.9462,1.92607,1.770427,0.066251,...,3.476299,1.992594,0.083758,BJ,3.2761,1.623298,2.266575,0,2.263736,0.970873
8,22,0,2.078651,8.462619,,3.73903,5.265636,1.573033,2.303371,0.015869,...,8.148148,1.87556,0.018659,S,1.159637,5.582865,1.105283,0,1.170731,3.333334


In [7]:
# defining join, drop, imputation functions

def join(df, col):
    df = df.join(pd.get_dummies(df[col], prefix=col))
    df = df.drop(col, axis=1)
    return df

def drop(df, col):
    df = df.drop(col, axis=1)
    return df

def fillna(df, col):
    df[col] = df[col].fillna(df[col].median(axis=0))
    return df

In [8]:
# v3, 24 30 31 47 52 66 71 74 75 79 91 107 110
# drop 22 38 56 62 113 125 129

list_join = ['v3', 'v24', 'v30', 'v31', 'v47', 'v52', 'v66', 'v71', 'v74', 'v91']
list_drop = ['ID','v8', 'v22', 'v23', 'v25','v36','v37', 'v38', 'v46','v51','v53','v54', 'v56', 'v62','v63','v73','v75','v79','v81','v82','v89','v92','v95',
             'v105', 'v107','v108','v109','v110', 'v113', 'v112', 'v116','v117','v118','v119','v123','v124', 'v125', 'v128', 'v129']

# list_join = ['v3', 'v24', 'v30', 'v31', 'v47', 'v52', 'v66', 'v71', 'v74', 'v75', 'v79', 'v91', 'v107', 'v110']
# list_drop = ['ID', 'v22', 'v38', 'v56', 'v62', 'v113', 'v125', 'v112', 'v129']

trainDF = join(trainDF, list_join)
trainDF = drop(trainDF, list_drop)

testDF = join(testDF, list_join)
testDF = drop(testDF, list_drop)

# get columns index of null value
indexTrain = trainDF.isnull().any()
indexTest = testDF.isnull().any()

# get columns name index
columnsTrain = trainDF.columns
columnsTest = testDF.columns

# create null list
list_index_null_train = []
list_index_null_test = []
list_columns_null_train = []
list_columns_null_test = []

# append to null list index if nan
for i in range(len(indexTrain)):
    if indexTrain[i]: list_index_null_train.append(i)

for i in range(len(indexTest)):
    if indexTest[i]: list_index_null_test.append(i)

list_index_null_test = list(set(map(lambda x: x-1, list_index_null_train) + list_index_null_test))

# append columns name to list
list_columns_null_train = map(lambda x: (columnsTrain[x]), list_index_null_train)
list_columns_null_test = map(lambda x: (columnsTest[x]), list_index_null_test)

# imputation of null value
trainDF = fillna(trainDF, list_columns_null_train)
testDF = fillna(testDF, list_columns_null_test)

In [9]:
# validate null value

print testDF.isnull().values.any()
print trainDF.isnull().values.any()

# print trainDF.columns.values

print list_index_null_test
print list_index_null_train

False
False
[3, 6, 7, 9, 11, 13, 16, 17, 18, 25, 28, 36, 44, 47, 48, 52, 53, 56, 58, 60, 64, 65, 69, 75, 77, 78, 81, 82]
[4, 7, 8, 10, 12, 14, 18, 19, 26, 29, 37, 45, 48, 49, 53, 54, 57, 59, 61, 66, 70, 76, 78, 79, 83]


In [10]:
# normalize using standard scaler

y_train = trainDF.iloc[:, 0].values
x_train = trainDF.ix[:, 'v1':]
x_test = testDF.ix[:, 'v1':]

sc = StandardScaler()

x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)


In [14]:
# feature correlation / importance
forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
forest.fit(x_train, y_train)
importances = forest.feature_importances_
print(importances)
indices = np.argsort(importances)[::-1]
print indices
# features sort by importances

[  1.06972911e-02   1.03071932e-02   1.03147132e-02   1.12139410e-02
   1.15418821e-02   1.02631337e-02   1.01919230e-02   1.72935160e-02
   1.01976932e-02   1.92938087e-02   9.68607962e-03   1.59527968e-02
   1.02202720e-02   1.04001105e-02   9.08617107e-03   1.09092477e-02
   1.00517540e-02   1.01467494e-02   1.38778348e-02   1.00054284e-02
   1.00243982e-02   1.13361126e-02   8.77986146e-03   9.82676987e-03
   9.26235541e-03   1.44595442e-02   1.00928182e-02   1.01943268e-02
   1.43461838e-02   8.85516575e-03   9.81000338e-03   9.85142925e-03
   1.03431426e-02   1.04645299e-02   8.85375242e-03   9.36438865e-03
   5.51399202e-02   9.90420710e-03   1.08332255e-02   9.82179740e-03
   9.78837057e-03   1.00181633e-02   9.77559630e-03   8.88059981e-03
   9.37600599e-03   9.00391292e-03   1.09263641e-02   1.07373428e-02
   1.02086402e-02   3.37621269e-03   9.17007508e-03   9.18031621e-03
   1.07090379e-02   9.95691298e-03   9.43895256e-03   9.72421881e-03
   1.07693505e-02   1.00381721e-02

In [41]:
labels = trainDF.columns
# map(lambda x: x, x_train.shape[1])
print trainDF.ix[:, 'v1':].shape[1]
print testDF.ix[:,:].shape[1]
# map(lambda x: print("%2d. %-*s %f" % (x + 1, 30, labels[indices[x]], importances[indices[x]])), x_train.shape[1])

for f in range(x_train.shape[1]):
    print("%2d. %-*s %f" % (f + 1, 30, labels[indices[f]], importances[indices[f]]))

141
143
 1. v49                            0.055140
 2. v11                            0.019294
 3. v9                             0.017294
 4. v13                            0.015953
 5. v111                           0.014772
 6. v33                            0.014460
 7. v39                            0.014346
 8. v20                            0.013878
 9. v5                             0.011542
10. v115                           0.011402
11. v27                            0.011336
12. v47_B                          0.011331
13. v98                            0.011277
14. v87                            0.011238
15. v4                             0.011214
16. v101                           0.011139
17. v67                            0.010926
18. v17                            0.010909
19. v55                            0.010833
20. v84                            0.010769
21. v68                            0.010737
22. v77                            0.010709
23. target              

In [85]:
x_train = forest.transform(x_train, threshold=.01)
x_test = forest.transform(x_test, threshold=.01)



ValueError: X has different number of features than during model fitting.

In [None]:
x_