In [94]:
# from __future__ import print_function

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import datasets, svm
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.grid_search import GridSearchCV
from sklearn.learning_curve import learning_curve, validation_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Notebook display limits: show at most 60 rows and 20 columns of a frame.
pd.set_option('display.max_rows', 60)
pd.set_option('display.max_columns', 20)

In [95]:
# Load the raw training and test CSV files into DataFrames.
train_csv_path = "~/Documents/kaggle/bnp/data/train.csv"
test_csv_path = "~/Documents/kaggle/bnp/data/test.csv"

trainDF = pd.read_csv(train_csv_path)
testDF = pd.read_csv(test_csv_path)

In [96]:
# Keep only rows that have at least 120 non-null values; sparser rows are
# discarded from both frames.
# NOTE(review): this also removes rows from the test frame - confirm that
# predictions are not required for the discarded test rows.
min_non_null = 120
trainDF, testDF = [frame.dropna(thresh=min_non_null) for frame in (trainDF, testDF)]

In [97]:
# Preview the first rows of the test frame (after the sparse-row filter).
testDF.head()

Unnamed: 0,ID,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,0,1.375465,11.361141,C,4.200778,6.577,2.081784,1.784386,0.011094,9.52381,...,7.619048,1.815241,1.11227e-07,AF,1.292368,3.903345,1.485925,0,2.333334,1.428572
2,2,-4.903407e-07,8.201529,C,4.544371,6.5501,1.558442,2.467532,0.007164,7.142858,...,5.714286,1.970928,0.01412265,AV,1.128724,5.844156,1.475892,0,1.263157,-6.380022e-07
3,7,2.66187,3.041241,C,1.657216,9.77308,2.078337,1.430855,1.252157,7.959596,...,4.40404,8.163614,1.100329,B,1.988688,1.558753,2.448814,0,5.385474,1.493777
4,10,1.252822,11.283352,C,4.638388,8.52051,2.302484,3.510159,0.074263,7.612904,...,6.580644,1.325654,0.2584588,A,1.863796,2.666478,2.374275,0,0.681672,2.264151
5,11,1.733601,7.525109,C,3.263905,5.608608,2.35609,2.369477,0.065481,9.935483,...,5.032257,2.551374,-9.391195e-07,BM,1.541607,3.463855,1.196959,0,2.598869,1.217392


In [98]:
# Preview the first rows of the training frame (after the sparse-row filter).
trainDF.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
6,12,0,0.899806,7.312995,C,3.494148,9.9462,1.92607,1.770427,0.066251,...,3.476299,1.992594,0.083758,BJ,3.2761,1.623298,2.266575,0,2.263736,0.970873
8,22,0,2.078651,8.462619,,3.73903,5.265636,1.573033,2.303371,0.015869,...,8.148148,1.87556,0.018659,S,1.159637,5.582865,1.105283,0,1.170731,3.333334


In [99]:
# defining join, drop, imputation functions

def join(df, col):
    """Replace the column(s) `col` with their one-hot (dummy) encoding.

    The dummy columns are built with `pd.get_dummies`, prefixed with `col`,
    appended to the right of `df`, and the source column(s) are then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str or list of str
        Name(s) of the column(s) to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns in place of `col`.
    """
    dummies = pd.get_dummies(df[col], prefix=col)
    return df.join(dummies).drop(col, axis=1)

def drop(df, col):
    """Return a copy of `df` without the column(s) named in `col`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : label or list-like of labels
        Column name(s) to remove.

    Returns
    -------
    pandas.DataFrame
        A new frame lacking the requested columns.
    """
    return df.drop(col, axis=1)

def fillna(df, col):
    """Impute missing values in the column(s) `col` with the column median.

    Note that `df` is modified in place: the selected columns are overwritten
    with their imputed versions, and the same frame object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute (mutated).
    col : str or list of str
        Name(s) of the column(s) whose NaN values are replaced.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    selected = df[col]
    # Median computed down the rows, i.e. one value per selected column.
    medians = selected.median(axis=0)
    df[col] = selected.fillna(medians)
    return df

In [100]:
# Feature preparation: one-hot encode the categorical columns in `list_join`,
# discard the columns in `list_drop`, median-impute the remaining NaN values,
# and finally align the train and test frames on a common set of columns.

list_join = ['v3', 'v24', 'v30', 'v31', 'v47', 'v52', 'v66', 'v71', 'v74', 'v91']
list_drop = ['ID','v8', 'v22', 'v23', 'v25','v36','v37', 'v38', 'v46','v51','v53','v54', 'v56', 'v62','v63','v73','v75','v79','v81','v82','v89','v92','v95',
             'v105', 'v107','v108','v109','v110', 'v113', 'v112', 'v116','v117','v118','v119','v123','v124', 'v125', 'v128', 'v129']

# Earlier, wider column selection kept for reference:
# list_join = ['v3', 'v24', 'v30', 'v31', 'v47', 'v52', 'v66', 'v71', 'v74', 'v75', 'v79', 'v91', 'v107', 'v110']
# list_drop = ['ID', 'v22', 'v38', 'v56', 'v62', 'v113', 'v125', 'v112', 'v129']

trainDF = join(trainDF, list_join)
trainDF = drop(trainDF, list_drop)

testDF = join(testDF, list_join)
testDF = drop(testDF, list_drop)

# Per-column flags: True where the column still contains at least one NaN.
indexTrain = trainDF.isnull().any()
indexTest = testDF.isnull().any()

columnsTrain = trainDF.columns
columnsTest = testDF.columns

# Positions and names of the training columns that need imputation.
list_index_null_train = [i for i, has_null in enumerate(indexTrain.values) if has_null]
list_columns_null_train = [columnsTrain[i] for i in list_index_null_train]

# Test columns to impute: every column with NaN in the test frame, plus every
# column that had NaN in the training frame. The two frames are matched by
# column NAME rather than by a positional offset, so the result does not
# depend on `target` being the first training column, and a train-only column
# (such as `target`) can never be mapped onto an unrelated test column.
null_names_train = set(list_columns_null_train)
list_index_null_test = [
    i for i, (name, has_null) in enumerate(zip(columnsTest, indexTest.values))
    if has_null or name in null_names_train
]
list_columns_null_test = [columnsTest[i] for i in list_index_null_test]

# Median imputation; each frame is filled with its own column medians.
trainDF = fillna(trainDF, list_columns_null_train)
testDF = fillna(testDF, list_columns_null_test)

# Align the frames: drop dummy columns that exist in only one of them.
# Index.difference is the explicit set-difference; `-` on an Index no longer
# means set difference in current pandas.
drop_columns = testDF.columns.difference(trainDF.columns)
testDF = drop(testDF, drop_columns)

# `target` exists only in the training frame and must be kept there.
drop_columns = trainDF.columns.difference(testDF.columns).drop('target')
trainDF = drop(trainDF, drop_columns)



In [101]:
# Sanity check after imputation: both flags are expected to print False
# (no NaN left in either frame), followed by the positions of the columns
# that were imputed in the test and training frames.
# Single-argument print(...) behaves identically on Python 2 and Python 3.

print(testDF.isnull().values.any())
print(trainDF.isnull().values.any())

print(list_index_null_test)
print(list_index_null_train)

False
False
[3, 6, 7, 9, 11, 13, 16, 17, 18, 25, 28, 36, 44, 47, 48, 52, 53, 56, 58, 60, 64, 65, 69, 75, 77, 78, 81, 82]
[4, 7, 8, 10, 12, 14, 18, 19, 26, 29, 37, 45, 48, 49, 53, 54, 57, 59, 61, 66, 70, 76, 78, 79, 83]


In [102]:
# Standardise the features (zero mean, unit variance per column).

# Label vector, selected by name so it does not depend on column position.
y_train = trainDF['target'].values
# Feature matrices: every column from 'v1' onwards (label-based slice).
x_train = trainDF.loc[:, 'v1':]
x_test = testDF.loc[:, 'v1':]

sc = StandardScaler()

# Fit the scaler on the training data only and reuse the SAME mean/scale for
# the test data. Refitting on the test set would standardise the two sets
# with different statistics and leak test information into preprocessing.
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)


In [103]:
# Feature importance: fit a random forest on all scaled features and read
# its per-feature importance scores.
forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
forest.fit(x_train, y_train)
importances = forest.feature_importances_
print(importances)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
print(indices)

[  1.07165220e-02   1.03820670e-02   1.01728199e-02   1.12554438e-02
   1.15056669e-02   1.02438120e-02   1.01115680e-02   1.74620820e-02
   1.01544972e-02   1.94547409e-02   9.64235904e-03   1.58633304e-02
   1.01581942e-02   1.03915595e-02   9.02794899e-03   1.08320610e-02
   9.94473157e-03   1.01859251e-02   1.39156450e-02   9.98917842e-03
   9.94485834e-03   1.11537594e-02   8.75397764e-03   9.78315396e-03
   9.27365441e-03   1.45715754e-02   1.01002119e-02   1.03177115e-02
   1.44939833e-02   8.83410496e-03   9.86616162e-03   9.94916133e-03
   1.03638405e-02   1.03796148e-02   8.97799186e-03   9.24780890e-03
   5.50686905e-02   9.82545795e-03   1.08190156e-02   9.82041322e-03
   9.80329442e-03   1.00300957e-02   9.80701771e-03   8.88219619e-03
   9.32750840e-03   8.93643898e-03   1.09005717e-02   1.07482087e-02
   1.01931523e-02   3.38883265e-03   9.15175163e-03   9.19037986e-03
   1.07196029e-02   9.94344732e-03   9.45848248e-03   9.75432314e-03
   1.07189492e-02   9.94492554e-03

In [104]:

# Columns that exist only in the training frame, not counting the label.
# After the alignment step this is expected to print an empty Index.
print(trainDF.columns.difference(testDF.columns).drop('target'))
### labels = trainDF.columns
# map(lambda x: x, x_train.shape[1])
# print trainDF.ix[:, 'v1':].shape[1]
# print testDF.ix[:,'':].shape[1]
# map(lambda x: x, trainDF.columns)
# map(lambda x: x, testDF.columns)
# print testDF.columns
# map(lambda x: print("`%2d. %-*s %f" % (x + 1, 30, labels[indices[x]], importances[indices[x]])), x_train.shape[1])

# for f in range(x_train.shape[1]):
#     print("%2d. %-*s %f" % (f + 1, 30, labels[indices[f]], importances[indices[f]]))

140
139
Index([], dtype='object')




In [105]:
# Keep only the features whose forest importance is at least 0.01.
# SelectFromModel(prefit=True) reuses the already-fitted `forest`; it replaces
# the estimator-level `forest.transform(X, threshold=...)`, which has been
# removed from scikit-learn.
selector = SelectFromModel(forest, threshold=.01, prefit=True)
x_train = selector.transform(x_train)
x_test = selector.transform(x_test)



In [None]:
# Candidate classifiers. Only `rfc` is evaluated below.
# NOTE: the name `svm` rebinds the `svm` module imported from sklearn at the
# top of the notebook; it is kept here so existing references keep working.
svm = SVC(kernel='rbf', C=100.0, gamma=0.1, random_state=0)
lgr = LogisticRegression(C=1.0)
# Seeded like the other estimators so the curve is reproducible across runs.
rfc = RandomForestClassifier(n_estimators=1000, random_state=0)

# 10-fold cross-validated scores at ten training-set sizes (10% .. 100%).
train_sizes, train_scores, test_scores = learning_curve(
    estimator=rfc,
    X=x_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=-1)

# Aggregate across the folds (axis=1) for each training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

print(train_sizes)
print(train_mean)

plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
# plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
# plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.7, 1])
plt.show()
