## Cross Validation

In [1]:
import os
import pandas as pd
import numpy as np
import pylab as pl

from time import time
from IPython.core.display import Image

# Scikit-learn:
# # Model:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# # PCA:
from sklearn.decomposition import PCA, RandomizedPCA

# # Metrics:
from sklearn.cross_validation import cross_val_score, train_test_split, KFold
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile, f_classif

In [2]:
df = pd.read_csv('C:/Vindico/Projects/Data/Kaggle/Competition/otto group/train.csv')
df.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [3]:
df.target.unique()
df['target'] = df['target'].map({'Class_1': 0,'Class_2': 1, 'Class_3': 2, 'Class_4': 3, 'Class_5': 4
                                , 'Class_6': 5, 'Class_7': 6, 'Class_8': 7, 'Class_9': 8}).astype(int)

In [4]:
# df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# age_mean = df['Age'].mean()
# df['Age'] = df['Age'].fillna(age_mean)

# from scipy.stats import mode

# mode_embarked = mode(df['Embarked'])[0][0]
# df['Embarked'] = df['Embarked'].fillna(mode_embarked)

# df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# pd.get_dummies(df['Embarked'], prefix='Embarked').head(10)
# df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)

# df = df.drop(['Sex', 'Embarked'], axis=1)

cols = df.columns.tolist()
cols = [cols[-1]] + cols[0:1] + cols[1:-1]
df = df[cols]

In [5]:
train_data = df.values[:30000]
train_data

array([[    0,     1,     1, ...,     0,     0,     0],
       [    0,     2,     0, ...,     0,     0,     0],
       [    0,     3,     0, ...,     0,     0,     0],
       ..., 
       [    4, 29998,     0, ...,     0,     1,     0],
       [    4, 29999,     0, ...,     0,     1,     0],
       [    4, 30000,     0, ...,     0,     0,     0]], dtype=int64)

In [6]:
X = train_data[:, 2:]
y = train_data[:, 0]

In [7]:
X = X.astype(float)
X.dtype


dtype('float64')

In [8]:
print np.amin(X)
print np.amax(X)

0.0
263.0


In [9]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)


In [10]:
print np.amin(X)
print np.amax(X)

0.0
1.0


In [11]:
selector = SelectPercentile(f_classif, percentile=10)
X = selector.fit_transform(X, y)

In [12]:
X.shape

(30000L, 10L)

In [13]:
def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=10,shuffle=True)
    y_pred = y.copy()
    
    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

In [14]:
print "Support vector machines:"
print "%.3f" % accuracy(y, run_cv(X,y,SVC))

print "Logistic Regression:"
print "%.3f" % accuracy(y, run_cv(X,y,LogisticRegression))

print "Gradient Boosting Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,GradientBoostingClassifier))

print "Bagging Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,BaggingClassifier))

print "Extra Trees Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,ExtraTreesClassifier))

print "Decision Tree Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,DecisionTreeClassifier))

print "K-Neighbors Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,KNeighborsClassifier))

print "Random Forest Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,RandomForestClassifier))

print "AdaBoost Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,AdaBoostClassifier))

Support vector machines:
0.585
Logistic Regression:
0.594
Gradient Boosting Classifier:
0.633
Bagging Classifier:
0.621
Extra Trees Classifier:
0.622
Decision Tree Classifier:
0.614
K-Neighbors Classifier:
0.588
Random Forest Classifier:
0.622
AdaBoost Classifier:
0.608


In [15]:
cv = KFold(n=len(train_data), n_folds=10)

for training_set, test_set in cv:
    X_train = X[training_set]
    y_train = y[training_set]
    X_test = X[test_set]
    y_test = y[test_set]
    model = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    model.fit(X_train, y_train)
    y_prediction = model.predict(X_test)
    print "prediction accuracy:", np.sum(y_test == y_prediction)*1./len(y_test)
print np.mean(y_test == y_prediction)

prediction accuracy: 0.309333333333
prediction accuracy: 0.739666666667
prediction accuracy: 0.728666666667
prediction accuracy: 0.734666666667
prediction accuracy: 0.734333333333
prediction accuracy: 0.735666666667
prediction accuracy: 0.126666666667
prediction accuracy: 0.113
prediction accuracy: 0.176666666667
prediction accuracy: 0.058
0.058
