## Cross Validation

In [1]:
import os
import pandas as pd
import numpy as np
import pylab as pl

from time import time
from IPython.core.display import Image

# Scikit-learn:
# # Model:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# # PCA:
from sklearn.decomposition import PCA, RandomizedPCA

# # Metrics:
from sklearn.cross_validation import cross_val_score, train_test_split, KFold
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile, f_classif

In [2]:
df = pd.read_csv('C:/Vindico/Projects/Data/Kaggle/Competition/Titanic/train.csv')

df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean)

from scipy.stats import mode

mode_embarked = mode(df['Embarked'])[0][0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

pd.get_dummies(df['Embarked'], prefix='Embarked').head(10)
df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)

df = df.drop(['Sex', 'Embarked'], axis=1)

cols = df.columns.tolist()
cols = [cols[1]] + cols[0:1] + cols[2:]

df = df[cols]

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [3]:
train_data = df.values[:]

X = train_data[:, 2:]
y = train_data[:, 0]

In [4]:
def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=10,shuffle=True)
    y_pred = y.copy()
    
    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

In [6]:
print "Support vector machines:"
print "%.3f" % accuracy(y, run_cv(X,y,SVC))

print "Logistic Regression:"
print "%.3f" % accuracy(y, run_cv(X,y,LogisticRegression))

print "Gradient Boosting Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,GradientBoostingClassifier))

print "Bagging Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,BaggingClassifier))

print "Extra Trees Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,ExtraTreesClassifier))

print "Decision Tree Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,DecisionTreeClassifier))

print "K-Neighbors Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,KNeighborsClassifier))

print "Random Forest Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,RandomForestClassifier))

print "AdaBoost Classifier:"
print "%.3f" % accuracy(y, run_cv(X,y,AdaBoostClassifier))

Support vector machines:
0.709
Logistic Regression:
0.797
Gradient Boosting Classifier:
0.825
Bagging Classifier:
0.802
Extra Trees Classifier:
0.797
Decision Tree Classifier:
0.782
K-Neighbors Classifier:
0.701
Random Forest Classifier:
0.811
AdaBoost Classifier:
0.811


In [7]:
cv = KFold(n=len(train_data), n_folds=10)

for training_set, test_set in cv:
    X_train = X[training_set]
    y_train = y[training_set]
    X_test = X[test_set]
    y_test = y[test_set]
    model = GradientBoostingClassifier(min_samples_split=2)
    model.fit(X_train, y_train)
    y_prediction = model.predict(X_test)
    print "prediction accuracy:", np.sum(y_test == y_prediction)*1./len(y_test)
print np.mean(y_test == y_prediction)

prediction accuracy: 0.788888888889
prediction accuracy: 0.808988764045
prediction accuracy: 0.797752808989
prediction accuracy: 0.831460674157
prediction accuracy: 0.85393258427
prediction accuracy: 0.831460674157
prediction accuracy: 0.820224719101
prediction accuracy: 0.775280898876
prediction accuracy: 0.887640449438
prediction accuracy: 0.865168539326
0.865168539326
