# Prediction using multiple models and cross-validation

Inspired by the following blogpost: http://www.bittenbypython.com/machine-learning-kaggle-titanic.html

In [36]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer, LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

## Getting the data

We split the data into two:
1. The data (our X) without the outcome
2. The outcome (also known as "labels", or Y)

In [37]:
# Load the raw training set from the working directory.
df = pd.read_csv('train.csv')
# Keep the outcome column aside; to_binstd() drops it from the feature frame.
labels = df["Survived"]

The data (X) is transformed with the following operations:
- remove what we think are unnecessary dimensions
- replace categorical data with binary data
- fill in values where some are missing (NaN)
- normalize the data

It is useful to have a function to do this, as the test data will be passed through the same operations.

In [38]:
def to_binstd(df):
    """Turn a raw passenger frame into a numeric, standardized feature frame.

    Steps, in order:

    1. drop the identifier / free-text columns (and "Survived" when present),
    2. one-hot encode "Pclass", "Sex" and "Embarked",
    3. replace NaNs with the per-column median,
    4. standardize every column with ``StandardScaler``.

    The imputer and the scaler are created and fitted on ``df`` on every
    call, so each frame is transformed with its own statistics.

    Args:
        df: DataFrame containing at least the columns named above. The
            frame passed in is not modified.

    Returns:
        A new DataFrame with one column per remaining feature or one-hot
        indicator, carrying the same index as the input.
    """
    # "Survived" is dropped only when the frame has it, so frames without
    # the outcome column can be passed as well.
    unused = ["PassengerId", "Name", "Ticket", "Cabin"]
    if "Survived" in df.columns:
        unused.append("Survived")

    # Convert categorical data to binary (one-hot) columns
    features = pd.get_dummies(
        df.drop(unused, axis=1),
        columns=["Pclass", "Sex", "Embarked"],
    )

    # Fix NaNs
    # NOTE(review): `Imputer` no longer exists in recent scikit-learn
    # releases (replaced by `sklearn.impute.SimpleImputer`) - confirm the
    # installed version before upgrading.
    imputed = Imputer(strategy="median").fit_transform(features)

    # Normalize data
    scaled = StandardScaler().fit_transform(imputed)

    # The transformers return bare arrays: restore the column names and the
    # original index so the result still lines up with the caller's rows.
    return pd.DataFrame(scaled, columns=features.columns, index=features.index)

# From here on `df` is the cleaned, standardized training feature frame (raw frame is replaced).
df = to_binstd(df)

## Cross-validation

We will use cross-validation to get an idea of how well each model represents the data.

In [39]:
def crossval(model, X, Y):
    """Fit ``model`` on all of (X, Y) and print its 10-fold cross-validated RMSE.

    The model passed in is left fitted on the complete data set; the
    prediction code further down relies on that.

    Args:
        model: estimator exposing ``fit`` and usable with ``cross_val_score``.
        X: feature frame.
        Y: target values matching the rows of ``X``.

    Returns:
        None. The mean and standard deviation of the per-fold RMSE are
        printed.
    """
    model.fit(X, Y)

    # TODO check other scoring methods
    neg_mse_per_fold = cross_val_score(
        model,
        X,
        Y,
        cv=10,
        scoring="neg_mean_squared_error",
    )

    # The scores are negated MSE values: flip the sign before taking the root.
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    average = rmse_per_fold.mean()
    spread = rmse_per_fold.std()

    report = "mean: {}, std: {}"
    print(report.format(average, spread))

## SVM

Create the SVM model with the default scikit-learn parameters, then fit it to the data.

In [40]:
# Support-vector classifier built with no arguments, i.e. the library defaults.
cls_svm = svm.SVC()
# crossval() fits cls_svm on the whole training set and prints the cross-validated RMSE.
crossval(cls_svm, df, labels)

mean: 0.4142385677288476, std: 0.034919479912234534


## Linear Regression

In [41]:
# Linear regression built with no arguments, i.e. the library defaults.
linreg = LinearRegression()

# crossval() fits linreg on the whole training set and prints the cross-validated RMSE.
crossval(linreg, df, labels)

mean: 0.3823559603081141, std: 0.0196987452041817


## Predictions on the test data

Now that our models have learned from the training data, we can ask them to make predictions about new data.

In [42]:
def write_predic(filepath, df, pred):
    """Write predictions to a two-column ``PassengerId,Survived`` CSV file.

    Args:
        filepath: destination path; an existing file is overwritten.
        df: object exposing an iterable ``PassengerId`` attribute.
        pred: iterable of predicted values, paired positionally with
            ``df.PassengerId``. Values below 0.5 are written as 0, all
            others as 1.
    """
    with open(filepath, "w") as out:
        out.write("PassengerId,Survived\n")
        out.writelines(
            # `not score < 0.5` is kept (rather than `score >= 0.5`) so the
            # label is 1 for anything that is not strictly below 0.5.
            f"{passenger_id},{int(not score < 0.5)}\n"
            for passenger_id, score in zip(df.PassengerId, pred)
        )
            
# Keep the raw test frame: write_predic() needs PassengerId, which to_binstd() drops.
dftest_init = pd.read_csv("test.csv")
# NOTE(review): to_binstd() fits a fresh imputer and scaler on whatever frame it receives, so the
# test features are normalized with test-set statistics rather than the training ones - confirm
# this is intended.
dftest = to_binstd(dftest_init)

# Both models were already fitted on the full training set inside crossval().
pred_svm = cls_svm.predict(dftest)
write_predic("/tmp/svm_predict.csv", dftest_init, pred_svm)

# write_predic() thresholds the regression outputs at 0.5 to obtain 0/1 labels.
pred_linreg = linreg.predict(dftest)
write_predic("/tmp/linreg_predict.csv", dftest_init, pred_linreg)