In [2]:
# data manipulation
import pandas as pd
import numpy as np
import re

# plotting
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
from ggplot import *

# machine learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn import cross_validation

#### Missing Age Values (Linear Regression)

In [46]:
# Load the training set from the working directory. Later cells read its
# Age, Pclass, SibSp, Parch, Fare, Sex and Survived columns.
train = pd.read_csv('train.csv')

In [41]:
# Fit a linear regression that predicts Age from the other passenger
# features, using only the rows where Age is known.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0.
age_rows = train.loc[train['Age'].notnull(),
                     ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Sex']].copy()

# Encode Sex with an explicit mapping instead of factorize(): factorize()
# numbers values by order of first appearance, so its codes depend on row
# order. preprocess_df encodes Sex before calling age_model.predict, and both
# places must agree on the codes.
# NOTE(review): this assumes Sex only takes the values 'male' / 'female'.
age_rows['Sex'] = age_rows['Sex'].map({'male': 0, 'female': 1})

# Fixed random_state so the split (and the score below) is reproducible.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# on newer versions train_test_split lives in sklearn.model_selection.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    age_rows.drop('Age', axis=1), age_rows['Age'],
    test_size=0.2, random_state=42)

age_model = LinearRegression()
age_model.fit(X_train, y_train)

# This is the R^2 on the *training* split, not on the held-out 20%.
print('score: %s' % age_model.score(X_train, y_train))

# Side-by-side view of true vs. predicted ages on the held-out rows.
prediction = pd.DataFrame({
        'actual_age': y_test,
        'predicted_age': age_model.predict(X_test)
})

score: 0.243062591385


In [60]:
# Mean fare for every (Pclass, Sex) combination in the training data;
# get_fare uses it as the fallback for a missing Fare.
class_sex_fare = train['Fare'].groupby([train['Pclass'], train['Sex']]).mean()
def get_fare(row):
    """Return the row's fare, falling back to the group mean when it is missing.

    `row` must unpack into exactly three values: passenger class, sex and
    fare, in that order. A null fare is replaced by the entry of the
    module-level `class_sex_fare` series for that (class, sex) pair.
    """
    passenger_class, passenger_sex, paid = row
    if not pd.isnull(paid):
        return paid
    return class_sex_fare.loc[(passenger_class, passenger_sex)]

def preprocess_df(df):
    """Prepares dataset for prediction

    Reads the columns Pclass, Sex, Fare, Age, SibSp and Parch from `df` and
    returns a new frame with the columns
    ['Pclass', 'FareFill', 'AgeFill', 'AgeIsNull', 'female', 'male'].

    * FareFill  - Fare, with missing values filled by `get_fare`.
    * AgeFill   - Age, with missing values predicted by the module-level
                  `age_model`.
    * AgeIsNull - 1 where Age was missing in the input, else 0.
    * female / male - indicator columns derived from Sex.

    The input frame is not modified.
    """
    # Work on a copy so the caller's frame is not mutated (the input is often
    # a slice produced by train_test_split).
    df = df.copy()

    # fill missing values
    df['SexFill'] = df['Sex']
    df['FareFill'] = df[['Pclass', 'SexFill', 'Fare']].apply(get_fare, axis=1)
    df['AgeIsNull'] = df['Age'].isnull().astype(int)
    df['AgeFill'] = df['Age']
    age_missing = df['AgeFill'].isnull()
    if age_missing.any():
        # Use the filled fare so a row missing both Age and Fare does not feed
        # NaN into the regression; rename it back to 'Fare' so the feature
        # names and order match what the age-model cell selects for fitting.
        age_data = df.loc[age_missing, ['Pclass', 'SibSp', 'Parch', 'FareFill', 'Sex']]
        age_data = age_data.rename(columns={'FareFill': 'Fare'})
        # Fixed encoding instead of factorize(): factorize() numbers values by
        # order of first appearance, so on this subset the codes could be
        # flipped relative to the ones the model saw.
        # NOTE(review): age_model must have been fitted with male -> 0,
        # female -> 1 - confirm against the fitting cell.
        age_data['Sex'] = age_data['Sex'].map({'male': 0, 'female': 1})
        df.loc[age_missing, 'AgeFill'] = age_model.predict(age_data)

    # categorical vars
    # Pin the indicator columns so the output layout does not depend on which
    # sexes happen to be present in this particular frame.
    dummies_test_sex = pd.get_dummies(df['SexFill']).reindex(
        columns=['female', 'male'], fill_value=0)

    # remove unnecessary columns
    return df[['Pclass', 'FareFill', 'AgeFill', 'AgeIsNull']].join(dummies_test_sex)
# fit
# Hold out 20% of the training data for validation. The positional slice
# drops the first column of `train`; `.iloc` replaces `.ix`, which was
# removed in pandas 1.0.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train.iloc[:, 1:], train['Survived'],
                                                                     test_size=0.2, random_state=42)
X_train = preprocess_df(X_train)
# Seed the forest so the validation score and feature importances are
# reproducible from run to run.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# validate
X_test = preprocess_df(X_test)
# Mean accuracy on the held-out split (displayed as the cell output).
model.score(X_test, y_test)

0.82122905027932958

In [48]:
# Rank the model's input columns from most to least important.
sorted(
    zip(X_train.columns, model.feature_importances_),
    key=lambda name_and_weight: name_and_weight[1],
    reverse=True,
)

[('FareFill', 0.27248105734936906),
 ('AgeFill', 0.26513912098743331),
 ('male', 0.24740697231636158),
 ('Pclass', 0.10879424091607097),
 ('female', 0.088458591460187461),
 ('AgeIsNull', 0.017720016970577689)]

#### Predict

In [61]:
# Score the test set with the fitted forest and write the results to
# predictions.csv (PassengerId and Survived columns only).
test = pd.read_csv('test.csv')
test_data = preprocess_df(test)
predictions = model.predict(test_data)
test['Survived'] = predictions
test.to_csv('predictions.csv', columns=['PassengerId', 'Survived'], index=False)