In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

train_data_path = 'data/train.csv'
test_data_path = 'data/test.csv'

titanic_temp_data = pd.read_csv(train_data_path)
titanic_test_data = pd.read_csv(test_data_path)

titanic_train_data, titanic_val_data = train_test_split(
                                            titanic_temp_data, test_size = .2)



In [2]:
titanic_train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
635,636,1,2,"Davis, Miss. Mary",female,28.0,0,0,237668,13.0,,S
844,845,0,3,"Culumovic, Mr. Jeso",male,17.0,0,0,315090,8.6625,,S
778,779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
849,850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C
490,491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S


In [157]:
# keep only the necessary column values
features = ['Pclass','Sex','Age','SibSp','Parch','Fare']
labels = ['Survived']

train_set = titanic_train_data[features].reset_index(drop=True)
val_set = titanic_val_data[features].reset_index(drop=True)
test_set = titanic_test_data[features].reset_index(drop=True)

train_labels = titanic_train_data[labels].reset_index(drop=True)
val_labels = titanic_val_data[labels].reset_index(drop=True)

In [158]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler

## set(train_set['Sex)) = {male,female} need to conver to binary
lb_make = LabelEncoder()
train_set.loc[:,'Sex_bin'] = lb_make.fit_transform(train_set['Sex'])
val_set.loc[:,'Sex_bin'] = lb_make.fit_transform(val_set['Sex'])
test_set.loc[:,'Sex_bin'] = lb_make.fit_transform(test_set['Sex'])

## alternative approach using built in pandas functionality
# train_set['Sex_bin'] = train_set['Sex'].astype('category').cat.codes
# test_set['Sex_bin] = test_set['Sex].astype('category').cat.codes

In [159]:
## convert Pclass to be one-hot-encoded. 
lb_binarizer = LabelBinarizer()
Pclass_binarize_results_train = lb_binarizer.fit_transform(train_set['Pclass'])
Pclass_binarize_results_val = lb_binarizer.fit_transform(val_set['Pclass'])
Pclass_binarize_results_test = lb_binarizer.fit_transform(test_set['Pclass'])
Pclass_labels = ['class_{}'.format(c) for c in lb_binarizer.classes_]

Pclass_df_train = pd.DataFrame(Pclass_binarize_results_train, columns=Pclass_labels)
Pclass_df_val = pd.DataFrame(Pclass_binarize_results_val, columns=Pclass_labels)
Pclass_df_test = pd.DataFrame(Pclass_binarize_results_test, columns=Pclass_labels)


training_data = pd.concat([train_set,Pclass_df_train], axis=1)
val_data = pd.concat([val_set,Pclass_df_val], axis=1)
testing_data = pd.concat([test_set, Pclass_df_test], axis=1)

In [160]:
## fill in missing age data
def nan_conversion(df,column):
    mean_age = df[column].mean()
    median_age = df[column].median()
    std_age = df[column].std()
    num_miss = df[column].isnull().sum()
    
    # fill age by draws from a normal distribution defined by mean and std
    age_fill = np.random.normal(mean_age, std_age, num_miss)
    
    if np.isnan(df[column]).sum(axis= 0) > 0:
        df.loc[np.isnan(df[column]),column] = np.abs(age_fill)
        
    ## another option is to just fillna with median or mean values
#     df['Age'].fillna(median_age, inplace=True)
#     df['Age'].fillna(mean_age, inplace=True) 

    # there are other options such as drawing from uniform dist bounded by mean-std, mean+std
    return df

training_data = nan_conversion(training_data, 'Age')
val_data = nan_conversion(val_data, 'Age')
testing_data = nan_conversion(testing_data, 'Age')
testing_data = nan_conversion(testing_data, 'Fare')

In [161]:
# remove not need columns
columns_to_drop = set(['Pclass','Sex'])
if bool(columns_to_drop.intersection(set(training_data.columns))):
    training_data.drop(columns_to_drop, axis=1, inplace=True)
if bool(columns_to_drop.intersection(set(val_data.columns))):
    val_data.drop(columns_to_drop, axis=1, inplace=True)
if bool(columns_to_drop.intersection(set(testing_data.columns))):
    testing_data.drop(columns_to_drop, axis=1, inplace=True)

In [162]:
x_train = training_data.values
x_val = val_data.values
x_test = testing_data.values

y_train = train_labels.values
y_val = val_labels.values

In [163]:
## Through testing learned that testing has a missing value for fare. 
scaler = StandardScaler()
scaled_x_trian = scaler.fit_transform(x_train)
scaled_x_val = scaler.fit_transform(x_val)
scaled_x_test = scaler.fit_transform(x_test)

In [177]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score


In [180]:
y_pred = cross_val_predict(RandomForestClassifier(), x_train, y_train.reshape(712,), cv=3)

In [181]:
accuracy_score(y_train,y_pred)

0.8160112359550562