In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder

train_data_path = 'data/train.csv'
test_data_path = 'data/test.csv'

titanic_train_data = pd.read_csv(train_data_path)
titanic_test_data = pd.read_csv(test_data_path)

In [3]:
titanic_train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# keep only the necessary column values
features = ['Pclass','Sex','Age','SibSp','Parch','Fare']
labels = ['Survived']
train_set = titanic_train_data[features]
test_set = titanic_test_data[features]

In [6]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

## set(train_set['Sex)) = {male,female} need to conver to binary
lb_make = LabelEncoder()
train_set.loc[:,'Sex_bin'] = lb_make.fit_transform(train_set['Sex'])
test_set.loc[:,'Sex_bin'] = lb_make.fit_transform(test_set['Sex'])

## alternative approach using built in pandas functionality
# train_set['Sex_bin'] = train_set['Sex'].astype('category').cat.codes
# test_set['Sex_bin] = test_set['Sex].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [7]:
## convert Pclass to be one-hot-encoded. 
lb_binarizer = LabelBinarizer()
Pclass_binarize_results_train = lb_binarizer.fit_transform(train_set['Pclass'])
Pclass_binarize_results_test = lb_binarizer.fit_transform(test_set['Pclass'])
Pclass_labels = ['class_{}'.format(c) for c in lb_binarizer.classes_]

Pclass_df_train = pd.DataFrame(Pclass_binarize_results_train, columns= Pclass_labels)
Pclass_df_test = pd.DataFrame(Pclass_binarize_results_test, columns= Pclass_labels)

training_data = pd.concat([train_set,Pclass_df_train], axis= 1)
testing_data = pd.concat([test_set, Pclass_df_test], axis = 1)

In [8]:
## fill in missing age data
def age_conversion(df):
    mean_age = df['Age'].mean()
    std_age = df['Age'].std()
    num_miss = df['Age'].isnull().sum()
    
    age_fill = np.random.normal(mean_age, std_age, num_miss)
    
    if np.isnan(df['Age']).sum(axis= 0) > 0:
        df.loc[np.isnan(df['Age']),'Age'] = np.abs(age_fill)
        
    return df, age_fill

training_data, xx = age_conversion(training_data)
# testing_data = age_conversion(training_data)

In [9]:
# remove not need columns
columns_to_drop = ['Pclass','Sex']
training_data.drop(columns_to_drop, axis= 1, inplace= True)
testing_data.drop(columns_to_drop, axis= 1, inplace= True)