In [183]:
#imports
import pandas as pd
import numpy as np
import csv

In [184]:
#read files
test1 = pd.read_csv('data/test.csv')
train1 = pd.read_csv('data/train.csv')

In [185]:
#clean data
def drop_cols(df):
    return df.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    
def fill_missing_age(df):
    fill_value = df.Age.median()
    df.loc[df.Age.isnull(),'Age'] = fill_value
    return df
    
def fill_missing_embarked(df):
    fill_value = df.Embarked.dropna().mode().values
    df.Embarked[df.Embarked.isnull()] = fill_value
    return df

def fill_missing_fare(df):
    fares = {1:0,2:0,3:0}
    for c in range(1,4):
        fares[c] = train.loc[train.Pclass == c, 'Fare'].median()
    for c in range(1,4):
        df.loc[df.Fare.isnull() & df.Pclass == c, 'Fare'] = fares[c]
    return df

def transform_sex(df):
    df.Sex = df.Sex.map( {'female': 0, 'male': 1} ).astype(int)
    return df

    
def transform_embarked(df):
    df['Emb_S'] = (df['Embarked']=='S')*1
    df['Emb_C'] = (df['Embarked']=='C')*1
    df = df.drop('Embarked', axis=1)
    return df

def transform_pclass(df):
    df['Pclass_1'] = (df['Pclass']== 1)*1
    df['Pclass_2'] = (df['Pclass']== 2)*1
    df = df.drop('Pclass',axis=1)
    return df


#check count of null
#train.isnull().sum()

#Pclass and Embarked > create binary categorical features
#age, sibsp, parch, fare > normalize
#sex > binary

In [186]:
def mynormalize(data, mean, std):
    z_data = (data-mean)/std
    norm_data = (z_data-np.min(z_data))/(np.max(z_data) - np.min(z_data))
    return norm_data

def mean_std_calculator(data):
    return np.mean(data), np.std(data)

def norm_train_test(feature):
    mean, std = mean_std_calculator(train[feature])
    train[feature] = mynormalize(train[feature], mean, std)
    test[feature] = mynormalize(test[feature], mean, std)
    # normalize test data from training mean and std

In [187]:
#perform data transforms
train = train1
test = test1


#drop 'Name', 'Ticket', 'Cabin'
train = drop_cols(train)
test = drop_cols(test)


#fill missing values
train = fill_missing_age(train)
test = fill_missing_age(test)

train = fill_missing_embarked(train)
test = fill_missing_fare(test)


# create binary discrete features
train = transform_embarked(train)
test = transform_embarked(test)

train = transform_pclass(train)
test = transform_pclass(test)

train = transform_sex(train)
test = transform_sex(test)


# normalize continous features
continous_features = np.array(['Age', 'SibSp', 'Parch', 'Fare'])
map(norm_train_test, continous_features)




A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[None, None, None, None]

In [188]:
#accuracy
def accuracy(y, y_hat):
    correct = np.sum(y == y_hat)
    total = len(y)
    return correct*100.0/total

In [189]:
#create output file
def create_output(ids, predictions, filename):
    with open(filename, 'wb') as output_file:
        csv_writer = csv.writer(output_file)
        csv_writer.writerow(['PassengerId','Survived'])
        csv_writer.writerows(zip(ids, predictions))

In [190]:
#train model and creat output
def train_model_and_predict(clf, outputfile):
    clf.fit(x_train, y_train)
    y_hat_train = clf.predict(x_train)
    train_acc = accuracy(y_train, y_hat_train)
    y_hat_test = clf.predict(test.ix[:,1:])
    create_output(test.PassengerId, y_hat_test, outputfile)
    return train_acc

In [191]:
#data
x_train = train.ix[:,2:]
x_test = test.ix[:,1:]

y_train = train.Survived

# model 1 > 0.73684

In [197]:
#model
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier(n_estimators=1000)

outputfile = 'data/output/output_1.csv'
train_model_and_predict(model1, outputfile)

97.979797979797979

# model 2 > 0.76555

In [201]:
from sklearn.ensemble import AdaBoostClassifier

model2 = AdaBoostClassifier(base_estimator=None, 
                            n_estimators=1000, 
                            learning_rate=0.1, 
                            algorithm='SAMME.R', 
                            random_state=23)

outputfile = 'data/output/output_5.csv'
train_model_and_predict(model2, outputfile)

82.491582491582491

# model 3 > 0.75598

In [199]:
from sklearn.naive_bayes import GaussianNB
model3 = GaussianNB()

outputfile = 'data/output/output_3.csv'
train_model_and_predict(model3, outputfile)

78.787878787878782

#scratch