In [88]:
import csv as csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  
from sklearn.preprocessing import MinMaxScaler
import scipy.optimize as opt  
from sklearn import metrics, linear_model, tree, ensemble


In [3]:
# Reading using numpy arrays

data=[] 
csv_file_object = csv.reader(open('./data/train.csv', 'r')) 
for row in csv_file_object:
    data.append(row)
data = np.array(data) 



In [4]:
# Reading using pandas

df = pd.read_csv('./data/train.csv', header=0)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [26]:
df['Gender'] = df['Sex'].map({'male': 1, 'female': 0}).astype(int)

In [27]:
median_ages = np.zeros((2,3))
for i in range(0, 2):
    for j in range(0, 3):
        median_ages[i,j] = df[(df['Gender'] == i) & \
                              (df['Pclass'] == j+1)]['Age'].dropna().median()
median_ages

array([[ 35. ,  28. ,  21.5],
       [ 40. ,  30. ,  25. ]])

In [28]:
df['AgeFill'] = df['Age']

In [29]:
df[ df['Age'].isnull() ][['Gender','Pclass','Age','AgeFill']].head()

Unnamed: 0,Gender,Pclass,Age,AgeFill
5,1,3,,
17,1,2,,
19,0,3,,
26,1,3,,
28,0,3,,


In [30]:
for i in range(0, 2):
    for j in range(0, 3):
        df.loc[ (df.Age.isnull()) & (df.Gender == i) & (df.Pclass == j+1),\
                'AgeFill'] = median_ages[i,j]
df[ df['Age'].isnull() ][['Gender','Pclass','Age','AgeFill']].head()

Unnamed: 0,Gender,Pclass,Age,AgeFill
5,1,3,,25.0
17,1,2,,30.0
19,0,3,,21.5
26,1,3,,25.0
28,0,3,,21.5


In [31]:
df['AgeIsNull'] = pd.isnull(df.Age).astype(int)

# Feature Engineering
df['FamilySize'] = df['SibSp'] + df['Parch']

# As we know Pclass made a lot of effect on survival
df['Age*Class'] = df.AgeFill * df.Pclass

#### Setting null embarked values to S as it is the most common occurence

In [33]:
df['EmbarkedIsNull'] = pd.isnull(df.Embarked).astype(int)

In [34]:
df.loc[df.Embarked.isnull(), 'Embarked'] = 'S'

In [35]:
df['EmbarkedInt'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [36]:
df.groupby('Survived').size()

Survived
0    549
1    342
dtype: int64

### Trying to recreate logistic regression as discussed in Andrew Ng's lecture

In [37]:
positive = df[df['Survived'].astype(bool)]
negative = df[~df['Survived'].astype(bool)]

In [38]:
def extract_variables(dFrame):
    X = np.array(dFrame.drop(['Survived'], axis=1).values)
    y = np.array(dFrame.Survived.values)
    return X, y

def scale(d):
    dFrame = d.copy(deep=True)
    scaler = MinMaxScaler()
    dFrame[['Pclass', 'Fare', 'AgeFill', 'Age*Class', 'SibSp', 'Parch', 'FamilySize', 'EmbarkedInt']] = scaler.fit_transform(dFrame[['Pclass', 'Fare', 'AgeFill', 'Age*Class', 'SibSp', 'Parch', 'FamilySize', 'EmbarkedInt']])
    return dFrame
    
def transform(d):
    dFrame = d.copy(deep=True)
    # Feature Engineering
    
    # Replacing Gender with Sex( binary mapping)
    dFrame['Gender'] = dFrame['Sex'].map({'male': 1, 'female': 0}).astype(int)
    ## calculating median of age and replacing all nulls
    dFrame['AgeFill'] = dFrame['Age'] 
    ## Storing where age is null
    dFrame['AgeIsNull'] = pd.isnull(dFrame.Age).astype(int)
    median_ages = np.zeros((2,3))
    for i in range(0, 2):
        for j in range(0, 3):
            median_ages[i,j] = dFrame[(dFrame['Gender'] == i) & \
                              (dFrame['Pclass'] == j+1)]['Age'].dropna().median()
    for i in range(0, 2):
        for j in range(0, 3):
            dFrame.loc[ (dFrame.Age.isnull()) & (dFrame.Gender == i) & (dFrame.Pclass == j+1),\
                'AgeFill'] = median_ages[i,j]

    # Additional features which me be predictive
    dFrame['FamilySize'] = dFrame['SibSp'] + dFrame['Parch']

    # As we know Pclass made a lot of effect on survival
    dFrame['Age*Class'] = dFrame.AgeFill * dFrame.Pclass
    
    dFrame['EmbarkedIsNull'] = pd.isnull(dFrame.Embarked).astype(int)
    dFrame.loc[dFrame.Embarked.isnull(), 'Embarked'] = 'S'
    dFrame['EmbarkedInt'] = dFrame['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    dFrame = dFrame.drop(['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', "Age"], axis=1) 
    # Adding 0th column
    dFrame.insert(0, 'Ones', 1)
    return dFrame


### Train Data set and calculate accuracy

In [110]:
initialSet = pd.read_csv('./data/train.csv', header=0)
initialSet = transform(initialSet)
initialSet = scale(initialSet)

trainFrame = trainFrame[:791]
X,y  =  extract_variables(initialSet)
Xtrain,ytrain  =  extract_variables(trainFrame)
testFrame = initialSet[100:]
X_test,y_test =  extract_variables(testFrame)

In [111]:
def trainAndTest(model):
    model.fit(Xtrain, ytrain)
    predictions = model.predict(Xtrain)  
    name = type(model).__name__
    print(name)
    print("Train Accuracy {x}".format(x=metrics.accuracy_score(predictions, ytrain)))
    predictions_test = model.predict(X_test)  

    accuracy = metrics.accuracy_score(predictions_test, y_test)
    print("Test Accuracy {x}".format(x=accuracy))
    return {'accuracy': accuracy, 'model': model, 'name': name}


In [112]:
classifiers = [
    linear_model.LogisticRegressionCV(),
    tree.DecisionTreeClassifier(),
    ensemble.RandomForestClassifier(n_estimators=100, max_depth=100)
]
accuracy = 0
model = None
for classifier in classifiers: 
    result = trainAndTest(classifier)
    if(result['accuracy'] > accuracy):
        accuracy = result['accuracy']
        model = result['model']

LogisticRegressionCV
Train Accuracy 0.8040455120101138
Test Accuracy 0.8053097345132744
DecisionTreeClassifier
Train Accuracy 0.9823008849557522
Test Accuracy 0.9608091024020228
RandomForestClassifier
Train Accuracy 0.9823008849557522
Test Accuracy 0.9646017699115044


In [113]:
model = ensemble.RandomForestClassifier(n_estimators=100, max_depth=100)
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [114]:
init_final_frame = pd.read_csv('./data/test.csv', header=0)
finalFrame = init_final_frame.copy(deep=True)    
finalFrame.ix[152,'Fare'] = finalFrame.Fare.median()
finalFrame = transform(finalFrame)
finalFrame = scale(finalFrame)
Xfinal = np.array(finalFrame.values)
yPredicted = model.predict(Xfinal) 

In [115]:
output =  pd.DataFrame({'PassengerId': init_final_frame.PassengerId, 'Survived':yPredicted})

In [116]:
output.groupby('Survived').size()

Survived
0    286
1    132
dtype: int64

In [117]:
output.to_csv('./data/titanic_{0}_{1}.csv'.format(type(model).__name__, accuracy), index=False)