In [None]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

# Root directory of the Titanic data; the script reads train/ and dev-0/
# below and writes predictions next to the inputs.
data_path = "/home/students/s407535/Desktop/titanic"

# Training file: read with its own header row. Its first column is used as
# the target and the remaining columns as features (see the split below).
train_data_path = "{}/train/train.tsv".format(data_path)
train_data = pd.read_csv(train_data_path, delimiter='\t')

# The dev-0 files are read with explicit column names borrowed from the
# training frame: feature names for in.tsv, the target name for expected.tsv.
valid_X = pd.read_csv("{}/dev-0/in.tsv".format(data_path),
                      delimiter='\t',
                      names=train_data.columns[1:])
valid_y = pd.read_csv("{}/dev-0/expected.tsv".format(data_path),
                      delimiter='\t',
                      names=train_data.columns[:1])

# Mean of the target column (displayed when run as a notebook cell).
# `.ix` was removed in pandas 1.0; `.iloc` is the positional replacement.
train_data.iloc[:, 0].mean()

# Split target / features by position. The explicit copy gives train_X its
# own data, so the column assignments made later do not operate on a slice
# of train_data (which triggers SettingWithCopyWarning).
train_y = train_data.iloc[:, 0]
train_X = train_data.iloc[:, 1:].copy()

train_X.head()

# Integer codes for the two categorical columns handled here.
sex_dict = {'male': 0, 'female': 1}
emb_dict = {'S': 0, 'C': 1, 'Q': 2}

# PassengerId is not used as a feature.
train_X = train_X.drop('PassengerId', axis=1)
valid_X = valid_X.drop('PassengerId', axis=1)

for frame in (train_X, valid_X):
    frame['Sex'] = frame['Sex'].map(sex_dict)
    # Missing ports of embarkation are treated as 'S' before encoding.
    frame['Embarked'] = frame['Embarked'].fillna('S').map(emb_dict)

data = [train_X, valid_X]
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in data:
    # The title is the run of letters that follows a space and precedes a
    # dot in Name. Raw string: `\.` in a normal literal is an invalid escape
    # sequence and warns on current Python versions.
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Collapse the listed uncommon titles into a single 'Rare' bucket.
    dataset['Title'] = dataset['Title'].replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
         'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Rare')
    # Fold alternative spellings into the titles that have a code.
    dataset['Title'] = dataset['Title'].replace(
        {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Encode as integers; any title without a code (or no title) becomes 0.
    dataset['Title'] = dataset['Title'].map(titles)
    dataset['Title'] = dataset['Title'].fillna(0)

# The raw text columns are dropped once Title has been derived from Name.
train_X = train_X.drop(['Ticket', 'Name'], axis=1)
valid_X = valid_X.drop(['Ticket', 'Name'], axis=1)

import re  # kept: no longer needed by this block, but other code may rely on it
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [train_X, valid_X]

for dataset in data:
    # Missing cabins get the placeholder "U0", which maps to deck code 8.
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    # Deck = first run of letters in the cabin string. The vectorised
    # extract avoids compiling the pattern once per row and yields NaN
    # (instead of raising AttributeError) when a cabin has no letters.
    dataset['Deck'] = dataset['Cabin'].str.extract(r"([a-zA-Z]+)", expand=False)
    # Letters without a code, and cabins without letters, become 0.
    dataset['Deck'] = dataset['Deck'].map(deck).fillna(0).astype(int)

train_X = train_X.drop('Cabin', axis=1)
valid_X = valid_X.drop('Cabin', axis=1)

# Impute missing ages with the mean learned from the training set only.
# The validation set previously used its own mean, so the two frames were
# preprocessed with different statistics and the evaluation data leaked
# into its own preprocessing.
train_age_mean = train_X['Age'].mean()
train_X['Age'] = train_X['Age'].fillna(train_age_mean)
valid_X['Age'] = valid_X['Age'].fillna(train_age_mean)

from sklearn.metrics import accuracy_score

# Three baseline classifiers, all with default hyperparameters.
model1 = LogisticRegression()
model2 = KNeighborsClassifier()
model3 = DecisionTreeClassifier()

# Fit each model on the training set and print its validation accuracy,
# in the order: logistic regression, k-nearest neighbours, decision tree.
baseline_predictions = []
for estimator in (model1, model2, model3):
    estimator.fit(train_X, train_y)
    predicted = estimator.predict(valid_X)
    print(accuracy_score(valid_y, predicted))
    baseline_predictions.append(predicted)

valid_predicted_1, valid_predicted_2, valid_predicted_3 = baseline_predictions

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(train_X, train_y)

valid_predicted = random_forest.predict(valid_X)
print(accuracy_score(valid_y, valid_predicted))


def _prepare_test_features(raw):
    """Encode a raw feature frame with the steps applied to train_X.

    Uses the module-level code tables (sex_dict, emb_dict, titles, deck)
    and returns only the columns of train_X, in the same order, so the
    result can be passed to a model fitted on train_X.
    """
    features = raw.copy()
    features['Sex'] = features['Sex'].map(sex_dict)
    features['Embarked'] = features['Embarked'].fillna('S').map(emb_dict)

    title = features['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    title = title.replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
         'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Rare')
    title = title.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    features['Title'] = title.map(titles).fillna(0)

    letters = features['Cabin'].fillna("U0").str.extract(r"([a-zA-Z]+)",
                                                         expand=False)
    features['Deck'] = letters.map(deck).fillna(0).astype(int)

    # Ages are imputed with the training mean (mean-filling train_X does
    # not change its mean, so this works before or after that step).
    features['Age'] = features['Age'].fillna(train_X['Age'].mean())

    # Selecting by train_X.columns also discards the raw PassengerId,
    # Name, Ticket and Cabin columns.
    return features[train_X.columns]


# test_X was never defined anywhere in this script, so the predict call
# below raised NameError. NOTE(review): assumes test-A/in.tsv exists and has
# the same header-less layout as dev-0/in.tsv -- confirm.
test_X = _prepare_test_features(
    pd.read_csv("{}/test-A/in.tsv".format(data_path),
                delimiter='\t',
                names=train_data.columns[1:]))
test_predicted = random_forest.predict(test_X)

# header=False is explicit because Series.to_csv writes a header line by
# default on pandas >= 1.0, which would add a spurious first row.
pd.Series(valid_predicted.flatten()).to_csv("{}/dev-0/out.tsv".format(data_path),
                                            sep='\t',
                                            index=False,
                                            header=False)

pd.Series(test_predicted.flatten()).to_csv("{}/test-A/out.tsv".format(data_path),
                                           sep='\t',
                                           index=False,
                                           header=False)