# 1. Import Libraries

In [None]:
# Imports
import pandas as pd
from pandas import Series, DataFrame

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# ignore Deprecation Warning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# 2. Data Preprocessing
Look at each attribute. Missing values will be filled. Categorical features will be transformed to numerical values.

In [None]:
# Load the train and test CSVs and stack them into one frame so every
# preprocessing step below is applied to both splits at once.
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
# that works on both old and new pandas versions.
df = pd.concat([train_df, test_df], ignore_index=True)
# Some quick inspections
print(train_df.shape, test_df.shape, train_df.columns.values)

In [None]:
#train_df.head()

In [None]:
#train_df.info()
#print('--------------------------')
#test_df.info()

In [None]:
#train_df.describe()

In [None]:
# Number of missing entries in each column of the combined frame
df.isnull().sum()

Only "Age", "Cabin", "Embarked", "Fare", and "Survived" have null values.

In [None]:
#df[df.isnull().any(axis=1)]

### Age

In [None]:
# How many passengers have no recorded age
df['Age'].isnull().sum()

There are 263 missing values in Age. These could be inferred from other features, e.g., Title, Fare, SibSp, Parch, etc. I will come back to this after inspecting the other features.

In [None]:
# Mean, standard deviation, and number of missing values of Age in the
# combined train+test frame `df`
averge_age_df = df['Age'].mean()
std_age_df = df['Age'].std()
count_nan_age_df = df['Age'].isnull().sum()

# Draw one integer age per missing entry from the range (mean - std, mean + std).
# A dedicated seeded RandomState makes the imputed ages identical on every run
# without altering NumPy's global random state. The bounds are truncated to int
# explicitly rather than relying on how randint treats float arguments.
age_rng = np.random.RandomState(42)
rand_1 = age_rng.randint(
    int(averge_age_df - std_age_df),
    int(averge_age_df + std_age_df),
    size=count_nan_age_df,
)

In [None]:
# Side-by-side histograms of Age before and after imputing the missing values
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic')

# plot original Age values
# NOTE: drop all null values, and convert to int
df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# Fill the missing ages with the pre-generated random values.
# A single .loc assignment writes into `df` itself; the previous chained form
# (df['Age'][mask] = ...) may write to a temporary copy and is flagged by
# pandas with SettingWithCopyWarning.
missing_age = df['Age'].isnull()
df.loc[missing_age, 'Age'] = rand_1

# convert from float to int
df['Age'] = df['Age'].astype(int)
# plot new Age values
df['Age'].hist(bins=70, ax=axis2)

In [None]:
# Age distribution drawn separately for each value of Survived
age_grid = sns.FacetGrid(df, hue='Survived', aspect=4)
age_grid.map(sns.distplot, 'Age')
age_grid.set(xlim=(0, df['Age'].max()))
age_grid.add_legend()

# Mean of Survived for every distinct age, shown as a bar chart
fig, axis1 = plt.subplots(figsize=(18, 4))
average_age = df.groupby('Age', as_index=False)[['Survived']].mean()
sns.barplot(x='Age', y='Survived', data=average_age)

### Cabin

In [None]:
# Count the passengers with no Cabin recorded
df['Cabin'].isnull().sum()

This is highly incomplete. There are two choices: (1) map the missing ones to a new category "unknown", or (2) drop this feature.

In [None]:
# Cabin is mostly missing, so remove the column from the working frame
df = df.drop('Cabin', axis=1)

### Embarked

In [None]:
# Count the passengers with no port of embarkation recorded
df['Embarked'].isnull().sum()

In [None]:
# Passengers per embarkation port
df.Embarked.value_counts()

In [None]:
# Fill the missing embarkation ports with 'S'
df['Embarked'] = df['Embarked'].fillna('S')

In [None]:
# Survival rate per embarkation port
sns.factorplot(x='Embarked', y='Survived', data=df, size=2, aspect=2)

# Three panels: passengers per port, survivors/non-survivors split by port,
# and mean survival per port
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 2))
sns.countplot(x='Embarked', data=df, ax=axis1)
sns.countplot(x='Survived', hue='Embarked', data=df, order=[1, 0], ax=axis2)

# Mean of Survived within each Embarked group
embark_perc = df.groupby('Embarked', as_index=False)[['Survived']].mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc, order=['S', 'C', 'Q'], ax=axis3)

In [None]:
# Per-port means of Survived and of the numeric features that might explain it
embarked_numeric_cols = ['Survived', 'Pclass', 'Fare', 'Age']
df.groupby('Embarked', as_index=False)[embarked_numeric_cols].mean()

The survival rate changes with Embarked, but this may actually be due to other features, e.g., Pclass, Fare and Age. Therefore, this feature can be dropped.

In [None]:
#  Remove 'S' dummy variable, and leave 'C' and 'Q', since they seem to have a good rate for survival. 
#embark_dummies_titanic = pd.get_dummies(train_df['Embarked'])
#embark_dummies_titanic.drop('S', axis=1, inplace=True)

#embark_dummies_test = pd.get_dummies(train_df['Embarked'])
#embark_dummies_test.drop('S', axis=1, inplace=True)

#train_df = train_df.join(embark_dummies_titanic)
#test_df = test_df.join(embark_dummies_test)

#train_df.drop(['Embarked'], axis=1,inplace=True)
#test_df.drop(['Embarked'], axis=1,inplace=True)

In [None]:
# Embarked is not used as a model feature; remove it from the working frame
df = df.drop('Embarked', axis=1)

### Fare

In [None]:
# Count the passengers with no Fare recorded
df['Fare'].isnull().sum()

Only one Fare value is missing. It can probably be inferred from Ticket, Pclass, Cabin, etc. Let's see the corresponding values of these features.

In [None]:
# Cabin and Embarked were dropped from `df` in earlier cells, so reading
# df.Embarked / df.Cabin here raises AttributeError. Look the passenger up in
# the untouched raw CSV frames instead, which still carry every column.
raw_passengers = pd.concat([train_df, test_df], ignore_index=True)
raw_passengers.loc[raw_passengers['Fare'].isnull(), ['Ticket', 'Pclass', 'Embarked', 'Cabin']]

There is no corresponding value for Cabin, so only look at the relation between Fare and the other three features.

In [None]:
# Mean Fare per ticket prefix, per passenger class, and per embarkation port.
# `df` can't be used directly here: its Embarked column was dropped in an
# earlier cell, and its Ticket column still holds full ticket strings at this
# point (one bar per distinct ticket). Build a small frame from the raw CSV
# data with Ticket reduced to its first character.
fare_context = (
    pd.concat([train_df, test_df], ignore_index=True)[['Ticket', 'Pclass', 'Embarked', 'Fare']]
    .assign(Ticket=lambda frame: frame['Ticket'].str[0])
)

fig, (axis1, axis2, axis3) = plt.subplots(1, 3, sharex=False, figsize=(10, 2))

Ticket_perc = fare_context[['Ticket', 'Fare']].groupby(['Ticket'], as_index=False).mean()
sns.barplot(x='Ticket', y='Fare', data=Ticket_perc, ax=axis1)

Pclass_perc = fare_context[['Pclass', 'Fare']].groupby(['Pclass'], as_index=False).mean()
sns.barplot(x='Pclass', y='Fare', data=Pclass_perc, ax=axis2)

Embarked_perc = fare_context[['Embarked', 'Fare']].groupby(['Embarked'], as_index=False).mean()
sns.barplot(x='Embarked', y='Fare', data=Embarked_perc, ax=axis3)


In [None]:
# Box plots of the Fare distribution per ticket prefix, per Pclass, and per
# embarkation port. The data comes from the raw CSV frames because `df` no
# longer has an Embarked column here and its Ticket column still holds full
# ticket strings; Ticket is reduced to its first character for grouping.
fare_box_data = (
    pd.concat([train_df, test_df], ignore_index=True)[['Ticket', 'Pclass', 'Embarked', 'Fare']]
    .assign(Ticket=lambda frame: frame['Ticket'].str[0])
)

fig, (axis1, axis2, axis3) = plt.subplots(3, 1, sharex=False, figsize=(10, 8))
# x/y are passed by keyword: newer seaborn releases reject positional x, y.
sns.boxplot(x='Ticket', y='Fare', data=fare_box_data, ax=axis1)
axis1.set_ylim(0, 300)
sns.boxplot(x='Pclass', y='Fare', data=fare_box_data, ax=axis2)
axis2.set_ylim(0, 300)
sns.boxplot(x='Embarked', y='Fare', data=fare_box_data, ax=axis3)
axis3.set_ylim(0, 300)

It looks like Fare does correlate with Ticket, Pclass & Embarked. Therefore I will fill the missing value with the median Fare of passengers with (Pclass = 3) & (Ticket prefix = 3) & (Embarked = S).

In [None]:
# Median Fare of passengers comparable to the one with the missing value:
# ticket starting with '3', third class, embarked at 'S'.
# The lookup uses the raw CSV frames because `df` no longer has an Embarked
# column at this point, and its Ticket column still holds the full ticket
# string (so comparing it to '3' would match nothing and yield NaN).
raw_fares = pd.concat([train_df, test_df], ignore_index=True)
comparable = (
    (raw_fares['Ticket'].str[0] == '3')
    & (raw_fares['Pclass'] == 3)
    & (raw_fares['Embarked'].fillna('S') == 'S')  # same 'S' default used for Embarked above
)
Fare_guess = raw_fares.loc[comparable, 'Fare'].median()

# Assign the filled column back; an in-place fillna on `df.Fare` may operate
# on a temporary Series and leave `df` unchanged.
df['Fare'] = df['Fare'].fillna(Fare_guess)

Now I will inspect the mean Fare values for people died and survived

In [None]:
# Overlaid Fare histograms (bins of width 10 from 0 to 200), one per Survived value
fare_bins = range(0, 210, 10)
facet = sns.FacetGrid(df, hue="Survived", size=2, aspect=1.5)
facet.map(plt.hist, 'Fare', alpha=0.5, bins=fare_bins)
facet.add_legend()


In [None]:
# Split Fare by outcome: Survived == 0 and Survived == 1
fare_not_survived = df.loc[df['Survived'] == 0, 'Fare']
fare_survived = df.loc[df['Survived'] == 1, 'Fare']

# One-column frames holding the mean and the standard deviation per outcome
# (row 0 = not survived, row 1 = survived)
average_fare = DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare = DataFrame([fare_not_survived.std(), fare_survived.std()])
average_fare.index.names = ['Survived']
std_fare.index.names = ['Survived']

# Histogram of all fares (x axis limited to 0-50), then mean fare per outcome
# with the standard deviation as error bars
df['Fare'].plot.hist(bins=100, xlim=(0, 50), figsize=[15, 3])
average_fare.plot(kind='bar', yerr=std_fare, legend=False)

In [None]:
# Scatter plot of the mean of Survived for each distinct Fare value
survival_by_fare = df[['Fare', 'Survived']].groupby(['Fare'], as_index=False).mean()
survival_by_fare.plot.scatter(x='Fare', y='Survived')

In [None]:
# Cut Fare into five quantile bins labelled 1..5 and store the label as an int
fare_quintile = pd.qcut(df['Fare'], 5, labels=[1, 2, 3, 4, 5])
df['Fare_bin'] = fare_quintile.astype(int)

# Mean of Survived within each fare bin
df.groupby('Fare_bin', as_index=False)[['Survived']].mean()

In [None]:
# Scatter plot of the mean of Survived per fare bin
survival_by_fare_bin = df[['Fare_bin', 'Survived']].groupby(['Fare_bin'], as_index=False).mean()
survival_by_fare_bin.plot.scatter(x='Fare_bin', y='Survived')
            

Now the correlation between Fare and Survived is clear after the binning

### Name

In [None]:
# First few raw passenger names
df['Name'].head()

In [None]:
# The title is the text between the first comma and the following period
# of the Name field, with surrounding whitespace removed
titles = df['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()
df['Title'] = titles
# inspect the amount of people for each title
titles.value_counts()

The main titles are: "Mr","Miss","Mrs" & "Master". Some of the others can be merged into one of the four categories. The rest will be merged into "Others"

In [None]:
# Fold the rarer titles into the four main ones
df['Title'] = df['Title'].replace('Mlle', 'Miss')
df['Title'] = df['Title'].replace(['Mme', 'Lady', 'Ms'], 'Mrs')

# Everything that is not one of the four main titles becomes 'Others'.
# A single df.loc[...] assignment writes into `df` itself; the previous
# chained form (df.Title.loc[...] = ...) may write to a temporary Series.
main_titles = ['Master', 'Mr', 'Miss', 'Mrs']
df.loc[~df['Title'].isin(main_titles), 'Title'] = 'Others'

In [None]:
# Left: passengers per title. Right: mean of Survived per title.
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 2))
sns.countplot(x='Title', data=df, ax=axis1)

title_perc = df.groupby('Title', as_index=False)[['Survived']].mean()
sns.barplot(x='Title', y='Survived', data=title_perc, ax=axis2)

Now we can use dummy variables for these titles and drop the original columns.

In [None]:
# Append one indicator column per title and remove the raw Name column
title_dummies = pd.get_dummies(df['Title'])
df = df.drop('Name', axis=1).join(title_dummies)

### Parch & SibSp

Instead of having 2 columns, Parch and SibSp, we can have only one column representing if the passenger had any families aboard. 

In [None]:
# Family size = the passenger plus siblings/spouses plus parents/children
df['Family'] = 1 + df['SibSp'] + df['Parch']
# Mean of Survived for each family size
df.groupby('Family', as_index=False)[['Survived']].mean()

In [None]:
# Left: passengers per family size. Right: mean of Survived per family size.
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 2))
sns.countplot(x='Family', data=df, ax=axis1)

family_perc = df.groupby('Family', as_index=False)[['Survived']].mean()
sns.barplot(x='Family', y='Survived', data=family_perc, ax=axis2)
#axis1.set_xticklabels(['With Family','Alone'],rotation=0)

In [None]:
# Passengers per family size
df.Family.value_counts()

The survival rate increases with the family size until Family>=5. However, only 57 passengers belong to families with >=5 members. Besides, people with big families (>=5) have a low survival rate too. Therefore, I will combine the data with Family>4 into one category, Family=5.

In [None]:
# Collapse every family size above 4 into the single category 5
df['Family'] = df['Family'].clip(upper=5)
df.groupby('Family', as_index=False)[['Survived']].mean()

In [None]:
# Same two panels as before, now with the capped family size:
# passengers per family size, and mean of Survived per family size
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 2))
sns.countplot(x='Family', data=df, ax=axis1)

family_perc = df.groupby('Family', as_index=False)[['Survived']].mean()
sns.barplot(x='Family', y='Survived', data=family_perc, ax=axis2)
#axis1.set_xticklabels(['With Family','Alone'],rotation=0)

In [None]:
# Drop "Parch" & "SibSp"
#df = df.drop(['Parch','SibSp'], axis=1)
#df = df.drop(['Parch','SibSp'], axis=1)

### PassengerId

### Pclass

In [None]:
# Count the passengers with no Pclass recorded
df.Pclass.isnull().sum()

In [None]:
# Survival rate per passenger class, computed on the labelled training rows
sns.factorplot(x='Pclass', y='Survived', data=train_df, order=[1, 2, 3], size=3)

We can see that a higher class has a higher survival rate. 

Create dummy variable for Pclass & drop the 3 rd class as it has the lowest average of survived passengers

In [None]:
# One indicator column per passenger class, named Class1 / Class2 / Class3.
# All three indicators are kept, and the original Pclass column stays in `df`.
pclass_dummies_df = pd.get_dummies(df['Pclass'], prefix='Class', prefix_sep='')
df = df.join(pclass_dummies_df)



In [None]:
# Drop useless columns
#train_df = train_df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
#test_df = test_df.drop(['Name','Ticket'], axis=1)

In [None]:
# Fill in NaN values of Age with the mean value



### Sex

As we see, children (age<16) aboard seem to have a high chance of survival, so we could classify passengers as males, females, and children.

In [None]:
# Count the passengers with no Sex recorded
df['Sex'].isnull().sum()

In [None]:
# Mean of Survived for each value of Sex
df.groupby('Sex', as_index=False)[['Survived']].mean()

In [None]:
# Encode Sex numerically: male -> 0, female -> 1
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(sex_codes)

In [None]:
# Left: passengers per encoded Sex value (1 first, then 0).
# Right: mean of Survived per encoded Sex value.
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 2))
sns.countplot(x='Sex', data=df, order=[1, 0], ax=axis1)

sex_perc = df.groupby('Sex', as_index=False)[['Survived']].mean()
sns.barplot(x='Sex', y='Survived', data=sex_perc, ax=axis2)

In [None]:
# Disabled experiment: the code below is wrapped in a bare string literal, so
# none of it executes. It sketches a 'Person' feature that labels passengers
# younger than 16 as 'child' and everyone else by their Sex value.
'''
def get_person(passenger):
    age, sex = passenger
    return 'child' if age < 16 else sex

df['Person'] = df[['Age','Sex']].apply(get_person, axis=1)

df['Person'].head()

# No need to use sex col since we created person col
df.drop(['Sex'],axis=1,inplace=True)


# Create dummy variables for Person column, & drop male as it has the lowest average of surved passengers
person_dummies_df = pd.get_dummies(df['Person'])
#person_dummies_df.columns = ['Child','Female','Male']
#person_dummies_df.drop(['Male'],axis=1, inplace=True)

#df = df.join(person_dummies_df)

# map the two genders to 0 and 1
df.Sex = df.Sex.map({'male':0, 'female':1})

'''


### Ticket

In [None]:
# Count the passengers with no Ticket recorded
df['Ticket'].isnull().sum()

In [None]:
# First ten raw ticket strings
df['Ticket'].head(10)

It seems like there are two types of tickets: (1) number only, and (2) letter+number. Tickets that start with letters probably represent some special classes, and the first digit of the numbers may represent the class.

In [None]:
# Keep only the first character of every ticket string
df['Ticket'] = df['Ticket'].str[0]
df.Ticket.value_counts()

In [None]:
# Left: passengers per ticket prefix. Right: mean of Survived per ticket prefix.
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 2))
sns.countplot(x='Ticket', data=df, ax=axis1)

ticket_perc = df.groupby('Ticket', as_index=False)[['Survived']].mean()
sns.barplot(x='Ticket', y='Survived', data=ticket_perc, ax=axis2)

We can see here that the majority of the tickets are "1", "2", "3", and their survival rates are "1">"2">"3". The rates for the others are low, except for "9"(2), "C"(77), "F"(13), "P"(98), and "S"(98). "9" and "F" are very small samples. The high rates here probably come from Pclass or Fare. Let's check.

In [None]:
#df[['Ticket', 'Fare']].groupby(['Ticket'], as_index=False).mean()
#df[['Ticket', 'Pclass']].groupby(['Ticket'], as_index=False).mean()

# Left: mean Fare per ticket prefix. Right: mean Pclass per ticket prefix.
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 2))

fare_perc = df.groupby('Ticket', as_index=False)[['Fare']].mean()
Pclass_perc = df.groupby('Ticket', as_index=False)[['Pclass']].mean()

sns.barplot(x='Ticket', y='Fare', data=fare_perc, ax=axis1)
sns.barplot(x='Ticket', y='Pclass', data=Pclass_perc, ax=axis2)

# 4. Modeling and Prediction

Now we can drop useless features and split the data into training and testing sets. Then I will use various models and make predictions.

In [None]:
# Preview the engineered frame before building the model matrices
df.head(5)

In [None]:
# Remove the columns that must not reach the estimators: the identifier, the
# raw family counts (replaced by Family), and the string-valued Title and
# Ticket columns. Ticket holds single characters such as 'A' or 'P' at this
# point; leaving it in makes every estimator fail when converting the feature
# matrix to float.
df = df.drop(['PassengerId', 'Parch', 'SibSp', 'Title', 'Ticket'], axis=1)

# The first len(train_df) rows of `df` are the labelled training rows; the
# remaining rows come from the test CSV and have no Survived value.
n_train = len(train_df)
# Cast to float so the matrices have one numeric dtype regardless of how the
# dummy columns are typed.
X_train = df[:n_train].drop(['Survived'], axis=1).astype(float).values
# Survived is stored as float in the combined frame (test rows are NaN);
# use integer class labels so predictions come out as 0/1 integers.
Y_train = df[:n_train]['Survived'].astype(int).values
X_test = df[n_train:].drop(['Survived'], axis=1).astype(float).values


In [None]:
# define training and testing sets
#X_train = train_df.drop('Survived', axis=1)
#Y_train = train_df['Survived']
#X_test = test_df.drop('PassengerId', axis=1).copy()
#X_train

In [None]:
# Logistic regression: fit on the training matrix, predict the test matrix,
# and report accuracy on the training data
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
logreg.score(X_train, Y_train)


In [None]:
# SVM

In [None]:
# Support vector classifier with default settings; the score is accuracy on
# the training data
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
svc.score(X_train, Y_train)

In [None]:
# Random forest with 100 trees. A fixed random_state makes the fitted forest,
# its predictions, and the reported training accuracy repeatable across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)
rf.score(X_train, Y_train)

In [None]:
# k-nearest neighbours with k = 3; the score is accuracy on the training data
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
knn.score(X_train, Y_train)

In [None]:
# Gaussian naive Bayes; the score is accuracy on the training data
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
gaussian.score(X_train, Y_train)

In [None]:
# Table of the logistic-regression coefficient fitted for each model feature.
# The feature names come from `df` minus the target, i.e. the same columns the
# training matrix was built from. The previous version used the raw train_df
# column names, which do not correspond to the engineered features.
# If the number of names and coefficients differ, the DataFrame constructor
# raises instead of silently misaligning them.
feature_names = df.drop(['Survived'], axis=1).columns
coeff_df = DataFrame(
    {'Features': feature_names, 'Coefficient Estimate': logreg.coef_[0]},
    columns=['Features', 'Coefficient Estimate'],
)
coeff_df

# 5 Submission

In [None]:
# Write the submission file from Y_pred, which holds the predictions of
# whichever model cell was run last.
# Predictions are cast to int: the labels come from a float column, so without
# the cast the CSV would contain 0.0 / 1.0 instead of 0 / 1.
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": np.asarray(Y_pred).astype(int),
})
submission.to_csv('titanic.csv', index=False)