In [1]:
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the Titanic training and test CSV files into DataFrames.
train_csv = "./titan/train.csv"
test_csv = "./titan/test.csv"
titanic_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Show the first five training rows as a quick sanity check.
titanic_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Print column dtypes and non-null counts for both frames,
# separated by a dashed rule so the two reports are easy to tell apart.
separator = "----------------------------"
titanic_df.info()
print(separator)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
----------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare

In [4]:
# Remove columns that are not used as model features.
# PassengerId is dropped from the training frame only; the test frame keeps it
# because the submission cell reads test_df["PassengerId"].
dropped_from_both = ['Name', 'Ticket', 'Cabin']
titanic_df = titanic_df.drop(['PassengerId'] + dropped_from_both, axis='columns')
test_df = test_df.drop(dropped_from_both, axis='columns')

In [5]:
# Fill missing embarkation ports with 'S'.
# The .info() output shows 889 of 891 training rows have Embarked set, so this
# touches two training rows ('S' is presumably the most common port - confirm).
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')

# Fill the test frame from its OWN column. Assigning titanic_df['Embarked']
# here would align on the row index and overwrite every test passenger's port
# with the port of the training passenger that shares the same index.
test_df['Embarked'] = test_df['Embarked'].fillna('S')

In [6]:
# Replace the textual Sex column with an integer code in both frames.
# A label missing from the mapping raises KeyError rather than passing through.
sex = {'male': 1, 'female': 2}
for frame in (titanic_df, test_df):
    frame['Sex'] = frame['Sex'].map(lambda label: sex[label])

In [7]:
def age(x):
    """Map a numeric age onto an ordinal bucket.

    Buckets are half-open intervals:
        1 -> x < 15
        2 -> 15 <= x < 30
        3 -> 30 <= x < 45
        4 -> everything else

    NaN compares False against every bound, so a missing age falls through
    to bucket 4 together with ages of 45 and above.
    """
    # Walk the upper bounds in ascending order; the first one that x is
    # strictly below decides the bucket.
    for bucket, upper_bound in enumerate((15, 30, 45), start=1):
        if x < upper_bound:
            return bucket
    return 4
# Bin each frame's own Age column into the ordinal buckets defined by age().
# NOTE: this cell is not idempotent - once Age holds bucket codes (1-4),
# running it again maps every value to bucket 1 because all codes are < 15.
titanic_df['Age'] = titanic_df['Age'].map(age)

# Use test_df's own ages. Binning titanic_df['Age'] here would re-bin the
# already-encoded training codes (all < 15) and set every test row to 1.
test_df['Age'] = test_df['Age'].map(age)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,2,1,0,7.25,S
1,1,1,2,3,1,0,71.2833,C
2,1,3,2,2,0,0,7.925,S
3,1,1,2,3,1,0,53.1,S
4,0,3,1,3,0,0,8.05,S


In [8]:
# Combine siblings/spouses (SibSp) and parents/children (Parch) into a single
# 'family' count, then discard the two source columns. Done for both frames.
relative_cols = ['SibSp', 'Parch']
titanic_df = (
    titanic_df
    .assign(family=titanic_df['SibSp'] + titanic_df['Parch'])
    .drop(relative_cols, axis=1)
)
test_df = (
    test_df
    .assign(family=test_df['SibSp'] + test_df['Parch'])
    .drop(relative_cols, axis=1)
)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,family
0,0,3,1,2,7.25,S,1
1,1,1,2,3,71.2833,C,1
2,1,3,2,2,7.925,S,0
3,1,1,2,3,53.1,S,1
4,0,3,1,3,8.05,S,0


In [9]:
# Turn the embarkation port letter into an integer code in both frames.
# A value that is not a key of the mapping (including NaN) raises KeyError.
disc = {'S': 1, 'C': 2, 'Q': 3}
for frame in (titanic_df, test_df):
    frame['Embarked'] = frame['Embarked'].map(lambda port: disc[port])
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,family
0,0,3,1,2,7.25,1,1
1,1,1,2,3,71.2833,2,1
2,1,3,2,2,7.925,1,0
3,1,1,2,3,53.1,1,1
4,0,3,1,3,8.05,1,0


In [10]:
# Split the training frame into target and features, and derive the test
# feature matrix by removing the identifier column from the test frame.
Y_train = titanic_df["Survived"]
X_train = titanic_df.drop("Survived", axis="columns")
X_test = test_df.drop("PassengerId", axis="columns").copy()

# Display the three shapes to confirm train/test feature counts line up.
(X_train.shape, Y_train.shape, X_test.shape)

((891, 6), (891,), (418, 6))

In [19]:
# Fill missing test fares with the median fare of the training features,
# so the imputed value is derived from training data only.
train_fare_median = X_train['Fare'].median()
X_test['Fare'] = X_test['Fare'].fillna(train_fare_median)

In [20]:
# Logistic regression: fit on the training set and predict the test set.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Accuracy (in percent) is measured on the same data the model was fitted on,
# so it is a training score rather than a held-out estimate.
acc_log = round(100 * logreg.score(X_train, Y_train), 2)
acc_log

81.03

In [21]:
# Support vector classifier with default settings: fit, then predict the test set.
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)

# Training-set accuracy in percent (scored on the data used for fitting).
acc_svc = round(100 * svc.score(X_train, Y_train), 2)
acc_svc

85.41

In [22]:
# k-nearest neighbours with k = 3: fit, then predict the test set.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

# Training-set accuracy in percent (scored on the data used for fitting).
acc_knn = round(100 * knn.score(X_train, Y_train), 2)
acc_knn

86.42

In [23]:
# Random forest with 100 trees. random_state is fixed so that the fitted
# model, its predictions and the reported score are the same on every run;
# without it each run builds a different forest.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)

# Training-set accuracy in percent (scored on the data used for fitting, so
# it overstates how well the model generalises). Scored once; the earlier
# duplicate call discarded its result.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

94.73

In [25]:
# Build the submission from the random forest explicitly.
# Y_pred is reassigned by every model cell, so relying on it would submit the
# predictions of whichever model cell happened to run last.
submission_pred = random_forest.predict(X_test)

# One row per test passenger: the identifier kept on test_df plus the
# predicted Survived label.
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": submission_pred
    })

# index=False keeps the DataFrame row index out of the CSV.
submission.to_csv('./output/submission.csv', index=False)