In [92]:
import numpy as np
import pandas as pd 



import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are expected in the "../input/" directory.
# Print the directory listing so the available file names are visible before
# any of them is loaded.

import os
print(os.listdir("../input"))

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB


['train.csv', 'gender_submission.csv', 'test.csv']


In [93]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')


In [94]:
# Stack the training and test rows into one frame so that every
# feature-engineering step below is applied to both sets at once.
#
# pd.concat replaces DataFrame.append, which was deprecated and then removed in
# pandas 2.0. The two frames do not have identical columns (the original run
# emitted the "non-concatenation axis is not aligned" sort warning), so
# sort=True is passed explicitly to keep the alphabetical column order that
# the original run produced and to silence that warning.
# ignore_index=True gives the result a fresh 0..n-1 index, train rows first.

titanic = pd.concat([train, test], ignore_index=True, sort=True)
titanic.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(1309, 12)

In [95]:
# Preview the first rows of the combined train + test frame.
titanic.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [96]:
# Encode 'Sex' as a numeric indicator column 'Sex_male'
# (female -> 0, male -> 1) and discard the original text column.

titanic = titanic.assign(
    Sex_male=titanic['Sex'].map({'female': 0, 'male': 1})
).drop(columns=['Sex'])
titanic.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,SibSp,Survived,Ticket,Sex_male
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,1,0.0,A/5 21171,1
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1.0,PC 17599,0
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,0,1.0,STON/O2. 3101282,0
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1.0,113803,0
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,0,0.0,373450,1


In [97]:
# Merfing 'Parch' and 'SibSp' to 'family' which is a more meaningful column

titanic['family'] = titanic.SibSp + titanic.Parch + 1
titanic.drop(['Parch','SibSp','Ticket','Fare','Cabin'],axis=1,inplace=True)

In [98]:
# Check the frame after adding 'family' and dropping the unused columns.
titanic.head()

Unnamed: 0,Age,Embarked,Name,PassengerId,Pclass,Survived,Sex_male,family
0,22.0,S,"Braund, Mr. Owen Harris",1,3,0.0,1,2
1,38.0,C,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1.0,0,2
2,26.0,S,"Heikkinen, Miss. Laina",3,3,1.0,0,1
3,35.0,S,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1.0,0,2
4,35.0,S,"Allen, Mr. William Henry",5,3,0.0,1,1


In [99]:
# Derive a numeric 'Title' feature from the honorific embedded in 'Name'
# (e.g. "Braund, Mr. Owen Harris" -> "Mr" -> 1), then drop 'Name'.

titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}


# Extract the title: the run of letters that follows a space and is terminated
# by a period. The pattern is a raw string because '\.' is not a valid Python
# string escape; the non-raw literal triggers an invalid-escape warning on
# current Python versions. The regex itself is unchanged.
titanic['Title'] = titanic['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Fold the infrequent titles into a single 'Rare' bucket ...
titanic['Title'] = titanic['Title'].replace(
    ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
     'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
    'Rare',
)
# ... and normalise spelling variants onto the common titles.
titanic['Title'] = titanic['Title'].replace(
    {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
)

# Convert titles into their numeric codes. Any title that is not a key of
# `titles` becomes NaN here and is then coded as 0.
titanic['Title'] = titanic['Title'].map(titles)
titanic['Title'] = titanic['Title'].fillna(0)

# 'Name' has served its purpose once the title is extracted.
titanic = titanic.drop(['Name'], axis=1)


In [100]:
# Replace the raw 'Age' with an ordinal age-group code (0-6).
#
# NOTE: this cell is not idempotent - running it twice would bin the group
# codes themselves. Re-run the notebook from the top instead.

# Missing ages are imputed with the mean age of the combined frame, then ages
# are truncated to whole years. The result is assigned back to the column:
# calling fillna(inplace=True) on the titanic['Age'] selection is a chained
# in-place update, which pandas 2.x warns about and which no longer modifies
# the frame once copy-on-write is in effect.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean()).astype(int)

# Right-closed bins reproduce the original thresholds exactly:
#   <=11 -> 0, 12-18 -> 1, 19-22 -> 2, 23-27 -> 3,
#   28-33 -> 4, 34-40 -> 5, >40 -> 6
# The original code assigned group 6 to both 41-66 and >66, so those two
# ranges share the last bin here.
# One pd.cut call bins every row against the original ages in a single pass,
# instead of rewriting the column eight times and relying on rule order to
# keep already-assigned codes from matching a later range.
age_bin_edges = [-np.inf, 11, 18, 22, 27, 33, 40, np.inf]
titanic['Age'] = pd.cut(
    titanic['Age'], bins=age_bin_edges, labels=False
).astype(int)



In [101]:
# Inspect the frame after the 'Title' and age-group encoding.
titanic.head()

Unnamed: 0,Age,Embarked,PassengerId,Pclass,Survived,Sex_male,family,Title
0,2,S,1,3,0.0,1,2,1
1,5,C,2,1,1.0,0,2,3
2,3,S,3,3,1.0,0,1,2
3,5,S,4,1,1.0,0,2,3
4,5,S,5,3,0.0,1,1,1


In [102]:
# Summary statistics of the numeric columns; the 'count' row shows which
# columns still contain missing values.
titanic.describe()

Unnamed: 0,Age,PassengerId,Pclass,Survived,Sex_male,family,Title
count,1309.0,1309.0,1309.0,891.0,1309.0,1309.0,1309.0
mean,3.588999,655.0,2.294882,0.383838,0.644003,1.883881,1.73262
std,1.734891,378.020061,0.837836,0.486592,0.478997,1.583639,1.022087
min,0.0,1.0,1.0,0.0,0.0,1.0,1.0
25%,2.0,328.0,2.0,0.0,0.0,1.0,1.0
50%,4.0,655.0,3.0,0.0,1.0,1.0,1.0
75%,5.0,982.0,3.0,1.0,1.0,2.0,2.0
max,6.0,1309.0,3.0,1.0,1.0,11.0,5.0


In [103]:
# Turn 'Embarked' into the numeric column 'Embarked_num'
# (C -> 0, Q -> 1, S -> 2) and remove the original text column.
# Values outside the mapping, including missing ones, come out as NaN.

embarked_codes = titanic['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
titanic = titanic.assign(Embarked_num=embarked_codes).drop(columns=['Embarked'])

In [104]:
# Fill the missing embarkation codes with 2, the code given to 'S' when
# 'Embarked_num' was created.
# The filled Series is assigned back to the column: fillna(inplace=True) on the
# titanic['Embarked_num'] selection is a chained in-place update, which
# pandas 2.x warns about and which no longer modifies the frame once
# copy-on-write is in effect.
titanic['Embarked_num'] = titanic['Embarked_num'].fillna(2)
titanic.describe()

Unnamed: 0,Age,PassengerId,Pclass,Survived,Sex_male,family,Title,Embarked_num
count,1309.0,1309.0,1309.0,891.0,1309.0,1309.0,1309.0,1309.0
mean,3.588999,655.0,2.294882,0.383838,0.644003,1.883881,1.73262,1.493506
std,1.734891,378.020061,0.837836,0.486592,0.478997,1.583639,1.022087,0.814244
min,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
25%,2.0,328.0,2.0,0.0,0.0,1.0,1.0,1.0
50%,4.0,655.0,3.0,0.0,1.0,1.0,1.0,2.0
75%,5.0,982.0,3.0,1.0,1.0,2.0,2.0,2.0
max,6.0,1309.0,3.0,1.0,1.0,11.0,5.0,2.0


In [105]:
# Make sure the two encoded columns are stored as integers.
# 'Survived' is deliberately not cast: it is still missing for part of the
# combined frame, and NaN cannot be held in a plain int column.
titanic = titanic.astype({'Age': int, 'Embarked_num': int})

In [106]:
# Confirm column dtypes and non-null counts before building the model inputs.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
Age             1309 non-null int64
PassengerId     1309 non-null int64
Pclass          1309 non-null int64
Survived        891 non-null float64
Sex_male        1309 non-null int64
family          1309 non-null int64
Title           1309 non-null int64
Embarked_num    1309 non-null int64
dtypes: float64(1), int64(7)
memory usage: 81.9 KB


In [107]:
# Split the engineered frame back into model inputs.
# The combined frame was built train-first with a fresh 0..n-1 index, so the
# first len(train) rows are the labelled training passengers and the remainder
# are the test passengers. Deriving the boundary from len(train) instead of
# the literal 891 keeps the split correct if the input files change.
n_train = len(train)

# Neither the target nor the row identifier is a feature; drop them once and
# slice the same feature frame for both sets.
features = titanic.drop(["Survived", "PassengerId"], axis=1)

X_train = features.iloc[:n_train, :]
Y_train = titanic["Survived"].iloc[:n_train]
X_test = features.iloc[n_train:, :]


In [108]:
# Check the training feature matrix: column set, dtypes and non-null counts.
# (The float cast below is left disabled.)
#X_train = X_train.astype(float)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Age             891 non-null int64
Pclass          891 non-null int64
Sex_male        891 non-null int64
family          891 non-null int64
Title           891 non-null int64
Embarked_num    891 non-null int64
dtypes: int64(6)
memory usage: 41.8 KB


In [70]:
#logreg = LogisticRegression()
#logreg.fit(X_train, Y_train)

#Y_pred = logreg.predict(X_test).astype(int)

#acc_log = round(logreg.score(X_train, Y_train) * 100, 2)

In [71]:
#acc_log

In [109]:
# Fit a decision tree on the training rows and predict the test rows.
# random_state pins the estimator's internal randomness so that a fresh
# Restart & Run All rebuilds the same tree and the same submission file.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, Y_train)

# 'Survived' is stored as float in the combined frame, so the predictions come
# back as floats; cast them to int for the submission file.
Y_pred = decision_tree.predict(X_test).astype(int)

# NOTE: this is accuracy (in percent) on the same rows the tree was fitted on.
# It is an optimistic figure, not an estimate of performance on unseen data.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

87.77

In [45]:
#random_forest = RandomForestClassifier(n_estimators=100)
#random_forest.fit(X_train, Y_train)

#Y_pred = random_forest.predict(X_test).astype(int)
#random_forest.score(X_train, Y_train)
#acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

In [46]:
#knn = KNeighborsClassifier(n_neighbors = 3)
#knn.fit(X_train, Y_train)
#Y_pred = knn.predict(X_test).astype(int)
#acc_knn = round(knn.score(X_train, Y_train) * 100, 2)


In [47]:
#acc_knn

In [110]:
# Build the submission table: the test set's 'PassengerId' column next to the
# predicted 'Survived' value for the same row.
submission = test[["PassengerId"]].assign(Survived=Y_pred)

In [111]:
# Write the predictions to a CSV in the working directory, without the
# DataFrame index column, and report the file name.
filename = 'Titanic Predictions 12 (Decision Trees).csv'
submission.to_csv(path_or_buf=filename, index=False)
print('Saved file: ' + filename)

Saved file: Titanic Predictions 12 (Decision Trees).csv
