In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the labelled training set, the unlabelled test set, and the
# sample-submission frame whose 'Survived' column is overwritten with the
# model's predictions at the end of the notebook.
train_data, test_data, predictions = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "test.csv", 'gender_submission.csv')
)

In [3]:
# Preview the first five training rows to see the raw columns.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first five test rows (same columns as training, minus 'Survived').
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Keep both frames in one list so every preprocessing step can loop over them.
# The list holds references, not copies: in-place changes made through `data`
# are also visible through `train_data` and `test_data`.
data = [train_data,test_data]

# Preprocessing

In [6]:
# Print each frame's summary (dtypes and non-null counts per column) to spot
# which columns have missing values; a separator line follows each frame.
for dataset in data:
    dataset.info()
    print(".-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         

#### Dropping Unusable Columns

In [7]:
# Remove the columns this notebook does not turn into features ('Cabin' is
# non-null for only 204 of the 891 training rows). The drop is in place so
# that train_data / test_data themselves are modified.
for dataset in data:
    dataset.drop(columns=['PassengerId', 'Cabin', 'Ticket'], inplace=True)

#### Preprocessing Names

In [8]:
# Reduce every full name to a 4-character title prefix: the first four
# characters of the text that follows the first ", "
# (e.g. "Braund, Mr. Owen Harris" -> "Mr. ").
# Because the slice is fixed-width, short titles keep a trailing space
# ("Mr. ", "Dr. ", "Ms. ") and long ones are truncated ("Mast", "Majo").
for dataset in data:
    dataset["Name"] = [
        name_parts[1][:4] for name_parts in dataset["Name"].str.split(", ")
    ]

In [9]:
# Frequency of each title prefix in the training set.
data[0]['Name'].value_counts()

Mr.     517
Miss    182
Mrs.    125
Mast     40
Dr.       7
Rev.      6
Mlle      2
Col.      2
Majo      2
Jonk      1
Capt      1
Mme.      1
Lady      1
the       1
Ms.       1
Sir.      1
Don.      1
Name: Name, dtype: int64

In [10]:
# Frequency of each title prefix in the test set.
data[1]['Name'].value_counts()

Mr.     240
Miss     78
Mrs.     72
Mast     21
Col.      2
Rev.      2
Dona      1
Ms.       1
Dr.       1
Name: Name, dtype: int64

In [11]:
# Collapse rare or synonymous title prefixes into five groups:
# 'Rare', 'Miss', 'Mrs.', 'Mr. ' and the untouched 'Mast'.
# Keys are the 4-character prefixes, so short titles carry a trailing space
# ('the ', 'Dr. ', 'Ms. ') and so does the 'Mr. ' target.
title_groups = {
    'Majo': 'Rare', 'the ': 'Rare', 'Dr. ': 'Rare',
    # NOTE(review): 'Mme.' is grouped with 'Miss' here; it arguably belongs
    # with 'Mrs.' -- confirm the intended grouping before changing it.
    'Mlle': 'Miss', 'Mme.': 'Miss', 'Ms. ': 'Miss',
    'Lady': 'Mrs.', 'Dona': 'Mrs.',
    'Capt': 'Mr. ', 'Sir.': 'Mr. ', 'Don.': 'Mr. ',
    'Col.': 'Mr. ', 'Jonk': 'Mr. ', 'Rev.': 'Mr. ',
}
for dataset in data:
    dataset['Name'] = dataset['Name'].replace(title_groups)

In [12]:
# Cross-tabulate title group against sex in the training set as a sanity
# check on the grouping.
pd.crosstab(index=data[0]['Name'], columns=data[0]['Sex'])

Sex,female,male
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mast,0,40
Miss,186,0
Mr.,0,529
Mrs.,126,0
Rare,2,8


In [13]:
# Encode the five title groups as integer codes. A title that is missing from
# this mapping becomes NaN under .map(), which makes .astype(int) raise.
title = {
    'Mast': 4,
    'Miss': 3,
    'Mr. ': 2,
    'Mrs.': 1,
    'Rare': 0,
}
for dataset in data:
    dataset['Name'] = dataset['Name'].map(title).astype(int)

#### Preprocessing Embarked

In [14]:
# Mean survival rate per embarkation port in the training set.
data[0].groupby('Embarked')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Embarked,Unnamed: 1_level_1
C,0.553571
Q,0.38961
S,0.336957


In [15]:
# Integer codes for the embarkation port.
Embarked = {'S': 2, 'Q': 1, 'C': 0}
for dataset in data:
    # Missing ports (2 rows in the training set) are filled with "S" before
    # encoding. The filled column is assigned back to the frame rather than
    # calling fillna(inplace=True) on the selected column: that form acts on
    # an intermediate object and, under pandas Copy-on-Write, leaves the frame
    # unchanged, so the NaNs would survive and .astype(int) would raise.
    dataset['Embarked'] = dataset['Embarked'].fillna("S").map(Embarked).astype(int)

#### Preprocessing Age

In [16]:
# Mean training-set age per encoded title group, laid out as a single 'Age'
# row with one column per title code, so a value is read as
# avn[title_code]['Age']. Missing ages are skipped by mean().
avn = train_data.groupby('Name')[['Age']].mean().T
avn

Name,0,1,2,3,4
Age,42.444444,36.009174,32.815854,21.86,4.574167


In [17]:
# Fill in missing ages with the training-set mean age of the passenger's
# title group. The same training means (avn) are applied to both frames.
age_by_title = avn.loc['Age']  # Series: title code -> mean age
for dataset in data:
    # "Age > 0" is False for NaN (and for any non-positive age), so the
    # negation selects exactly the rows that need a value.
    needs_age = ~(dataset['Age'] > 0)
    # One .loc assignment writes straight into the frame; it does not depend
    # on chained indexing or on the index being 0..n-1.
    dataset.loc[needs_age, 'Age'] = dataset.loc[needs_age, 'Name'].map(age_by_title)
    dataset.info()
    print(".-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int64(6), object(1)
memory usage: 62.7+ KB
.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
Pclass      418 non-null int64
Name        418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
Embarked    418 non-null int64
dtypes: float64(2), int64(5), object(1)
memory usage: 26.2+ KB
.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-


In [18]:
for dataset in data:
    dataset.loc[dataset['Age'] <= 18, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 35), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 35) & (dataset['Age'] <= 64), 'Age'] = 2
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 3

#### Preprocessing Sex

In [19]:
# Encode sex as an integer flag: female -> 1, male -> 0. Any other value
# becomes NaN under .map() and makes .astype(int) raise.
sex_codes = {'female': 1, 'male': 0}
for dataset in data:
    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)

#### Preprocessing Parch and SibSp

In [20]:
# Replace SibSp and Parch with a single flag:
# Alone = 1 when SibSp + Parch == 0, otherwise 0.
for dataset in data:
    dataset['Alone'] = ((dataset['SibSp'] + dataset['Parch']) == 0).astype('int64')
    dataset.drop(columns=['SibSp', 'Parch'], inplace=True)

#### Preprocessing Fare

In [21]:
# The test set has one missing Fare (417 of 418 non-null); fill it with the
# test-set mean fare. The result is assigned back to the frame rather than
# calling fillna(inplace=True) on the selected column, which acts on an
# intermediate object and leaves the frame unchanged under pandas
# Copy-on-Write.
data[1]['Fare'] = data[1]['Fare'].fillna(data[1]['Fare'].mean())

# Correlation

In [22]:
# Pairwise Pearson correlation between all (now numeric) training columns.
data[0].corr(method='pearson')

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,Alone
Survived,1.0,-0.338481,0.064779,0.543351,-0.046674,0.257307,-0.167675,-0.203367
Pclass,-0.338481,1.0,0.149374,-0.1319,-0.316668,-0.5495,0.162098,0.135207
Name,0.064779,0.149374,1.0,0.043178,-0.474037,0.012925,-0.019736,-0.013292
Sex,0.543351,-0.1319,0.043178,1.0,-0.050193,0.182333,-0.108262,-0.303646
Age,-0.046674,-0.316668,-0.474037,-0.050193,1.0,0.092854,-0.017504,0.136814
Fare,0.257307,-0.5495,0.012925,0.182333,0.092854,1.0,-0.224719,-0.271832
Embarked,-0.167675,0.162098,-0.019736,-0.108262,-0.017504,-0.224719,1.0,0.063532
Alone,-0.203367,0.135207,-0.013292,-0.303646,0.136814,-0.271832,0.063532,1.0


# Training the Model

In [23]:
# Separate the training frame into features and target; the test frame has no
# 'Survived' column and is used as-is.
X_train = data[0].drop(columns="Survived")
Y_train = data[0]["Survived"]
X_test = data[1]

In [24]:
# Fit a decision tree on the engineered features. random_state is fixed so the
# fitted tree, and therefore the submission file, is the same on every run.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, Y_train)
# NOTE: this is accuracy on the data the tree was fitted on, not a held-out
# estimate of how well it generalizes.
print("Train Accuracy:",classifier.score(X_train, Y_train) * 100)

Train Accuracy: 94.1638608305275


# Predicting

In [25]:
# Predict a 'Survived' label for every row of the preprocessed test frame.
Y_pred = classifier.predict(X_test)

In [26]:
# Overwrite the 'Survived' column of the loaded submission frame with the
# model's predictions.
# NOTE(review): this assumes gender_submission.csv lists passengers in the
# same row order as test.csv; 'PassengerId' was dropped from test_data
# earlier, so the alignment is not checked here -- confirm.
predictions['Survived'] = Y_pred

In [27]:
# Write the submission file; index=False keeps the row index out of the CSV.
predictions.to_csv("predictions.csv",index = False)