## Importing the relevant libraries

In [1]:
import numpy as np
import pandas as pd

## Load the raw data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
titanic = train.append(test, ignore_index=True, sort=False)

titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Process the raw data

In [3]:
titanic['Title'] = titanic.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())
print(titanic.Title.unique())

# normalize the titles
normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
}

titanic.Title = titanic.Title.map(normalized_titles)

print(titanic.Title.value_counts())

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer' 'Dona']
Mr         757
Miss       262
Mrs        200
Master      61
Officer     23
Royalty      6
Name: Title, dtype: int64


In [4]:
groups = titanic.groupby(['Sex', 'Pclass', 'Title'])
titanic.Age = groups.Age.apply(lambda x: x.fillna(x.median()))

In [5]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [6]:
titanic.Cabin = titanic.Cabin.fillna('U')
titanic.Fare = titanic.Fare.fillna(titanic.Fare.median())
titanic.Cabin = titanic.Cabin.map(lambda x: x[0])

titanic['FamilySize'] = titanic.SibSp + titanic.Parch + 1
titanic = titanic.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Embarked', 'PassengerId'], axis=1)

print(titanic.isnull().sum())

Survived      418
Pclass          0
Sex             0
Age             0
Fare            0
Cabin           0
Title           0
FamilySize      0
dtype: int64


In [7]:
titanic = titanic.join(pd.get_dummies(titanic.Cabin, 'Cabin'))
titanic = titanic.join(pd.get_dummies(titanic.Title, 'Title'))
titanic.Sex = titanic.Sex.map({ 'male': 0, 'female': 1 })

titanic = titanic.drop(['Cabin', 'Title', 'Pclass'], axis=1)
print(titanic.head())

   Survived  Sex   Age     Fare  FamilySize  Cabin_A  Cabin_B  Cabin_C  \
0       0.0    0  22.0   7.2500           2        0        0        0   
1       1.0    1  38.0  71.2833           2        0        0        1   
2       1.0    1  26.0   7.9250           1        0        0        0   
3       1.0    1  35.0  53.1000           2        0        0        1   
4       0.0    0  35.0   8.0500           1        0        0        0   

   Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U  Title_Master  \
0        0        0        0        0        0        1             0   
1        0        0        0        0        0        0             0   
2        0        0        0        0        0        1             0   
3        0        0        0        0        0        0             0   
4        0        0        0        0        0        1             0   

   Title_Miss  Title_Mr  Title_Mrs  Title_Officer  Title_Royalty  
0           0         1          0              0

## Scale data

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

targets = titanic.Survived
inputs = titanic.drop(['Survived'], axis=1)

scaler = StandardScaler()
scaler.fit(inputs)

inputs = scaler.transform(inputs)

In [9]:
size = len(targets)
real_test_size = targets.isnull().sum()
real_train_size = size - real_test_size

real_test_inputs = inputs[-real_test_size:size]

targets = targets.head(real_train_size)
inputs = inputs[0:real_train_size]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.2, random_state=365)

## Train the model

In [11]:
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression()
reg.fit(X_train, y_train)
reg.score(X_train, y_train)

0.8356741573033708

In [12]:
survivors = reg.predict(real_test_inputs).astype(int)

In [13]:
result = test[['PassengerId']].copy()
result['Survived'] = survivors

In [14]:
result.to_csv(path_or_buf='result-2.csv',index=False)