In [6]:
# Importing the necessary libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
import numpy as np
import re

 **After importing libraries, our code needs to read the csv files into ipynb file.**

In [7]:
df_train = pd.DataFrame(pd.read_csv('train.csv'))
# For simplicity
df = df_train
df_test = pd.DataFrame(pd.read_csv('test.csv'))


**In order to evaluate the dataset, let's look at first 5 elements of the dataset.**

In [8]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**Let's look at the number of null values in each column**

In [9]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

**Let's see the average age of each class.**

In [10]:
print(df.groupby(['Pclass']).mean()['Age'])
print('\n')
print(df.groupby(['Sex']).mean()['Age'])

Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64


Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64


**Now we will define a function to fill the NaN values within the dataset**

In [11]:
def age_nan(df):
    for i in df.Sex.unique():
        for j in df.Pclass.unique():
            x = df.loc[((df.Sex == i) & (df.Pclass == j)), 'Age'].mean()
            df.loc[((df.Sex == i) & (df.Pclass == j)), 'Age'] = df.loc[((df.Sex == i) & (df.Pclass == j)), 'Age'].fillna(x)

**Let's fill the NaN's.**

In [12]:
age_nan(df)
age_nan(df_test)

**Now, we can fill Na values with "S" in Embarked column and with an integer in Cabin columns.**

In [13]:
df['Embarked'] = df['Embarked'].fillna('S')
df_test['Embarked'] = df_test['Embarked'].fillna('S')

df['Cabin_NaN'] = df['Cabin'].isnull().astype(int)
df_test['Cabin_NaN'] = df_test['Cabin'].isnull().astype(int)

**Final check and marking Na's with -1 in Fare column.**

In [14]:
df_test.isnull().sum()

df_test.Fare = df_test.Fare.fillna(-1)

**Applying Repeated Number of Folds Method(RKFold)**

In [15]:
def reg_cross_val(variables):
    X = df[variables]
    y = df['Survived']

    rkfold = RepeatedKFold(n_splits=2, n_repeats=10, random_state=10)
    result = []
    for treino, teste in rkfold.split(X):
        X_train, X_test = X.iloc[treino], X.iloc[teste]
        y_train, y_test = y.iloc[treino], y.iloc[teste]

        reg = LogisticRegression(max_iter=500)
        reg.fit(X_train, y_train)
        result.append(reg.score(X_test, y_test))

    return np.mean(result)

**Converting "female" and "male" strings to binary 1 and 0 respectively in order to ease our job on estimation.**

In [16]:
def is_female(x):
    if x == 'female':
        return 1
    else:
        return 0

In [17]:
df['Sex_bin'] = df['Sex'].map(is_female)
df_test['Sex_bin'] = df_test['Sex'].map(is_female)


**Converting the Embarked column status to 1 and zeros by reducing the variability but increasing the performance of the model. Results will be evaluated to verify this step.**

In [18]:
def embarked_s(x):
    if x == 'S':
        return 1
    else:
        return 0


df['Embarked_S'] = df['Embarked'].map(embarked_s)
df_test['Embarked_S'] = df_test['Embarked'].map(embarked_s)

In [19]:
def embarked_c(x):
    if x == 'C':
        return 1
    else:
        return 0

**Variable List**

In [20]:
df['Embarked_C'] = df['Embarked'].map(embarked_c)
df_test['Embarked_C'] = df_test['Embarked'].map(embarked_c)

variables_before = ['Age', 'Pclass', 'Fare', 'SibSp', 'Parch']
print('Before the new features:', reg_cross_val(variables_before))

variables = ['Age', 'Sex_bin', 'Pclass', 'Fare', 'SibSp', 'Parch', 'Embarked_S',\
             'Embarked_C', 'Cabin_NaN']

Before the new features: 0.7006696226129894


In [21]:
df['Family'] = df.SibSp + df.Parch
df_test['Family'] = df_test.SibSp + df_test.Parch

variables = ['Age', 'Sex_bin', 'Pclass', 'Fare', 'Embarked_S',\
             'Embarked_C', 'Cabin_NaN', 'Family']

**Cross Validating Variables**

In [22]:
reg_cross_val(variables)

0.8018995314153272

**Defining test and training data, fitting model and obtaining accuracy for part of the train data**

In [23]:
text_ticket = ''
for i in df.Ticket:
    text_ticket += i

lista = re.findall('[a-zA-Z]+', text_ticket)
print('Most repeated terms in Tickets: \n')
print(pd.Series(lista).value_counts().head(10))

df['CA'] = df['Ticket'].str.contains('CA|C.A.').astype(int)
df['SOTON'] = df['Ticket'].str.contains('SOTON|STON').astype(int)
df['PC'] = df['Ticket'].str.contains('PC').astype(int)
df['SC'] = df['Ticket'].str.contains('SC|S.C').astype(int)
df['C'] = df['Ticket'].str.contains('C').astype(int)



df_test['CA'] = df_test['Ticket'].str.contains('CA|C.A.').astype(int)
df_test['SOTON'] = df_test['Ticket'].str.contains('SOTON|STON').astype(int)
df_test['PC'] = df_test['Ticket'].str.contains('PC').astype(int)
df_test['SC'] = df_test['Ticket'].str.contains('SC|S.C').astype(int)
df_test['C'] = df_test['Ticket'].str.contains('C').astype(int)
text_name = ''
for i in df.Name:
    text_name += i

lista = re.findall('[a-zA-Z]+', text_name)
print('Most repeated words in Name column: \n')
print(pd.Series(lista).value_counts().head(10))

df['Master'] = df['Name'].str.contains('Master').astype(int)
df['Mr'] = df['Name'].str.contains('Mr').astype(int)
df['Miss'] = df['Name'].str.contains('Miss').astype(int)
df['Mrs'] = df['Name'].str.contains('Mrs').astype(int)



df_test['Master'] = df_test['Name'].str.contains('Master').astype(int)
df_test['Mr'] = df_test['Name'].str.contains('Mr').astype(int)
df_test['Miss'] = df_test['Name'].str.contains('Miss').astype(int)
df_test['Mrs'] = df_test['Name'].str.contains('Mrs').astype(int)

variables = ['Age', 'Sex_bin', 'Pclass', 'Fare', 'Embarked_S','Embarked_C',\
             'CA', 'SOTON', 'PC', 'SC','C', 'Mr', 'Miss', 'Master', 'Mrs', 'Family']

print(reg_cross_val(variables))
variables = ['Age', 'Sex_bin', 'Pclass', 'Fare','Family', 'Embarked_S','Embarked_C','Cabin_NaN',\
             'CA', 'SOTON', 'PC', 'SC', 'Master', 'Mr', 'Miss', 'C', 'Mrs']

X = df[variables]
y = df['Survived']




Most repeated terms in Tickets: 

C        63
PC       60
A        58
O        37
SOTON    18
STON     18
S        15
SC       13
W        13
CA       13
dtype: int64
Most repeated words in Name column: 

Mr         521
Miss       182
Mrs        129
William     49
Master      40
John        28
Henry       19
Thomas      17
Charles     17
George      16
dtype: int64
0.8251342772207387


**Fitting the model to test data and obtaining a guess for test data**

In [26]:
reg = LogisticRegression(max_iter = 500)
reg.fit(X,y)
resp = reg.predict(df_test[variables])
resp

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,