# TITANIC DATA AND PREDICTIONS

The machine learning models have been constructed using the training set, which includes the provided outcome for each passenger, based on features such as gender and class. New features have also been engineered.

The test set has been employed to assess the model's performance on previously unseen data. 

In [1]:
# data analysis stack
import numpy as np
import pandas as pd

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')


# machine-learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# miscellaneous
import warnings
warnings.filterwarnings("ignore")


## Upload data and Feature Engineeering

In [2]:
titanic = pd.read_csv("./data/train.csv", index_col = 0)
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
titanic.info()

In [None]:
titanic.isna().sum()

### Data Cleaning

The current state of the "Name" column is disorganized. We will tidy it up by grouping entries and assigning new titles, resulting in the creation of a new column.

In [3]:
# 1. find the list of unique titles
titanic.Name.unique()

array(['Braund, Mr. Owen Harris',
       'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
       'Heikkinen, Miss. Laina',
       'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
       'Allen, Mr. William Henry', 'Moran, Mr. James',
       'McCarthy, Mr. Timothy J', 'Palsson, Master. Gosta Leonard',
       'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
       'Nasser, Mrs. Nicholas (Adele Achem)',
       'Sandstrom, Miss. Marguerite Rut', 'Bonnell, Miss. Elizabeth',
       'Saundercock, Mr. William Henry', 'Andersson, Mr. Anders Johan',
       'Vestrom, Miss. Hulda Amanda Adolfina',
       'Hewlett, Mrs. (Mary D Kingcome) ', 'Rice, Master. Eugene',
       'Williams, Mr. Charles Eugene',
       'Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)',
       'Masselmani, Mrs. Fatima', 'Fynney, Mr. Joseph J',
       'Beesley, Mr. Lawrence', 'McGowan, Miss. Anna "Annie"',
       'Sloper, Mr. William Thompson', 'Palsson, Miss. Torborg Danira',
       'Asplund, Mrs. Carl Oscar 

In [5]:
# Several funtions for different columns.

def extract_titles(name):
    """We will extract the titles of passengers from the name column"""
    title = name.apply(lambda x: x.split(',')[1].split('.')[0].lower().strip())
    return title

def tidy_title(x):
    """We will group similar titles with each other"""
    if x in ['mrs','mr','miss','master','dr','rev']:
        return x
    elif x in ['mlle','ms']:
        return "miss"
    elif x in ['mme']:
        return 'mrs'
    elif x in ['col','major','capt']:
        return 'army'
    elif x in ['don','lady','the countess','sir','the count','madam','lord']:
        return 'nobl'
    else :
        return 'unknown'

def tidy_titles(dirty_titles):
    return dirty_titles.apply(tidy_title)

def cabin_flag(cabin):
    """Known cabin IDs replaced with 1, missing ones replaced with 0"""
    cabin = cabin.fillna(0, inplace=False) 
    known_cabin = cabin.apply(lambda x: 1 if x!=0 else 0)
    return known_cabin

def fare_per_person(fare, SibSp, Parch):
    """The fare split among the number of passengers traveling together"""
    return fare/(SibSp + Parch +1)



In [6]:
def prepare(df):
    """This function is written to clean the whole dataset at ones"""
    df["Title"] = extract_titles(df["Name"])
    df["Title"] = tidy_titles(df["Title"])
    df['KnownCabin'] = cabin_flag(df["Cabin"])
    df["FarePP"]= fare_per_person(df["Fare"], df["SibSp"], df["Parch"])
    df["Embarked"]=df["Embarked"].fillna("S")
    return df

In [7]:
prepare(titanic)
titanic

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,KnownCabin,FarePP
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,mr,0,3.62500
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,mrs,1,35.64165
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,miss,0,7.92500
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,mrs,1,26.55000
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,mr,0,8.05000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,rev,0,13.00000
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,miss,1,30.00000
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,miss,0,5.86250
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,mr,1,30.00000


In [8]:
titanic.isna().sum()

Survived        0
Pclass          0
Name            0
Sex             0
Age           177
SibSp           0
Parch           0
Ticket          0
Fare            0
Cabin         687
Embarked        0
Title           0
KnownCabin      0
FarePP          0
dtype: int64

### Train Test Split for Logistic Regression

In [21]:
# Spliting the data 
train,test = train_test_split(titanic, test_size=0.2, random_state=101)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [22]:
train.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,KnownCabin,FarePP
0,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S,mr,0,13.0
1,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.55,E17,S,mr,1,26.55
2,1,3,"Sheerlinck, Mr. Jan Baptist",male,29.0,0,0,345779,9.5,,S,mr,0,9.5


In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    712 non-null    int64  
 1   Pclass      712 non-null    int64  
 2   Name        712 non-null    object 
 3   Sex         712 non-null    object 
 4   Age         577 non-null    float64
 5   SibSp       712 non-null    int64  
 6   Parch       712 non-null    int64  
 7   Ticket      712 non-null    object 
 8   Fare        712 non-null    float64
 9   Cabin       157 non-null    object 
 10  Embarked    712 non-null    object 
 11  Title       712 non-null    object 
 12  KnownCabin  712 non-null    int64  
 13  FarePP      712 non-null    float64
dtypes: float64(3), int64(5), object(6)
memory usage: 78.0+ KB


### Feature and Target Seperation

In [24]:
# Feature seperation  
num_features = ["Age","FarePP", "SibSp", "Parch"]
cat_features = ["Pclass", "Sex", "KnownCabin", "Title", "Embarked"]
features = num_features + cat_features

# Target Separation
target = 'Survived'

# Feature and target columns
X_train,y_train = train[features], train[target]

In [25]:
X_train.head()

Unnamed: 0,Age,FarePP,SibSp,Parch,Pclass,Sex,KnownCabin,Title,Embarked
0,23.0,13.0,0,0,2,male,0,mr,S
1,51.0,26.55,0,0,1,male,1,mr,S
2,29.0,9.5,0,0,3,male,0,mr,S
3,40.0,44.833333,1,1,1,female,1,mrs,C
4,6.0,16.5,0,1,2,female,0,miss,S


In [26]:
y_train

0      0
1      1
2      1
3      1
4      1
      ..
707    0
708    1
709    1
710    1
711    0
Name: Survived, Length: 712, dtype: int64

### Model Structure

In [27]:
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaling', StandardScaler()),
    ])

In [28]:
# column transformation
cat_transformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(drop='first'))
    ])

In [29]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, num_features),
        ('cat_transformer', cat_transformer, cat_features)
    ])

In [30]:
classifier_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression())
    ])

### Fitting the Model

In [35]:
classifier_model.fit(X_train,y_train)

In [36]:
classifier_model.named_steps['classifier'].coef_

array([[-0.30943916,  0.2157057 , -0.47844663, -0.22157921, -0.03488093,
        -1.11899039, -1.35031693,  0.99079369, -0.31259624,  1.83144852,
         0.21089798, -1.211102  ,  0.80755519,  0.013303  , -1.02563751,
        -0.31906629, -0.12023387, -0.39789838]])

In [43]:
X_train.head()

Unnamed: 0,Age,FarePP,SibSp,Parch,Pclass,Sex,KnownCabin,Title,Embarked
0,23.0,13.0,0,0,2,male,0,mr,S
1,51.0,26.55,0,0,1,male,1,mr,S
2,29.0,9.5,0,0,3,male,0,mr,S
3,40.0,44.833333,1,1,1,female,1,mrs,C
4,6.0,16.5,0,1,2,female,0,miss,S


In [38]:
training_acccuracy = classifier_model.score(X_train,y_train)
print(f"training accuracy: {round(training_acccuracy, 6)}")

training accuracy: 0.838483


### Prediction for the Test Data

In [39]:
X_test, y_test = test[features], test[target]

In [46]:
test_acccuracy = classifier_model.score(X_test, y_test)
print(f"test accuracy: {round(test_acccuracy, 6)}")

test accuracy: 0.832402


In [50]:
y_pred = classifier_model.predict(X_test)

In [51]:
y_pred

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0])

## KAGGLE PREDICTIONS

### Model based on *complete* training data

In [58]:
X_titanic, y_titanic = titanic[features], titanic[target]

In [59]:
classifier_model_complete = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression())
    ])

In [60]:
classifier_model_complete.fit(X_titanic, y_titanic)

In [61]:
full_training_acccuracy = classifier_model_complete.score(X_titanic, y_titanic)
print(f"full_training accuracy: {round(full_training_acccuracy, 6)}")

full_training accuracy: 0.837262


### Load kaggle test data

In [62]:
kaggle = pd.read_csv("./data/test.csv") 
kaggle

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


### Cleaning kaggle data

In [63]:
prepare(kaggle)
kaggle

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,KnownCabin,FarePP
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,mr,0,7.829200
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,mrs,0,3.500000
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,mr,0,9.687500
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,mr,0,8.662500
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,mrs,0,4.095833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,mr,0,8.050000
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,unknown,1,108.900000
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,mr,0,7.250000
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,mr,0,8.050000


### Predictions for kaggle test data

In [64]:
X_kaggle = kaggle[features]

In [65]:
y_pred_kaggle = classifier_model_complete.predict(X_kaggle)
y_pred_kaggle

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

### The Predictions for Kaggle Competition

In [66]:
y_pred_kaggle_df = pd.DataFrame(y_pred_kaggle, columns=["Survived"])
y_pred_kaggle_df

Unnamed: 0,Survived
0,0
1,1
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0


In [67]:
Pass_id_col = kaggle[["PassengerId"]]
Pass_id_col

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
...,...
413,1305
414,1306
415,1307
416,1308


In [69]:
y_pred_kaggle_df = pd.merge(Pass_id_col, y_pred_kaggle_df, left_index=True, right_index=True)
y_pred_kaggle_df


Unnamed: 0,PassengerId_x,PassengerId_y,Survived
0,892,892,0
1,893,893,1
2,894,894,0
3,895,895,0
4,896,896,1
...,...,...,...
413,1305,1305,0
414,1306,1306,1
415,1307,1307,0
416,1308,1308,0


In [70]:
y_pred_kaggle_df.to_csv("kaggle_titanic_predictions.csv", index = False)

***** The result of the Kaggle competition was 0.74.