In [25]:
%matplotlib inline
import pandas as pd
import numpy as np
import random
from sklearn import svm, tree, neighbors, neural_network
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

#### Load the Titanic dataset from Kaggle

In [26]:
# Read the Kaggle Titanic train and test CSV files into DataFrames.
train_data, test_data = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

train data

In [27]:
# Show the first five rows of the raw training frame.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


test data

In [28]:
# Show the first five rows of the raw test frame.
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [29]:
# Names of the training columns that contain at least one missing value.
has_missing = train_data.isna().any()
print(train_data.columns[has_missing].tolist())

['Age', 'Cabin', 'Embarked']


In [30]:
# Set seed for reproducibility
random.seed(100)
np.random.seed(100)


def _clean_passengers(frame, nan_map, columns_map):
    """Return a cleaned copy of a Titanic frame; the input frame is not modified.

    Drops the free-text Name/Ticket columns, moves PassengerId into the index,
    fills missing values from ``nan_map`` and applies the categorical encodings
    in ``columns_map``. Every step tolerates a frame that was already cleaned,
    so re-running the cell does not raise a KeyError.
    """
    # errors='ignore': the columns are already gone when the cell is re-run.
    cleaned = frame.drop(columns=['Name', 'Ticket'], errors='ignore')
    # PassengerId is only a regular column before the first run.
    if 'PassengerId' in cleaned.columns:
        cleaned = cleaned.set_index(keys=['PassengerId'], drop=True)
    return cleaned.fillna(value=nan_map).replace(columns_map)


# Imputation values are computed from the training data only and reused for the
# test data, so both frames are filled with the same values instead of the test
# set being filled with its own statistics.
train_nan_map = {'Age': train_data['Age'].mean(), 'Fare': train_data['Fare'].mean(), 'Embarked': train_data['Embarked'].mode()[0]}
test_nan_map = train_nan_map  # name kept in case another cell refers to it

columns_map = {'Embarked': {'C': 0, 'Q': 1, 'S': 2}, 'Sex': {'male': 0, 'female': 1}}

train_data = _clean_passengers(train_data, train_nan_map, columns_map)
test_data = _clean_passengers(test_data, test_nan_map, columns_map)

# Features are every column except the target.
X_train = train_data.loc[:, train_data.columns != 'Survived']
y_train = train_data.loc[:, 'Survived']

# NOTE(review): Cabin has not been encoded yet at this point, so X_train/X_test
# still carry the raw Cabin values (strings / NaN). Confirm that Cabin is
# encoded or dropped, and the split redone, before fitting a model.
# NOTE(review): the imputation values above are computed before this hold-out
# split, so the held-out rows contribute to them - confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=10)

In [31]:
# Preview the training features, the training labels and the cleaned test set,
# then compare the survival rate of the two splits.
for preview in (X_train, y_train, test_data):
    print(preview.head())
print(f"Test Mean: {y_test.mean()}")
print(f"Train Mean: {y_train.mean()}")

             Pclass  Sex        Age  SibSp  Parch    Fare Cabin  Embarked
PassengerId                                                              
464               2    0  48.000000      0      0  13.000   NaN         2
160               3    0  29.699118      8      2  69.550   NaN         2
48                3    1  29.699118      0      0   7.750   NaN         1
403               3    1  21.000000      1      0   9.825   NaN         2
619               2    1   4.000000      2      1  39.000    F4         2
PassengerId
464    0
160    0
48     1
403    0
619    1
Name: Survived, dtype: int64
             Pclass  Sex   Age  SibSp  Parch     Fare Cabin  Embarked
PassengerId                                                          
892               3    0  34.5      0      0   7.8292   NaN         1
893               3    1  47.0      1      0   7.0000   NaN         2
894               2    0  62.0      0      0   9.6875   NaN         1
895               3    0  27.0      0      0  

In [32]:
# Only the last expression of a cell is rendered, so the first two checks are
# passed to display() explicitly; as bare expressions their results were
# silently discarded. (display is provided by IPython in notebook kernels.)
display(train_data.columns[train_data.isna().any()].tolist())  # columns that still contain NaN
display(train_data[train_data['Embarked'].isna()])  # rows with a missing Embarked value
train_data.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,204,891.0
unique,,,,,,,,147,
top,,,,,,,,B96 B98,
freq,,,,,,,,4,
mean,0.383838,2.308642,0.352413,29.699118,0.523008,0.381594,32.204208,,1.536476
std,0.486592,0.836071,0.47799,13.002015,1.102743,0.806057,49.693429,,0.791503
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,,1.0
50%,0.0,3.0,0.0,29.699118,0.0,0.0,14.4542,,2.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,,2.0


Check if there are any missing values left; `Cabin` still has them, so it is filled and label-encoded below.

In [37]:
# Replace null values for cabin with 'U' for unknown.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which warns on recent pandas versions
# and does not update the frame when Copy-on-Write is enabled.
train_data['Cabin'] = train_data['Cabin'].fillna('U')
test_data['Cabin'] = test_data['Cabin'].fillna('U')

# Label encoding for all cabin values.
# Encode only while Cabin is still non-numeric; checking "not numeric" (rather
# than dtype == 'object') also covers pandas' dedicated string dtypes, and makes
# a re-run of this cell fall through to the else branch.
if not pd.api.types.is_numeric_dtype(train_data['Cabin']):
    # Keep only the first character of each cabin value.
    train_data['Cabin'] = train_data['Cabin'].str[0]
    test_data['Cabin'] = test_data['Cabin'].str[0]

    # Fit on the training letters and reuse the same mapping for the test set.
    le = LabelEncoder()
    le.fit(train_data['Cabin'])
    train_data['Cabin'] = le.transform(train_data['Cabin'])
    test_data['Cabin'] = le.transform(test_data['Cabin'])

else:
    # Cabin is already encoded: just show the current training frame.
    print(train_data)

             Survived  Pclass  Sex        Age  SibSp  Parch     Fare  Cabin  \
PassengerId                                                                   
1                   0       3    0  22.000000      1      0   7.2500      8   
2                   1       1    1  38.000000      1      0  71.2833      2   
3                   1       3    1  26.000000      0      0   7.9250      8   
4                   1       1    1  35.000000      1      0  53.1000      2   
5                   0       3    0  35.000000      0      0   8.0500      8   
...               ...     ...  ...        ...    ...    ...      ...    ...   
887                 0       2    0  27.000000      0      0  13.0000      8   
888                 1       1    1  19.000000      0      0  30.0000      1   
889                 0       3    1  29.699118      1      2  23.4500      8   
890                 1       1    0  26.000000      0      0  30.0000      2   
891                 0       3    0  32.000000      0

In [34]:

def save_predictions_to_csv(test_data, test_predictions, filename: str = "titanic_prediction_group_4_name.csv") -> None:
    """
    Save the predictions to a CSV file with the columns PassengerId and Survived.

    Args:
    - test_data: Object with an index (e.g. a DataFrame); the index values are
      written to the PassengerId column.
    - test_predictions: Predictions written to the Survived column, paired with
      the index values by position.
    - filename (str, optional): Name of the CSV file to save to. Defaults to "titanic_prediction_group_4_name.csv".

    Returns:
    None

    Raises:
    ValueError: If test_predictions is sized and its length differs from the
    number of index values in test_data.
    """
    passenger_ids = test_data.index.values

    # Fail early with a readable message when the two inputs cannot be paired.
    # Scalars and other unsized values are left for pandas to handle.
    try:
        n_predictions = None if isinstance(test_predictions, (str, bytes)) else len(test_predictions)
    except TypeError:
        n_predictions = None
    if n_predictions is not None and n_predictions != len(passenger_ids):
        raise ValueError(
            f"Got {n_predictions} predictions for {len(passenger_ids)} passengers; "
            "the two must have the same length."
        )

    passid_prediction = {'PassengerId': passenger_ids, 'Survived': test_predictions}
    df_predictions = pd.DataFrame(passid_prediction)
    # index=False: the passenger ids are already a regular column.
    df_predictions.to_csv(filename, index=False)