In [1]:
# Import helpful libraries
from copy import deepcopy
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, \
    precision_score, recall_score, f1_score, confusion_matrix, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, validation_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler

from scipy import stats
from yellowbrick.model_selection import LearningCurve, ValidationCurve, cv_scores
from yellowbrick.classifier import ROCAUC


# Utilities for this project
from transformers import ColumnsRemover, StringColumnsRemover, NullThresholdColumnsRemover
from loggers import ResultsLogger
logger = ResultsLogger()

# Silence all warnings so they do not clutter the cell outputs
import warnings
warnings.filterwarnings(action='ignore')

# Pandas display preferences: cap the rows/columns shown and print floats
# with thousands separators and two decimals
pd.options.display.max_rows = 40
pd.options.display.max_columns = 50
pd.set_option('display.float_format', '{:,.2f}'.format)

# Load both splits; the first CSV column is used as the index
train_data = pd.read_csv('./data/train.csv', index_col=0)
test_data = pd.read_csv('./data/test.csv', index_col=0)

# Quick sanity check on the size of each split
print(train_data.shape, test_data.shape)

(891, 11) (418, 10)


In [2]:
# Preview the first rows of the training set to see what each column holds
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Column dtypes and non-null counts; a count below the number of entries means missing values
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
train_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.38,2.31,29.7,0.52,0.38,32.2
std,0.49,0.84,14.53,1.1,0.81,49.69
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.12,0.0,0.0,7.91
50%,0.0,3.0,28.0,0.0,0.0,14.45
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.33


In [5]:
# NOTE: this is an alias, not a copy — any change made to df also changes train_data
df = train_data
y_column = 'Survived'

# Single source of truth for the columns left out of both feature lists
ignore_cols = ['Name', 'Cabin', 'Embarked']

# Text (object dtype) columns, minus the ignored ones; original column order is kept
string_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col not in ignore_cols
]

# Numeric columns, minus the target
numeric_cols = [
    col for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col != y_column
]


print(f'{string_cols = }')
print(f'{numeric_cols = }')
print(f'{ignore_cols = }')

string_cols = ['Sex', 'Ticket']
numeric_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
ignore_cols = ['Name', 'Cabin', 'Embarked']


In [6]:
# Class balance of the target column: number of rows per distinct value
df[y_column].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [None]:
# Collect every distinct ticket string and list them in sorted order
# so the different ticket formats can be inspected by eye
tickets = df['Ticket'].unique()
sorted(tickets)

In [29]:
def get_ticket_prefix(ticket):
    """Return the part of ``ticket`` before its first space.

    Args:
        ticket: Ticket string.

    Returns:
        The text preceding the first space, or ``np.nan`` when the ticket
        contains no space at all (i.e. it has no prefix).
    """
    head, separator, _ = ticket.partition(' ')
    # An empty separator means no space was found in the ticket
    return head if separator else np.nan

def remove_ticket_prefix(ticket):
    """Return the ticket number with any leading prefix stripped.

    The number is taken to be the text after the LAST space, so tickets whose
    prefix itself contains spaces (e.g. ``'STON/O 2. 3101282'``) yield the
    trailing number rather than a middle fragment of the prefix.

    Args:
        ticket: Ticket string.

    Returns:
        The text after the last space, or ``ticket`` unchanged when it
        contains no space.
    """
    # rsplit with maxsplit=1 returns [ticket] when there is no space,
    # so the last element is correct in both cases
    return ticket.rsplit(' ', 1)[-1]

# Order matters: the prefix must be extracted BEFORE 'Ticket' is overwritten below.
# NOTE: this cell is not idempotent — once 'Ticket' holds the stripped values (no spaces),
# re-running it makes get_ticket_prefix return NaN for every row. Reload the data first.
df['TicketPrefix'] = df['Ticket'].map(get_ticket_prefix)
df['Ticket'] = df['Ticket'].map(remove_ticket_prefix)

df['Ticket']


PassengerId
1        [21171]
2        [17599]
3      [3101282]
4         113803
5         373450
         ...    
887       211536
888       112053
889       [6607]
890       111369
891       370376
Name: Ticket, Length: 891, dtype: object

In [25]:
# Scratch check: slicing from index 1 drops the first element
a = list(range(1, 5))
a[1:]

[2, 3, 4]