In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


In [2]:
train_data = pd.read_csv("./data/train.csv", index_col="PassengerId")
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [4]:
train_data['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [5]:
one_hot = Pipeline([
    ("one-hot", OneHotEncoder()),
])

min_max = Pipeline([
    ("minmax", StandardScaler()),
])

median_imputer = Pipeline([
    ("median", SimpleImputer(strategy="median")),
])

mf_imputer = Pipeline([
    ("most_frequent", SimpleImputer(strategy="most_frequent"))
])

In [6]:
def smart_median(data: pd.DataFrame): 
    ages = []
    for _, x in data.iterrows():
        if pd.isna(x['Age']):
            condition = (data['Sex'] == x['Sex']) & (data['Pclass'] == x['Pclass']) & (data['Parch'] == x['Parch']) & (data['SibSp'] == x['SibSp'])
            df = data[condition].dropna()
            ages.append(df['Age'].median())
        else:
            ages.append(x['Age'])
    return ages

In [7]:
def extract_tickets(data: pd.DataFrame):
    tickets = []
    for _, x in data.iterrows():
        numbers =  re.findall(r"\d+", x['Ticket'])
        if len(numbers) > 0:
            tickets.append(int(numbers[-1]))
        else:
            tickets.append(np.nan)
    return tickets

In [8]:
def preprocess(data: pd.DataFrame, fit=True):
  
    data = data.copy()

    cabin_letters = []
    cabin_numbers = []

    for _, x in data.iterrows():
        if not pd.isna(x["Cabin"]):
            if len(x["Cabin"]) > 1:
                cabin_letters.append(x["Cabin"][0])
                cabin_numbers.append(np.median(list(map(int, re.findall(r"\d+", x["Cabin"])))))
            else:
                cabin_letters.append(str(x["Cabin"]))
                cabin_numbers.append(0)
        else:
            cabin_letters.append(np.nan)
            cabin_numbers.append(np.nan)

    data.insert(len(data.columns), value=cabin_letters, column="CabinLetter")
    data.insert(len(data.columns), value=cabin_numbers, column="CabinNumber")

    data['Age'] = smart_median(data)
    data['Ticket'] = extract_tickets(data)

    drop_columns = ["Name", "Cabin"]
    data = data.drop(columns=drop_columns)

    min_max_columns = ["Age", "Fare"]
    median_imputer_columns = ["Age", "Fare", "CabinNumber", "Ticket"]
    mf_imputer_columns = ["Embarked", "SibSp", "Parch", "Sex", "CabinLetter"]
    one_hot_columns = ["Embarked", "Sex", "CabinLetter"]


    if fit:
        min_max.fit(data[min_max_columns])
        median_imputer.fit(data[median_imputer_columns])
        mf_imputer.fit(data[mf_imputer_columns])
    
    data[min_max_columns] = min_max.transform(data[min_max_columns])
    data[median_imputer_columns] = median_imputer.transform(data[median_imputer_columns])
    data[mf_imputer_columns] = mf_imputer.transform(data[mf_imputer_columns])
    
    if fit:
        one_hot.fit(data[one_hot_columns])
 

    df = pd.DataFrame(one_hot.transform(data[one_hot_columns]).toarray(), 
                        columns=one_hot.get_feature_names_out(), index=data.index)
    data.drop(columns=one_hot_columns, inplace=True)
    data = pd.merge(data, df, left_index=True, right_index=True)

    data.insert(len(data.columns), "FamilySize", data["SibSp"] + data["Parch"])

    return data


In [9]:
train = preprocess(train_data)

In [10]:
train['Age'].info()

<class 'pandas.core.series.Series'>
Int64Index: 891 entries, 1 to 891
Series name: Age
Non-Null Count  Dtype  
--------------  -----  
891 non-null    float64
dtypes: float64(1)
memory usage: 46.2 KB


In [11]:
X = train.loc[:, train.columns != "Survived"]
y = train["Survived"]

In [12]:
params = {
    "n_estimators": np.arange(10, 200, 25),
    "max_depth": np.arange(10, 30, 5),
    "min_samples_split": [2, 4, 8, 16, 32, 64],
    "min_samples_leaf": [1, 2, 4, 8]
}

CV = GridSearchCV(RandomForestClassifier(), param_grid=params, scoring="accuracy", cv=5, n_jobs=-1)
CV.fit(X, y)
CV.best_estimator_

In [13]:
CV.best_score_

0.8384031134266523

In [14]:
test_data = pd.read_csv("./data/test.csv", index_col="PassengerId")
test_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [15]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [16]:
test = preprocess(test_data, fit=False)
test.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Ticket,Fare,CabinNumber,Embarked_C,Embarked_Q,Embarked_S,...,Sex_male,CabinLetter_A,CabinLetter_B,CabinLetter_C,CabinLetter_D,CabinLetter_E,CabinLetter_F,CabinLetter_G,CabinLetter_T,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,3,0.324973,0,0,330911.0,-0.490783,40.5,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
893,3,1.23926,1,0,363272.0,-0.507479,40.5,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
894,2,2.336405,0,0,240276.0,-0.453367,40.5,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
895,3,-0.2236,0,0,315154.0,-0.474005,40.5,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
896,3,-0.589315,1,1,3101298.0,-0.401017,40.5,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2


In [17]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pclass         418 non-null    int64  
 1   Age            418 non-null    float64
 2   SibSp          418 non-null    object 
 3   Parch          418 non-null    object 
 4   Ticket         418 non-null    float64
 5   Fare           418 non-null    float64
 6   CabinNumber    418 non-null    float64
 7   Embarked_C     418 non-null    float64
 8   Embarked_Q     418 non-null    float64
 9   Embarked_S     418 non-null    float64
 10  Sex_female     418 non-null    float64
 11  Sex_male       418 non-null    float64
 12  CabinLetter_A  418 non-null    float64
 13  CabinLetter_B  418 non-null    float64
 14  CabinLetter_C  418 non-null    float64
 15  CabinLetter_D  418 non-null    float64
 16  CabinLetter_E  418 non-null    float64
 17  CabinLetter_F  418 non-null    float64
 18  CabinLe

In [18]:
y_pred = CV.best_estimator_.predict(test)

In [19]:
pd.DataFrame(
    {
        "Survived": y_pred
    }, index=pd.Index(test.index, name="PassengerId")
).to_csv("predicted.csv")


In [20]:
!kaggle competitions submit -c titanic -f predicted.csv -m "Std scaler"

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|██████████| 3.18k/3.18k [00:00<00:00, 8.69kB/s]
100%|██████████| 3.18k/3.18k [00:01<00:00, 1.96kB/s]
