In [1496]:
import pandas as pd

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

features = ["Pclass", "Sex", "Age", "Embarked"] # not adding Fare since it's strongly corelated with Pclass.
X_train = pd.get_dummies(df_train[features], drop_first=True, columns=["Sex", "Embarked", "Pclass"]) # "Pclass" is categorical feature, instead of numeric
y_train = df_train['Survived']

X_test = pd.get_dummies(df_test[features], drop_first=True, columns=["Sex", "Embarked", "Pclass"])

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

X_train.head()

Unnamed: 0,Age,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
0,22.0,1,0,1,0,1
1,38.0,0,0,0,0,0
2,26.0,0,0,1,0,1
3,35.0,0,0,1,0,0
4,35.0,1,0,1,0,1


In [1497]:
# Doing some feature engineering here with (1) Name, (2) Cabin, and (3) Ticket


def add_feature_with_title_in_name(X, df, titles=["Miss.", "Mrs"]):
    for title in titles:
        X[title.lower().replace(".", "")] = df["Name"].str.contains(title, regex=False).astype(int)

        
# assumption: having cabin info may indicate higher survival rate, since only survivor could report their cabin info.
# check with: df_train.groupby(["cabin_info", "Survived"]).count()
def add_feature_with_cabin_info(X, df):
    X["cabin_abc"] = df["Cabin"].astype(str).str.contains("[ABC]").astype(int)
    X["cabin_de"] = df["Cabin"].astype(str).str.contains("[DE]").astype(int)
    X["cabin_fg"] = df["Cabin"].astype(str).str.contains("[FG]").astype(int)
    
    
# assumption: some alphatic Tickets would relate to surviing rate.
def add_feature_with_ticket(X, df, ticket_prefixes=["C.A. ", "PC", "STON/O.Q", "A/5"]):
    def parse_ticket(s):
        if s.isdigit():
            return ""
        else:
            for prefix in ticket_prefixes:
                if prefix in s:
                    return s.lower().split()[0].replace(".", "")
            return ""# no "other" to prevent overfitting
    
    X["ticket_prefix"] = df["Ticket"].apply(parse_ticket)
    X = pd.get_dummies(X, drop_first=True, columns=['ticket_prefix'])
    
    return X

# combine the the two features as family number size.
def add_feature_family_number(X, df):
    X["family_num"] = df["SibSp"] + df["Parch"]


add_feature_with_title_in_name(X_train, df_train)
add_feature_with_cabin_info(X_train, df_train)
add_feature_family_number(X_train, df_train)
X_train = add_feature_with_ticket(X_train, df_train)

add_feature_with_title_in_name(X_test, df_test)
add_feature_with_cabin_info(X_test, df_test)
add_feature_family_number(X_test, df_test)
X_test = add_feature_with_ticket(X_test, df_test)


X_train.head()


Unnamed: 0,Age,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,miss,mrs,cabin_abc,cabin_de,cabin_fg,family_num,ticket_prefix_a/5,ticket_prefix_ca,ticket_prefix_pc
0,22.0,1,0,1,0,1,0,0,0,0,0,1,1,0,0
1,38.0,0,0,0,0,0,0,1,1,0,0,1,0,0,1
2,26.0,0,0,1,0,1,1,0,0,0,0,0,0,0,0
3,35.0,0,0,1,0,0,0,1,1,0,0,1,0,0,0
4,35.0,1,0,1,0,1,0,0,0,0,0,0,0,0,0


In [1518]:
# Fill tne na in Age with KNN
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
X_train[:] = imputer.fit_transform(X_train) # https://stackoverflow.com/questions/56764044/scikit-problem-returning-dataframe-from-imputer-instead-of-numpy-array
X_test[:] = imputer.transform(X_test)       # Both fit_transform() and transform() return np.array instead of DataFrame, so we do a "X[:]" trick

print(X_train.isnull().sum())
X_train.head()

Age                  0
Sex_male             0
Embarked_Q           0
Embarked_S           0
Pclass_2             0
Pclass_3             0
miss                 0
mrs                  0
cabin_abc            0
cabin_de             0
cabin_fg             0
family_num           0
ticket_prefix_a/5    0
ticket_prefix_ca     0
ticket_prefix_pc     0
dtype: int64


Unnamed: 0,Age,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,miss,mrs,cabin_abc,cabin_de,cabin_fg,family_num,ticket_prefix_a/5,ticket_prefix_ca,ticket_prefix_pc
0,22.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,38.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,26.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,35.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [1537]:
# Find the most frequent n_estimators number as best and the corresponding model with multipe random_state

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import accuracy_score


# Doing RandomForest with grid search, 5 is the best value for max_depth based on the current features, try enlarge it if more features are added.
param_grid_dict = {"max_depth": [5], "min_samples_leaf":np.linspace(0.02, 0.03, 10)}
   
model = RandomForestClassifier(n_estimators=30, random_state=16)
grid = GridSearchCV(estimator=model, param_grid=param_grid_dict, cv=8, n_jobs=8)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)


# Output the prediction to csv file
y_submit = pd.DataFrame(columns=['PassengerId', 'Survived'])
y_submit['PassengerId'] = df_test['PassengerId']
y_submit['Survived'] = y_pred
y_submit.to_csv('submissions.csv', header=True, index=False)
y_submit.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
