In [132]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import (
    cross_val_score,
    RandomizedSearchCV,
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from scipy.stats import randint

In [115]:
df = pd.read_csv("train.csv")

In [116]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [117]:
df["Pclass"].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [118]:
df["Survived"].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [119]:
df["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [120]:
df["Embarked"].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [122]:
cat_pipeline = Pipeline(
    [("impute", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder())]
)

In [123]:
num_pipeline = Pipeline(
    [
        ("impute", KNNImputer(n_neighbors=3)),
        ("standarize", StandardScaler()),
    ]
)

In [124]:
log_transformer = Pipeline(
    steps=[
        ("imputer", KNNImputer(n_neighbors=3)),
        (
            "log_transform",
            FunctionTransformer(
                func=np.log1p, inverse_func=np.expm1, feature_names_out="one-to-one"
            ),
        ),
        ("standardize", StandardScaler()),
    ]
)

In [125]:
pipeline = ColumnTransformer(
    [
        ("drop", "drop", ["Ticket", "Cabin", "Name"]),
        ("log", log_transformer, ["Fare"]),
        ("cat", cat_pipeline, ["Pclass", "Sex", "Embarked"]),
        ("num", num_pipeline, ["Age", "SibSp", "Parch"]),
    ],
    remainder="passthrough",
)

In [126]:
final = pipeline.fit_transform(df)

In [127]:
final = pd.DataFrame(
    data=final, columns=pipeline.get_feature_names_out(), index=df.index
)

In [128]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   log__Fare               891 non-null    float64
 1   cat__Pclass_1           891 non-null    float64
 2   cat__Pclass_2           891 non-null    float64
 3   cat__Pclass_3           891 non-null    float64
 4   cat__Sex_female         891 non-null    float64
 5   cat__Sex_male           891 non-null    float64
 6   cat__Embarked_C         891 non-null    float64
 7   cat__Embarked_Q         891 non-null    float64
 8   cat__Embarked_S         891 non-null    float64
 9   num__Age                891 non-null    float64
 10  num__SibSp              891 non-null    float64
 11  num__Parch              891 non-null    float64
 12  remainder__PassengerId  891 non-null    float64
 13  remainder__Survived     891 non-null    float64
dtypes: float64(14)
memory usage: 97.6 KB


In [129]:
train_data = df.drop("Survived", axis=1)
train_labels = df["Survived"].copy()

In [133]:
scoring = make_scorer(accuracy_score)

In [139]:
tree_pipeline = make_pipeline(pipeline, DecisionTreeClassifier())
tree_result = cross_val_score(tree_pipeline, train_data, train_labels, scoring=scoring)

In [144]:
tree_result

array([0.59776536, 0.81460674, 0.73595506, 0.75280899, 0.82022472])

In [141]:
rf_pipeline = make_pipeline(pipeline, RandomForestClassifier())
rf_result = cross_val_score(rf_pipeline, train_data, train_labels, scoring=scoring)

In [143]:
rf_result

array([0.73184358, 0.79775281, 0.83707865, 0.8258427 , 0.81460674])

In [149]:
mlp_pipeline = make_pipeline(pipeline, MLPClassifier((20, 50), max_iter=100000))
mlp_result = cross_val_score(
    mlp_pipeline, train_data, train_labels, scoring=scoring, n_jobs=8
)

In [150]:
mlp_result

array([0.69273743, 0.78651685, 0.78651685, 0.76966292, 0.73033708])

In [153]:
knn_pipeline = make_pipeline(pipeline, KNeighborsClassifier(n_neighbors=3))
knn_result = cross_val_score(
    knn_pipeline, train_data, train_labels, scoring=scoring, n_jobs=8
)

In [154]:
knn_result

array([0.61452514, 0.38764045, 0.46067416, 0.42696629, 0.61235955])

In [161]:
cat_pipeline = Pipeline(
    [("impute", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder())]
)

num_pipeline = Pipeline(
    [
        ("impute", KNNImputer()),
        ("standarize", StandardScaler()),
    ]
)

log_transformer = Pipeline(
    steps=[
        ("impute", KNNImputer()),
        (
            "log_transform",
            FunctionTransformer(
                func=np.log1p, inverse_func=np.expm1, feature_names_out="one-to-one"
            ),
        ),
        ("standardize", StandardScaler()),
    ]
)

preprocessing = ColumnTransformer(
    [
        ("drop", "drop", ["Ticket", "Cabin", "Name"]),
        ("log", log_transformer, ["Fare"]),
        ("cat", cat_pipeline, ["Pclass", "Sex", "Embarked"]),
        ("num", num_pipeline, ["Age", "SibSp", "Parch"]),
    ],
    remainder="passthrough",
)

In [162]:
full_pipeline = Pipeline([
    ("preprocessing",preprocessing),
    ("random_forest", RandomForestClassifier())
])

In [163]:
param_distribs = {
    "preprocessing__log__impute__n_neighbors": randint(low=1, high=20),
    "preprocessing__num__impute__n_neighbors": randint(low=1, high=20),
    "random_forest__max_features": randint(low=2, high=20),
    "random_forest__n_estimators": randint(low=10, high=200),
}

In [164]:
rnd_search = RandomizedSearchCV(
    full_pipeline,
    param_distributions=param_distribs,
    n_iter=50,
    cv=10,
    scoring=scoring,
    n_jobs=8,
)

In [165]:
rnd_search.fit(train_data,train_labels)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [167]:
rnd_search.best_score_

0.830561797752809

In [166]:
rnd_search.best_params_

{'preprocessing__log__impute__n_neighbors': 14,
 'preprocessing__num__impute__n_neighbors': 17,
 'random_forest__max_features': 3,
 'random_forest__n_estimators': 182}

In [168]:
test_set = pd.read_csv("test.csv")

In [173]:
test_transformed = preprocessing.fit_transform(test_set)
test_pred = rnd_search.best_estimator_.predict(test_set)

In [175]:
test_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [188]:
answer = pd.DataFrame({"PassengerId": test_set["PassengerId"], "Survived": test_pred})

In [190]:
answer.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [191]:
answer.to_csv("answer.csv",index=False)