In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as sm
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings("ignore")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the competition's train and test tables from the Kaggle input directory.
# NOTE(review): absolute Kaggle paths -- this cell only runs where that dataset is mounted.
train_df = pd.read_csv("/kaggle/input/mai-ml-decision-trees/train.csv")
test_df = pd.read_csv("/kaggle/input/mai-ml-decision-trees/test.csv")

def make_submission(ids, preds, output_path='submission.csv'):
    """Write a two-column submission CSV (``PassengerId``, ``Survived``).

    ``ids`` and ``preds`` are paired by position. Any pandas index they carry
    is ignored, so a prediction Series whose index differs from the id Series
    is still written row-for-row instead of being label-aligned into NaNs.

    Parameters
    ----------
    ids : array-like
        Passenger identifiers, one per row of the submission.
    preds : array-like
        Predicted labels, in the same order and of the same length as ``ids``.
    output_path : str, default 'submission.csv'
        Destination file; written without the DataFrame index.

    Raises
    ------
    ValueError
        If ``ids`` and ``preds`` do not have the same length.
    """
    # Strip any pandas index up front so the pairing is purely positional.
    id_values = np.atleast_1d(ids)
    pred_values = np.atleast_1d(preds)
    if len(id_values) != len(pred_values):
        raise ValueError(
            "ids and preds must have the same length, got "
            f"{len(id_values)} and {len(pred_values)}"
        )

    subm = pd.DataFrame({'PassengerId': id_values, 'Survived': pred_values})
    subm.to_csv(output_path, index=False)
# Titles seen fewer than this many times in the training data are pooled
# into a single "rare_title" category.
MIN_TITLE_COUNT = 10


def _extract_title(name):
    """Return the honorific contained in a passenger name.

    The title is taken as the text between the first comma and the following
    period, e.g. "Vander Planke, Mrs. Julius" -> "Mrs". Taking the second
    space-separated token instead breaks on multi-word surnames (it would
    return "Planke," here).

    NOTE(review): assumes names look like "Surname, Title. Given names" --
    confirm against the data. When that pattern is absent, the previous
    second-token heuristic is used; a name with no space yields "".
    """
    _, comma, after_comma = name.partition(",")
    title, period, _ = after_comma.partition(".")
    title = title.strip()
    if comma and period and title:
        return title

    tokens = name.split(" ")
    return tokens[1].replace(".", "") if len(tokens) > 1 else ""


train_df["Title"] = train_df["Name"].apply(_extract_title)
test_df["Title"] = test_df["Name"].apply(_extract_title)

# Title frequencies are measured on the training data only; the same list of
# frequent titles is then applied to both frames.
title_stat = train_df.groupby("Title").Name.agg(["count"])
title_stat["is_rare"] = title_stat["count"] < MIN_TITLE_COUNT
titles = title_stat[~title_stat.is_rare].index.values

# Keep frequent titles as they are, replace everything else with "rare_title".
train_df["Title"] = train_df["Title"].where(train_df["Title"].isin(titles), "rare_title")
test_df["Title"] = test_df["Title"].where(test_df["Title"].isin(titles), "rare_title")

train_df.head()

def _add_passenger_features(frame):
    """Add the Children, Mother, Family_size and Deck columns to ``frame`` in place.

    * Children    -- 1 when Age is below 18, else 0 (missing Age gives 0).
    * Mother      -- 1 for a female older than 18 with Parch > 0, else 0.
    * Family_size -- SibSp + Parch + 1 (the passenger counts as one).
    * Deck        -- first character of Cabin, or "no_deck" when Cabin is missing.
    """
    # Comparisons against a missing Age evaluate to False, so those rows get 0.
    frame["Children"] = (frame.Age < 18).astype("int64")

    is_mother = (frame.Sex == "female") & (frame.Age > 18) & (frame.Parch > 0)
    frame["Mother"] = is_mother.astype("int64")

    frame["Family_size"] = frame.SibSp + frame.Parch + 1


def _add_deck(frame):
    """Derive Deck from the first character of the stringified Cabin value."""
    # A missing cabin stringifies to "nan"; that is the marker for "no_deck".
    cabin_text = frame.Cabin.astype("str")
    frame["Deck"] = cabin_text.map(lambda cabin: "no_deck" if cabin == "nan" else cabin[0])


# Same derivations for both frames, in the same column order as before.
for _frame in (train_df, test_df):
    _add_passenger_features(_frame)

for _frame in (train_df, test_df):
    _add_deck(_frame)

# Imputation statistics are computed once, from the training columns as they
# are before any filling or casting, and then reused for both frames. The test
# fill values therefore no longer depend on the already-imputed,
# int16-truncated training column or on statement order.
# Series.median() skips missing values by default.
age_median = train_df["Age"].median()
fare_median = train_df["Fare"].median()

for _frame in (train_df, test_df):
    # Age is stored as a whole number; the int16 cast truncates any fraction.
    _frame["Age"] = _frame["Age"].fillna(age_median).astype("int16")
    _frame["Fare"] = _frame["Fare"].fillna(fare_median)
    # Missing embarkation ports become their own category.
    _frame["Embarked"] = _frame["Embarked"].fillna("no_info")

# Columns not used as model inputs in this version (SibSp and Parch are
# already summarised by Family_size, Cabin by Deck).
unnecessary_cols = ["Name", "Ticket", "SibSp", "Parch", "Cabin"]

# drop() returns new frames, so train_df / test_df stay untouched.
train_v1 = train_df.drop(columns=unnecessary_cols)
test_v1 = test_df.drop(columns=unnecessary_cols)

encoder = LabelEncoder()
encoding_cols = ["Sex", "Embarked", "Title", "Deck"]

# Encode on the concatenated frame so a category receives the same integer
# code in train and test.
full_df = pd.concat([train_v1, test_v1], axis=0)

for col in encoding_cols:
    # The encoder is refitted for every column; only the codes are kept.
    full_df[col] = encoder.fit_transform(full_df[col])

# Rows are split back by whether the target is present.
# NOTE(review): assumes test rows carry no Survived value -- confirm.
is_train = full_df["Survived"].notnull()

# Explicit copies / non-inplace drop: the original assigned to and dropped
# from boolean-filtered slices, which is chained assignment on a derived frame.
train_v1 = full_df[is_train].copy()
train_v1["Survived"] = train_v1["Survived"].astype("int8")

test_v1 = full_df[~is_train].drop(columns="Survived")

feature_cols = [col for col in train_v1.columns if col not in {"PassengerId", "Survived"}]
target_col = "Survived"

X = train_v1[feature_cols].values
y = train_v1[target_col]

In [2]:
from sklearn.model_selection import GridSearchCV

# Fixed seed so that re-running the notebook reproduces the same forest and
# therefore the same submission file.
RANDOM_STATE = 42

# NOTE: despite its name, `grid` is a plain RandomForestClassifier, not a
# GridSearchCV object; the name is kept in case other cells refer to it.
grid = RandomForestClassifier(
    n_estimators=7000,
    max_depth=9,
    min_samples_leaf=40,
    random_state=RANDOM_STATE,
    n_jobs=-1,  # build the 7000 trees on all available cores
)

# Candidate hyper-parameter grid for a GridSearchCV run; it is not used by
# the fit below.
tree_params = {
    'max_depth': list(range(1, 8)),
    'min_samples_leaf': list(range(1, 10))
}

grid.fit(X, y)

# Predict on the test features and write the submission file.
res = grid.predict(test_v1[feature_cols].values)
make_submission(test_v1["PassengerId"], res)