#### Import necessary libraries

In [59]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

#### Load datasets

In [60]:
# Read the training and test splits from the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Keep the test passenger ids now: clean() drops 'PassengerId', and the
# submission file needs them
test_passId = test['PassengerId']

#### Data cleaning and some preprocessing

In [61]:
def clean(data):
    """Drop identifier/free-text columns and impute missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns 'PassengerId', 'Cabin', 'Name',
        'Ticket', 'Age', 'Fare', 'SibSp', 'Parch' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        A new frame without the four dropped columns. NaNs in the numeric
        columns are replaced by that column's median and NaNs in 'Embarked'
        by its most frequent value; both statistics are computed from
        ``data`` itself. The frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    drop = ['PassengerId', 'Cabin', 'Name', 'Ticket']
    # Drop unnecessary columns. A non-inplace drop returns a new frame, so the
    # caller's DataFrame is not mutated behind its back.
    data = data.drop(columns=drop)

    columns = ['Age', 'Fare', 'SibSp', 'Parch']
    # Fill all NaN values in numerical columns with the column median.
    for col in columns:
        # Skip all-NaN columns: their median is NaN, so there is nothing to fill with.
        if data[col].notna().any():
            # Assign the result back instead of `data[col].fillna(..., inplace=True)`:
            # the in-place call acts on a temporary column selection, which is
            # deprecated in pandas 2.x and has no effect under Copy-on-Write.
            data[col] = data[col].fillna(data[col].median())

    # Fill all NaN values in the categorical column with its mode.
    # The guard matters here: mode() of an all-NaN column is empty, so [0] would fail.
    if data['Embarked'].notna().any():
        data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

    return data

# Apply the same cleaning routine to both splits (train first, then test)
train, test = clean(train), clean(test)


In [62]:
# Integer-encode the categorical columns. For each column the encoder is
# fitted on the training split and then reused, unchanged, on the test split.
label_encoder = LabelEncoder()

for column in ('Sex', 'Embarked'):
    train[column] = label_encoder.fit_transform(train[column])
    test[column] = label_encoder.transform(test[column])

#### Feature engineering 

In [63]:
# Derive two features on both splits:
#   FamilySize = siblings/spouses + parents/children + the passenger
#   IsAlone    = 1 when FamilySize is exactly 1, otherwise 0
for frame in (train, test):
    frame["FamilySize"] = frame["SibSp"] + frame["Parch"] + 1
    frame["IsAlone"] = (frame["FamilySize"] == 1).astype(int)

#### Model training

In [64]:
# Feature matrix (everything except the label) and target vector
X, y = train.drop(columns=['Survived']), train['Survived']

# Hold out 20% of the labelled rows for validation; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [65]:
# Baseline: a Random Forest with default hyper-parameters, scored on the
# hold-out split. random_state is fixed so that the printed accuracy and the
# grid-search winner below are reproducible on Restart & Run All; without it
# both change from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(accuracy_score(y_test, y_pred))

# Exhaustive grid search over 3 * 4 * 3 * 3 * 2 = 216 parameter combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]}

# GridSearchCV clones `clf` for every fit, so each candidate inherits the seed.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,         # 5-fold cross-validation on the training split only
    n_jobs=-1,    # use all available cores
    verbose=2)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)


0.8268156424581006
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters: {'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


In [68]:
# Train the final model with the hyper-parameters reported by the grid search
# and produce the predictions for the submission file.
clf = RandomForestClassifier(
    bootstrap=True,
    max_depth=5,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=100,
    random_state=42,
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Previously the score was a bare expression in the middle of the cell, so it
# was computed and silently discarded. Print it so the hold-out accuracy of
# the tuned model is actually visible.
print(accuracy_score(y_test, y_pred))

# Select the test columns in the training order so the frame passed to
# predict() matches the features the model was fitted on; a missing column
# now fails here with a KeyError.
submission_predict = clf.predict(test[X_train.columns])


#### Submission

In [67]:
# Pair every test passenger id with its predicted label and write the
# two-column submission file (no index column)
submission_columns = {
    'PassengerId': test_passId.values,
    'Survived': submission_predict,
}
df = pd.DataFrame(submission_columns)
df.to_csv('Titanic.csv', index=False)