In [24]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier #Random Forest 

from sklearn.preprocessing import OneHotEncoder 

from statistics import mean
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval #for hyperparameter tuning and minimizing objective function 


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [25]:
df = pd.read_csv('/kaggle/input/titanic/train.csv')
df = df.drop(['Name', 'Cabin', 'Ticket'], axis = 1)

In [26]:
sexEncoder = pd.get_dummies(df['Sex'])
df = df.drop('Sex', axis = 1)
df = df.join(sexEncoder)
embarkEncoder = pd.get_dummies(df['Embarked'])
df = df.drop('Embarked', axis = 1)
df = df.join(embarkEncoder)

In [27]:
imp = SimpleImputer(missing_values = np.nan, strategy = 'mean') #could also use mean, median, most freq
df = pd.DataFrame(imp.fit_transform(df), columns = df.columns)
x_train, x_test, y_train, y_test = train_test_split(df.drop('Survived', axis = 1), df['Survived'])
# dfTrainY = np.ravel(dfTrainY, order = 'C')
# dfTrainY.shape

In [28]:
space = {
    'n_estimators': hp.choice('n_estimators', range(50, 150)),
    'max_depth': hp.choice('max_depth', [1, 5, 10, 20, 50, 75, 100, 150, 200]),
    'min_samples_split': hp.choice('min_samples_split', [2, 3, 4, 5, 10, 20]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 3, 4, 5]),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'max_features': hp.choice('max_features', ['sqrt', 'auto', 'log2'])
}

kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)

def objective(params):
    clf = RandomForestClassifier(**params)
    scores = cross_val_score(clf, x_train, y_train, cv = kfold, scoring = 'accuracy', n_jobs = -1)
    best_score = mean(scores) 
#     print(params)
    loss = -best_score
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

num_trials = Trials()
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 1000, trials = num_trials)

100%|██████████| 1000/1000 [07:51<00:00,  2.12trial/s, best loss: -0.8233868252721356]


In [29]:
randomForest = RandomForestClassifier(n_estimators = space_eval(space, best)['n_estimators'], max_depth = space_eval(space, best)['max_depth'],
                                     min_samples_split = space_eval(space, best)['min_samples_split'], min_samples_leaf = space_eval(space, best)['min_samples_leaf'],
                                     bootstrap = space_eval(space, best)['bootstrap'], criterion = space_eval(space, best)['criterion'], max_features = space_eval(space, best)['max_features'])
randomForest.fit(x_train, y_train)
randomForest.score(x_test, y_test)

0.820627802690583

In [30]:
titanic_in = pd.read_csv('/kaggle/input/titanic/test.csv')
titanic_Passengers = titanic_in['PassengerId']
titanic_in = titanic_in.drop(['Name', 'Cabin', 'Ticket'], axis = 1)
sexEncoder = pd.get_dummies(titanic_in['Sex'])
titanic_in = titanic_in.drop('Sex', axis = 1)
titanic_in = titanic_in.join(sexEncoder)
embarkEncoder = pd.get_dummies(titanic_in['Embarked'])
titanic_in = titanic_in.drop('Embarked', axis = 1)
titanic_in = titanic_in.join(embarkEncoder)
titanic_imp = SimpleImputer(missing_values = np.nan, strategy = 'mean') #could also use mean, median, most freq
titanic_in = pd.DataFrame(imp.fit_transform(titanic_in), columns = titanic_in.columns)
titanic_out = randomForest.predict(titanic_in)
titanic_out = [int(x) for x in titanic_out]
sub = pd.DataFrame({'PassengerId': titanic_Passengers, 'Survived': titanic_out})
sub.to_csv("submission.csv", index=False)
sub


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
