In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.set_index('PassengerId', inplace=True)
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.set_index('PassengerId', inplace=True)
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
import re

def missing_age(df):
    df['Title'] = [re.search(r'(.*\,\s)(.*)(\..*)', name).group(2) for name in df.Name]
    mask = (df.Age.isna())
    df.loc[mask & (df.Title=='Master'), 'Age'] = 10
    df.loc[mask & (df.Title=='Dr'), 'Age'] = 50
    df.loc[mask & (df.Title=='Mr'), 'Age'] = 50
    df.loc[mask & (df.Title=='Mrs'), 'Age'] = 50
    df.loc[mask & (df.Title=='Miss'), 'Age'] = 10
    df.loc[mask & (df.Title=='Ms'), 'Age'] = 50
    return

missing_age(train_data)
missing_age(test_data)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]
X_test, X_train, y_test, y_train = train_test_split(train_data, y)

In [6]:
features = ["Pclass", "Sex", "SibSp", "Parch", "Age"]
X_train = pd.get_dummies(X_train[features])
X_test = pd.get_dummies(X_test[features])
test_data = pd.get_dummies(test_data[features])


In [7]:
model = RandomForestClassifier(n_estimators=100, max_depth=5)
model.fit(X_train, y_train)
predict = model.predict(X_test)
predict_train = model.predict(X_train)


#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
#output.to_csv('submission.csv', index=False)
#print("Your submission was successfully saved!")

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(predict,y_test))
print(classification_report(predict,y_test))
print('Train set accuracy: ' + str(accuracy_score(predict_train, y_train)))
print('Test set accuracy: ' + str(accuracy_score(predict, y_test)))


[[370  95]
 [ 36 167]]
              precision    recall  f1-score   support

           0       0.91      0.80      0.85       465
           1       0.64      0.82      0.72       203

    accuracy                           0.80       668
   macro avg       0.77      0.81      0.78       668
weighted avg       0.83      0.80      0.81       668

Train set accuracy: 0.8923766816143498
Test set accuracy: 0.8038922155688623


In [9]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier

loss = ['log_loss', 'exponential']
learning_rate = [0.01, 0.1, 0.5]
n_estimators = [10, 100, 500, 1000]
subsample = [0.1, 0.5, 0.8, 1]
criterion = ['friedman_mse', 'squared_error']
max_depth = [1, 3, 5]

random_grid = {'loss': loss,
               'learning_rate': learning_rate,
               'n_estimators': n_estimators,
               'subsample': subsample,
               'criterion': criterion,
               'max_depth': max_depth}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
model_search = GradientBoostingClassifier()
rf_random = RandomizedSearchCV(estimator = model_search,
                               param_distributions = random_grid, n_iter = 100, cv = 3,
                               verbose=2, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=5, n_estimators=1000, subsample=0.5; total time=   0.0s
[CV] END criterion=friedman_mse, learning_rate=0.5, loss=exponential, max_depth=5, n_estimators=1000, subsample=0.8; total time=   1.0s
[CV] END criterion=friedman_mse, learning_rate=0.01, loss=exponential, max_depth=5, n_estimators=500, subsample=0.1; total time=   0.6s
[CV] END criterion=squared_error, learning_rate=0.5, loss=exponential, max_depth=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END criterion=squared_error, learning_rate=0.5, loss=log_loss, max_depth=1, n_estimators=1000, subsample=0.1; total time=   0.0s
[CV] END criterion=squared_error, learning_rate=0.5, loss=log_loss, max_depth=1, n_estimators=1000, subsample=0.1; total time=   0.0s
[CV] END criterion=squared_error, learning_rate=0.5, loss=log_loss, max_depth=1, n_estimators=1000, subsample=0.1; total time=  

165 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
165 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/ensemble/_gb.py", line 525, in fit
    self._check_params()
  File "/opt/conda/lib/python3.7/site-packages/sklearn/ensemble/_gb.py", line 282, in _check_params
    raise ValueError("Loss '{0:s}' not supported. ".format(self.loss))
ValueError: Loss 'log_loss' not supported. 

 0.82054054 0.83843844        nan 0.83399399        nan        nan
        nan 0.76654

RandomizedSearchCV(cv=3, estimator=GradientBoostingClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['friedman_mse',
                                                      'squared_error'],
                                        'learning_rate': [0.01, 0.1, 0.5],
                                        'loss': ['log_loss', 'exponential'],
                                        'max_depth': [1, 3, 5],
                                        'n_estimators': [10, 100, 500, 1000],
                                        'subsample': [0.1, 0.5, 0.8, 1]},
                   verbose=2)

In [10]:
rf_random.best_params_

{'subsample': 0.1,
 'n_estimators': 500,
 'max_depth': 1,
 'loss': 'exponential',
 'learning_rate': 0.01,
 'criterion': 'squared_error'}

In [11]:
best_random = rf_random.best_estimator_
best_random.fit(X_train, y_train)
predict_random = best_random.predict(X_test)
random_accuracy = accuracy_score(predict_random, y_test)
random_accuracy

0.7829341317365269

In [12]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [10, 100, 500]
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [2, 5, 10, 50, 80, None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [13]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
model_search = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = model_search,
                               param_distributions = random_grid, n_iter = 100, cv = 3,
                               verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits

[CV] END criterion=squared_error, learning_rate=0.5, loss=exponential, max_depth=1, n_estimators=500, subsample=1; total time=   0.3s
[CV] END criterion=squared_error, learning_rate=0.5, loss=exponential, max_depth=1, n_estimators=500, subsample=1; total time=   0.3s
[CV] END criterion=squared_error, learning_rate=0.5, loss=exponential, max_depth=1, n_estimators=500, subsample=1; total time=   0.3s
[CV] END criterion=squared_error, learning_rate=0.01, loss=log_loss, max_depth=5, n_estimators=500, subsample=0.5; total time=   0.0s
[CV] END criterion=squared_error, learning_rate=0.01, loss=log_loss, max_depth=5, n_estimators=500, subsample=0.5; total time=   0.0s
[CV] END criterion=squared_error, learning_rate=0.01, loss=log_loss, max_depth=5, n_estimators=500, subsample=0.5; total time=   0.0s
[CV] END criterion=friedman_mse, learning_rate=0.01, loss=log_loss, max_depth=5, n_estimators=1000, subsample=0.8; total time=   0.0

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 5, 10, 50, 80, None],
                                        'max_features': ['log2', 'sqrt'],
                                        'min_samples_leaf': [2, 4, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [10, 100, 500]},
                   random_state=42, verbose=2)

In [14]:
rf_random.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'log2',
 'max_depth': 50,
 'bootstrap': False}

In [15]:
params = {'n_estimators': 10,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

base_model = RandomForestClassifier(**params)
base_model.fit(X_train, y_train)
predict_base = base_model.predict(X_test)
base_accuracy = accuracy_score(predict_base, y_test)
base_accuracy

0.8098802395209581

In [16]:
best_random = rf_random.best_estimator_
best_random.fit(X_train, y_train)
predict_random = best_random.predict(X_test)
random_accuracy = accuracy_score(predict_random, y_test)
random_accuracy

0.8038922155688623

In [17]:
predict = best_random.predict(test_data)
output = pd.DataFrame({'PassengerId': test_data.index, 'Survived': predict})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
