In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then echo the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training CSV and test CSV into two DataFrames
# (no splitting happens here; the files are read as provided).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e1/train.csv',
        '/kaggle/input/playground-series-s4e1/test.csv',
    )
)

In [None]:
# Separate the labels from the features:
#   y - the 'Exited' column
#   X - every other column of the training frame
y = train_data['Exited']
X = train_data.drop(columns='Exited')

In [None]:
# Build the feature matrix: remove the target plus 'id', 'CustomerId' and
# 'Surname', which are unlikely to correlate with y.
# X is rebuilt from train_data (instead of mutating the previous X) so this
# cell gives the same result every time it is re-run.
X = train_data.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])

# Encode the categorical columns as integers.
# The mapped Series is assigned back to the frame: the former
# `X[col].replace(..., inplace=True)` was a chained inplace call, which only
# modifies a temporary object under pandas Copy-on-Write and leaves X untouched.
# Note: any label missing from a mapping becomes NaN.
X['Geography'] = X['Geography'].map({'France': -1, 'Germany': 0, 'Spain': 1})
X['Gender'] = X['Gender'].map({'Male': 1, 'Female': -1})
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on y and
# uses a fixed random_state, so it is the same on every run.
Xt, Xv, yt, yv = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# A random forest classifier is the model used for prediction.
from sklearn.ensemble import RandomForestClassifier

# Baseline estimator: only class_weight is set explicitly.
RFC = RandomForestClassifier(class_weight='balanced')

# Show the estimator's parameters as a table with one parameter per row.
params = RFC.get_params()
params_df = pd.DataFrame(params, index=[0])
params_df.transpose()

In [None]:
# Hyperparameter optimisation with Optuna.
# This is a multi-objective study: ROC AUC and recall are returned as two
# separate objectives and both are maximised (they are not summed).
import optuna
from sklearn.model_selection import StratifiedKFold, cross_validate

SEED = 42  # shared by the CV splitter, the forests and the Optuna sampler


def rf_obj(trial):
    """Optuna objective: cross-validated ROC AUC and recall of a random forest.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample max_depth, n_estimators,
        min_samples_leaf and max_samples.

    Returns
    -------
    tuple of float
        (mean test ROC AUC, mean test recall) over a 3-fold stratified
        cross-validation on the training split (Xt, yt).
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 10),
        'max_samples': trial.suggest_int('max_samples', 25000, 75000),
        'class_weight': 'balanced',
        # Parallelism lives here only: the forest already uses every core, so
        # the folds and the trials run sequentially to avoid oversubscription.
        'n_jobs': -1,
        # Fixed seed so a given hyperparameter set always scores the same.
        'random_state': SEED,
    }
    scoring = ['roc_auc', 'recall']
    rf_c = RandomForestClassifier(**params)

    kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

    score = cross_validate(rf_c, Xt, yt, scoring=scoring, cv=kfold)

    return score['test_roc_auc'].mean(), score['test_recall'].mean()


# NSGA-II is the sampler Optuna picks by default for multi-objective studies;
# it is created explicitly here only so that it can be seeded.
study = optuna.create_study(
    directions=['maximize', 'maximize'],
    sampler=optuna.samplers.NSGAIISampler(seed=SEED),
)

# Trials run one at a time: parallel trials make the sampling order (and thus
# the results) non-deterministic even with a seeded sampler.
study.optimize(rf_obj, n_trials=25, n_jobs=1)

# One row per trial; multi_index=True yields hierarchical column labels.
df2 = study.trials_dataframe(multi_index=True)

In [None]:
# Display the table of Optuna trials built from study.trials_dataframe(multi_index=True).
df2

In [None]:
# Display the study's best trials. The study has two objectives, for which
# Optuna documents best_trials as the trials on the Pareto front.
study.best_trials

In [None]:
# Rebuild the model with one hyperparameter set taken from the Optuna study.
# NOTE(review): these values are hard-coded, presumably copied by hand from an
# earlier run - confirm they still match an entry of study.best_trials.
rfc_params = {'max_depth': 7, 'n_estimators': 969, 'min_samples_leaf': 2, 'max_samples': 46997}
RFC = RandomForestClassifier(
    **rfc_params,
    n_jobs=-1,
    class_weight='balanced',
    random_state=42,  # fixed seed: reproducible metrics and submission
)

# Show the estimator's full configuration as a table, one parameter per row.
params = RFC.get_params()
params_df = pd.DataFrame(params, index=[0])
params_df.T

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Train on the training split.
RFC.fit(Xt, yt)

# Score the validation split: predicted labels, and column 1 of predict_proba.
pred = RFC.predict(Xv)
prob = RFC.predict_proba(Xv)[:, 1]

# Report ROC AUC (computed from the probabilities), then the classification
# report and the confusion matrix (both computed from the predicted labels).
auc_roc = roc_auc_score(yv, prob)
print(f'AUC_ROC: {auc_roc}')
print(classification_report(yv, pred))
print(confusion_matrix(yv, pred))

In [None]:
# Second candidate model, built from another hard-coded hyperparameter set.
# NOTE(review): presumably another trial from the Optuna study - confirm.
rfc2_params = {'max_depth': 11, 'n_estimators': 801, 'min_samples_leaf': 4, 'max_samples': 66916}
RFC2 = RandomForestClassifier(
    **rfc2_params,
    n_jobs=-1,
    class_weight='balanced',
    random_state=42,  # fixed seed so the reported metrics are reproducible
)

# Show the estimator's full configuration as a table, one parameter per row.
params = RFC2.get_params()
params_df = pd.DataFrame(params, index=[0])
params_df.T

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Fit the second candidate on the training split and score the validation
# split: predicted labels, and column 1 of predict_proba.
RFC2.fit(Xt, yt)
pred = RFC2.predict(Xv)
prob = RFC2.predict_proba(Xv)[:, 1]

# Same three reports as for the first model: ROC AUC, classification report,
# confusion matrix.
validation_auc = roc_auc_score(yv, prob)
validation_report = classification_report(yv, pred)
print(f'AUC_ROC: {validation_auc}')
print(validation_report)
print(confusion_matrix(yv, pred))

In [None]:
# Prepare the test features: drop the identifier columns and encode the
# categorical columns with the integer codes used for the training features.
test_data2 = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

# Assign the mapped Series back to the frame: a chained
# `frame[col].replace(..., inplace=True)` only modifies a temporary object
# under pandas Copy-on-Write and would leave the strings in place.
# Note: any label missing from a mapping becomes NaN.
test_data2['Geography'] = test_data2['Geography'].map({'France': -1, 'Germany': 0, 'Spain': 1})
test_data2['Gender'] = test_data2['Gender'].map({'Male': 1, 'Female': -1})
test_data2.head()

In [None]:
# Rebuild the feature matrix and labels from the complete training set (no
# validation hold-out) for the final fit.
X = train_data.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])

# Assign the mapped Series back to the frame: a chained
# `X[col].replace(..., inplace=True)` only modifies a temporary object under
# pandas Copy-on-Write and would leave the strings in place.
# Note: any label missing from a mapping becomes NaN.
X['Geography'] = X['Geography'].map({'France': -1, 'Germany': 0, 'Spain': 1})
X['Gender'] = X['Gender'].map({'Male': 1, 'Female': -1})
y = train_data['Exited']
X.head()

In [None]:
# Refit RFC on the full training set and keep column 1 of predict_proba for
# every test row. fit() returns the estimator itself, so the two calls chain.
predictions = RFC.fit(X, y).predict_proba(test_data2)[:, 1]


In [None]:
# Write the submission file: one row per test id with its predicted
# probability. The prediction column is named after this dataset's target,
# 'Exited' (the label column taken from train_data), not 'Survived'.
output = pd.DataFrame({'id': test_data['id'], 'Exited': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
print("Your submission was successfully saved!")