In [1]:
# ----------------------------------------------------------------------------
# Author        :    Vasileios Perifanis
# Affiliation   :    Euclid team, Democritus University of Thrace, Dept. of Electrical & Computer Engineering
# Created Date  :    03/2022
# version       :    1.0
# ---------------------------------------------------------------------------

In [2]:
import numpy as np # linear algebra
import pandas as pd # data analysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import preprocessing

from sklearn.model_selection import StratifiedKFold

import statistics

import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("datasets/titanic_train.csv", header=0) # read csv

data['Initial']=0
for i in data:
    data['Initial']=data.Name.str.extract('([A-Za-z]+)\.') #extract the Salutations

# Assigning the NaN Values the mean ages
data.loc[(data.Age.isnull())&(data.Initial=='Mr'),'Age']=33
data.loc[(data.Age.isnull())&(data.Initial=='Mrs'),'Age']=36
data.loc[(data.Age.isnull())&(data.Initial=='Master'),'Age']=5
data.loc[(data.Age.isnull())&(data.Initial=='Miss'),'Age']=22
data.loc[(data.Age.isnull())&(data.Initial=='Other'),'Age']=46

data['Embarked'].fillna('S',inplace=True)

data['Age_band']=0
data.loc[data['Age']<=16,'Age_band']=0
data.loc[(data['Age']>16)&(data['Age']<=32),'Age_band']=1
data.loc[(data['Age']>32)&(data['Age']<=48),'Age_band']=2
data.loc[(data['Age']>48)&(data['Age']<=64),'Age_band']=3
data.loc[data['Age']>64,'Age_band']=4

data['Family_Size']=0
data['Family_Size']=data['Parch']+data['SibSp']#family size
data['Alone']=0
data.loc[data.Family_Size==0,'Alone']=1#Alone

data['Fare_Range']=pd.qcut(data['Fare'],5)
data['Fare_cat']=0
data.loc[data['Fare']<=7.91,'Fare_cat']=0
data.loc[(data['Fare']>7.91)&(data['Fare']<=14.454),'Fare_cat']=1
data.loc[(data['Fare']>14.454)&(data['Fare']<=31),'Fare_cat']=2
data.loc[(data['Fare']>31)&(data['Fare']<=513),'Fare_cat']=3

data['Sex'].replace(['male','female'],[0,1],inplace=True)
data['Embarked'].replace(['S','C','Q'],[0,1,2],inplace=True)
data['Initial'].replace(['Mr','Mrs','Miss','Master','Other'],[0,1,2,3,4],inplace=True)

data.drop(['Name','Age','Ticket','Fare','Cabin','Initial','Fare_Range','PassengerId'],axis=1,inplace=True)

data.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Age_band,Family_Size,Alone,Fare_cat
0,0,3,0,1,0,0,1,1,0,0
1,1,1,1,1,0,1,2,1,0,3
2,1,3,1,0,0,0,1,0,1,1
3,1,1,1,1,0,0,2,1,0,3
4,0,3,0,0,0,0,2,0,1,1


In [4]:
def get_Xy_to_numpy(dataset):
    X = dataset[dataset.columns[1:]].to_numpy()
    y = dataset[dataset.columns[:1]].to_numpy().ravel()
    return X, y

In [5]:
def train_sk(train_X, train_Y, test_X, test_Y):
    # initialize classifier
    classifier = RandomForestClassifier()

    # train the classifier
    classifier.fit(train_X,train_Y)

    # make predictions on train/test set
    y_pred_train = classifier.predict(train_X)
    y_pred = classifier.predict(test_X)

    # calculate scores
    accuracy_train = accuracy_score(y_pred_train, train_Y)
    precision_train = precision_score(y_pred_train, train_Y)
    recall_train = recall_score(y_pred_train, train_Y)
    f1_train = f1_score(y_pred_train, train_Y)

    accuracy = accuracy_score(y_pred, test_Y)
    precision = precision_score(y_pred, test_Y)
    recall = recall_score(y_pred, test_Y)
    f1 = f1_score(y_pred, test_Y)

    print(f"[Training set] Accuracy: {accuracy_train:.4f}, Precision: {precision_train:.4f}, Recall: {recall_train:.4f}, F1: {f1_train:.4f}\n",
          f"[Testing set] Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    return accuracy, precision, recall, f1

In [6]:
def get_statistics(accuracy, precision, recall, f):
    print(f"Stdev:\n\tAcc: {statistics.stdev(accuracy):.4f}\n\tPre: {statistics.stdev(precision):.4f}\n\tRec: {statistics.stdev(recall):.4f}\n\tF1: {statistics.stdev(f):.4f}")
    print(f"Mean Scores:\n\tAcc: {statistics.mean(accuracy):.4f}\n\tPre: {statistics.mean(precision):.4f}\n\tRec: {statistics.mean(recall):.4f}\n\tF1: {statistics.mean(f):.4f}")
    return statistics.mean(f)

In [7]:
X, y = get_Xy_to_numpy(data)
min_max_scaler = preprocessing.MinMaxScaler().fit(X)
X = min_max_scaler.transform(X)

In [8]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [9]:
global_f1 = dict()
accuracy, precision, recall, f1 = dict(), dict(), dict(), dict()

In [10]:
for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    acc, prec, rec, f = train_sk(X_train, y_train, X_test, y_test)
    accuracy[i] = acc
    precision[i] = prec
    recall[i] = rec
    f1[i] = f
mean_f_score = get_statistics(list(accuracy.values()), list(precision.values()), list(recall.values()), list(f1.values()))
global_f1['baseline'] = mean_f_score

[Training set] Accuracy: 0.8864, Precision: 0.8208, Recall: 0.8750, F1: 0.8471
 [Testing set] Accuracy: 0.8333, Precision: 0.7429, Recall: 0.8125, F1: 0.7761
[Training set] Accuracy: 0.8890, Precision: 0.8279, Recall: 0.8763, F1: 0.8514
 [Testing set] Accuracy: 0.8427, Precision: 0.7059, Recall: 0.8571, F1: 0.7742
[Training set] Accuracy: 0.8915, Precision: 0.8084, Recall: 0.8989, F1: 0.8513
 [Testing set] Accuracy: 0.7528, Precision: 0.6471, Recall: 0.6875, F1: 0.6667
[Training set] Accuracy: 0.8903, Precision: 0.8149, Recall: 0.8901, F1: 0.8508
 [Testing set] Accuracy: 0.8539, Precision: 0.7941, Recall: 0.8182, F1: 0.8060
[Training set] Accuracy: 0.8878, Precision: 0.8149, Recall: 0.8838, F1: 0.8480
 [Testing set] Accuracy: 0.8090, Precision: 0.7941, Recall: 0.7297, F1: 0.7606
[Training set] Accuracy: 0.8890, Precision: 0.8019, Recall: 0.8982, F1: 0.8473
 [Testing set] Accuracy: 0.8202, Precision: 0.7647, Recall: 0.7647, F1: 0.7647
[Training set] Accuracy: 0.8953, Precision: 0.8442, 

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [12]:
# parameters we want to tune
parameters = {'n_estimators':[10, 20, 50, 80, 100],
              'criterion':['gini', 'entropy']}
# define the scoring metric
scorer = make_scorer(f1_score)
# define the classifier
model = RandomForestClassifier()

In [13]:
clf = GridSearchCV(model, parameters, scoring=scorer, cv=10)

In [14]:
clf.fit(X, y)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'n_estimators': [10, 20, 50, 80, 100]},
             scoring=make_scorer(f1_score))

In [15]:
clf.cv_results_

{'mean_fit_time': array([0.00802352, 0.01472573, 0.03640251, 0.05847037, 0.07314024,
        0.0076838 , 0.01507134, 0.03792696, 0.05964513, 0.07412183]),
 'std_fit_time': array([9.16721250e-04, 1.32175304e-04, 2.95929724e-04, 6.37609731e-04,
        7.56992927e-04, 5.09150454e-05, 1.69088460e-04, 7.27378081e-04,
        5.88511648e-04, 2.31921340e-04]),
 'mean_score_time': array([0.00106785, 0.00153849, 0.00303309, 0.00453663, 0.00559883,
        0.00103006, 0.00152731, 0.00306571, 0.00454609, 0.00555887]),
 'std_score_time': array([3.42887778e-05, 2.48077825e-05, 3.66895208e-05, 1.98443906e-05,
        1.14328448e-04, 6.58415256e-06, 6.96002597e-06, 6.44538734e-05,
        1.81921500e-05, 3.26282076e-05]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'gini', 'gini', 'entropy',
                    'entropy', 'entropy', 'entropy', 'entropy'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_val