# Random forest classifier

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd

In [None]:
# Read 'titanic.csv' (path relative to the working directory), report the
# table's (rows, columns) shape, then preview the first rows.
titanic = pd.read_csv('titanic.csv')
n_rows_cols = titanic.shape
print(n_rows_cols)
titanic.head()

In [None]:
# Remove the 'Name' column from the table, then preview the result.
titanic = titanic.drop(columns=['Name'])
titanic.head()

In [None]:
# Encode the 'Sex' column numerically: 1 where the value is 'female', 0 otherwise.
is_F = titanic['Sex'].eq('female')  # boolean Series, one entry per row
titanic['Sex'] = is_F.astype(int)
titanic.head()



In [None]:
# Target: the 'Survived' column, held as a one-column DataFrame for display.
y = titanic['Survived'].to_frame()
y

In [None]:
# Features: every column of the table except the 'Survived' target.
X = titanic.drop('Survived', axis=1)
X

In [None]:
# Collapse the one-column target frame so sklearn does not warn about y
# being a column vector.
y = y.squeeze()

# Hold out 20% of the rows for testing; the rest is used for fitting.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit a small random forest (5 trees, each limited to depth 4) on the
# training split.
from sklearn import ensemble
rf = ensemble.RandomForestClassifier(n_estimators=5, max_depth=4)
rf.fit(X_train, y_train)


In [None]:
# Score of the fitted forest on the data it was trained on.
train_score = rf.score(X_train, y_train)
print("Train score", train_score)

In [None]:
# Score of the fitted forest on the held-out test split.
test_score = rf.score(X_test, y_test)
print("Test score", test_score)

In [None]:
from sklearn import tree

# Draw every tree of the fitted forest side by side, one subplot per tree.
# The number of subplots follows the forest itself, so it cannot drift from
# the n_estimators value used when `rf` was created.
n_trees = len(rf.estimators_)

# squeeze=False always returns a 2-D grid of axes (even for a single tree);
# taking row 0 gives the same 1-D array of axes as before for several trees.
# Two inches of width per tree reproduces the original 10x2 figure for 5 trees.
fig, axes = plt.subplots(nrows=1, ncols=n_trees, figsize=(2 * n_trees, 2),
                         dpi=900, squeeze=False)
axes = axes[0]

# Some scikit-learn releases only accept a plain list for feature_names and
# reject a pandas Index, so convert the column labels explicitly.
feature_names = list(X.columns)

for estimator, ax in zip(rf.estimators_, axes):
    tree.plot_tree(estimator,
                   feature_names=feature_names,
                   filled=True,
                   ax=ax)

# k-fold cross-validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation with shuffling. The seed is fixed because a
# shuffling splitter with random_state=None draws new folds on every split()
# call, so each cross_val_score call that reuses `kf` would otherwise score
# its model on different folds, making scores harder to compare.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Baseline: cross-validated score of a 5-tree forest with depth limit 4.
model = ensemble.RandomForestClassifier(max_depth=4, n_estimators=5)
scores = cross_val_score(model, X, y, cv=kf)
print(f"Cross-validation scores: {scores}")
print(f"Mean score: {scores.mean()}")

In [None]:
# Sweep max_depth over 1..29 with the forest size fixed at 5 trees, recording
# the mean cross-validated score for each depth, then plot score vs. depth.
depths = range(1, 30)
mean_scores = []
for depth in depths:
    model = ensemble.RandomForestClassifier(n_estimators=5, max_depth=depth)
    scores = cross_val_score(model, X, y, cv=kf)
    mean_scores.append(scores.mean())
sns.lineplot(x=depths, y=mean_scores)


In [None]:
# hyperparameter search for number of trees given max_depth=5
# (the depth is named once here so the comment and the code cannot disagree)
fixed_depth = 5
tree_counts = range(1, 50)

# Record the mean cross-validated score for each forest size, then plot
# score vs. number of trees.
mean_scores = []
for n_estimators in tree_counts:
    model = ensemble.RandomForestClassifier(max_depth=fixed_depth,
                                            n_estimators=n_estimators)
    scores = cross_val_score(model, X, y, cv=kf)
    mean_scores.append(scores.mean())
sns.lineplot(x=tree_counts, y=mean_scores)
