# Imports  

In [16]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import re 
from math import isnan
import wandb
import random
from sklearn.ensemble import RandomForestClassifier

# utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)

In [12]:
# dtype={'type': str} prevents being confused with data type for large data sets
train = pd.read_csv('data/train.csv', index_col='id', dtype={'type': str})
test = pd.read_csv('data/test.csv', index_col='id', dtype={'type': str})
train_translated = pd.read_csv('data/train_translated.csv', dtype={'type': str})
test_translated = pd.read_csv('data/test_translated.csv', index_col='id', dtype={'type': str})
combined_data = pd.read_csv('data/combined_data.csv', index_col='id', dtype={'type': str})
combined_data_translated = pd.read_csv('data/combined_data_translated.csv', index_col='id', dtype={'type': str})
combined_data_fully_translated = pd.read_csv('data/combined_data_fully_translated.csv', index_col='id', dtype={'type': str})
prep = pd.read_csv('data/prep.csv', index_col='id', dtype={'type': str})
test_prep = pd.read_csv('data/test_prepared.csv', index_col='id', dtype={'type': str})
train_prep = pd.read_csv('data/train_prepared.csv', index_col='id', dtype={'type': str})

In [14]:
data = train_prep.copy()
features = data.drop('type', axis=1)
labels = data.type
# at least xgboost cannot deal with string labels
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(labels)
labels = label_encoder.transform(labels)

X_train = features
y_train = labels

## Random Forest 

Parameters:

- n_estimators = number of trees in the foreset
- max_features = max number of features considered for splitting a node
- max_depth = max number of levels in each decision tree
- min_samples_split = min number of data points placed in a node before the node is split
- min_samples_leaf = min number of data points allowed in a leaf node
- bootstrap = method for sampling data points (with or without replacement)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

clf = RandomForestClassifier() #Initialize with whatever parameters you want to

# 10-Fold Cross validation
print (np.mean(cross_val_score(clf, X_train, y_train, cv=10))) 
#for a normal random forest average around 0.9031428571428572

# 5-Fold Cross validation
print (np.mean(cross_val_score(clf, X_train, y_train, cv=5))) 
#for a normal random forest average around 0.9008571428571429



0.9027857142857144
0.9008571428571429


## Grid search 

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [5, 10, 15, 20],
     'max_depth': [2, 5, 7, 9]
}

grid_clf = GridSearchCV(clf, param_grid, cv=10, scoring = 'accuracy')
grid_clf.fit(X_train, y_train)




In [21]:
#Best model
print(grid_clf.best_estimator_)

RandomForestClassifier(max_depth=9, n_estimators=20)


In [22]:
#Best parameters
print(grid_clf.best_params_)

{'max_depth': 9, 'n_estimators': 20}


In [24]:
#Grid scorces
print(grid_clf.cv_results_)

{'mean_fit_time': array([0.15180211, 0.1831285 , 0.21183178, 0.24119327, 0.18142056,
       0.24202259, 0.30372024, 0.36182663, 0.20425549, 0.28824861,
       0.36545801, 0.4506654 , 0.22790921, 0.33142521, 0.44677289,
       0.55276341]), 'std_fit_time': array([0.00850133, 0.00356891, 0.00612005, 0.00469836, 0.00541321,
       0.00569036, 0.00719503, 0.010366  , 0.00367265, 0.00636806,
       0.00696287, 0.00858385, 0.0071506 , 0.01810358, 0.02638251,
       0.02852474]), 'mean_score_time': array([0.01847818, 0.02072721, 0.02263041, 0.02522876, 0.01972516,
       0.02241356, 0.02501185, 0.02815332, 0.02036021, 0.02273295,
       0.02673388, 0.02927763, 0.0222477 , 0.0254185 , 0.02585604,
       0.02972991]), 'std_score_time': array([0.001784  , 0.00188431, 0.00077834, 0.00061566, 0.0007674 ,
       0.00065764, 0.00093918, 0.00095546, 0.0007885 , 0.00039895,
       0.00158197, 0.00193241, 0.00197696, 0.00485033, 0.00269201,
       0.00344858]), 'param_max_depth': masked_array(data=[2, 

In [30]:
#Grid scores
print(grid_clf.best_score_)
#0.7787142857142857 - so worse than without parameters 

0.7787142857142857


### Attempt with other parameters

In [31]:
param_grid = {
    'n_estimators': [20, 50, 100, 1000],
     'max_depth': [10, 100, 500, 100]
}

grid_clf = GridSearchCV(clf, param_grid, cv=10, scoring = 'accuracy')
grid_clf.fit(X_train, y_train)

print(grid_clf.best_estimator_)
print(grid_clf.best_score_)
#RandomForestClassifier(max_depth=100, n_estimators=1000)
#0.9042857142857142



RandomForestClassifier(max_depth=100, n_estimators=1000)
0.9042857142857142


In [None]:
param_grid = {
    'n_estimators': [1000, 2000],
     'max_depth': [100, 200, 500]
}

grid_clf = GridSearchCV(clf, param_grid, cv=10, scoring = 'accuracy')
grid_clf.fit(X_train, y_train)

print(grid_clf.best_estimator_)
print(grid_clf.best_score_)

