In [1]:
# so far so good ..we have been training our model either the 
# extra random forest or the random forests with some sets of 
# parameters that will choose ourselves not knowing whether the 
# parameters are perfect or not..
# Therefore we use grid search which allows us to specify some
# set of parameters and the classifier will automatically run 
# various configuration to figure the best combination of parameters
# to produce the best model depending on the particular metrics
# we want to be concerned with

In [2]:
# importing packages
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import ensemble
from utilities import visualize_classifier
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# loading our data
input_file = 'data_random_forests.txt'

# reading our data
data = np.loadtxt(input_file, delimiter=',')
X,Y = data[:, :-1], data[:, -1]

In [5]:
# separating the data into classes based on their labels
Class_0 = np.array(X[Y==0])
Class_1 = np.array(X[Y==1])
Class_2 = np.array(X[Y==2])

In [6]:
# splitting the data into training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25, random_state=5)

In [7]:
print(len(X_train), len(Y_train))
print(len(X_test), len(Y_test))
print(len(X), len(Y))

675 675
225 225
900 900


In [15]:
# now we specify the grid parameters that we want the classifier to test
# usually we keep one parameter constant and vary the other parameter
# and then do it vice versa for the other parameter
parameter_grid = [{'n_estimators':[100], 'max_depth':[2,4,7,12,16]},
                  {'max_depth':[4],'n_estimators':[25,50,100,250]}]

# as i said earlier we define now the metrics that we want the 
# classifier to use to find the best combination of parameters
metrics = ['precision_weighted', 'recall_weighted']

# now for each metrics that we have, we need to run the grid search
# where we also train the classifier for a particular combination of parameters
class_names = ['class-0', 'class-1', 'class-2']

for metric in metrics:
    print('### searching the optimal parameters for', metric)
    # creating model classifier object with parameters
    classifier = GridSearchCV(ensemble.ExtraTreesClassifier(random_state=0),
                             parameter_grid, cv=5, scoring=metric)
    # training the model classifier
    classifier.fit(X_train, Y_train)
    # printing the score for each parameter combination
    for avg_score in classifier.cv_results_['mean_test_score']:
        for params in classifier.cv_results_['params']:
            print(params, '-->',round(avg_score,3))
        break
    print("\nBest parameters:", classifier.best_params_)
    # printing the performance report 
    Y_test_predict = classifier.predict(X_test)
    print("\nPerformance report:\n")
    print(classification_report(Y_test, Y_test_predict,target_names=class_names))


### searching the optimal parameters for precision_weighted
{'max_depth': 2, 'n_estimators': 100} --> 0.85
{'max_depth': 4, 'n_estimators': 100} --> 0.85
{'max_depth': 7, 'n_estimators': 100} --> 0.85
{'max_depth': 12, 'n_estimators': 100} --> 0.85
{'max_depth': 16, 'n_estimators': 100} --> 0.85
{'max_depth': 4, 'n_estimators': 25} --> 0.85
{'max_depth': 4, 'n_estimators': 50} --> 0.85
{'max_depth': 4, 'n_estimators': 100} --> 0.85
{'max_depth': 4, 'n_estimators': 250} --> 0.85

Best parameters: {'max_depth': 2, 'n_estimators': 100}

Performance report:

              precision    recall  f1-score   support

     class-0       0.94      0.81      0.87        79
     class-1       0.81      0.86      0.83        70
     class-2       0.83      0.91      0.87        76

    accuracy                           0.86       225
   macro avg       0.86      0.86      0.86       225
weighted avg       0.86      0.86      0.86       225

### searching the optimal parameters for recall_weighted
{