# **6. Random Forests**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

In [2]:
def rf_classification(X_train, y_train, X_test, y_test, mode):
    # Split the training data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42)

    # Define the parameter grid to search over
    param_grid = {
        'n_estimators': [100, 200],  # Number of trees in the forest
        'max_depth': [20, 30],  # Maximum depth of the tree
    }

    # Create a Random Forest classifier object
    rf_model = RandomForestClassifier(n_jobs=-1)

    # Create a GridSearchCV object
    if mode == "BINARY":
        grid_search = GridSearchCV(
            rf_model, param_grid, scoring='f1', cv=3, refit=True, verbose=3)
    elif mode == "MULTICLASS":
        grid_search = GridSearchCV(
            rf_model, param_grid, scoring='f1_micro', cv=3, refit=True, verbose=3)

    # Fit the GridSearchCV object to the data
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and score
    print("Best hyperparameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # Use the model to make predictions on the test data
    y_pred = grid_search.predict(X_test)

    # Get the precision and recall scores
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)

    # Get the F1 score
    f1 = f1_score(y_test, y_pred, average='weighted')  # Use 'weighted' if you have imbalanced classes
    print("F1 Score:", f1)

    # Get the classification report
    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)

def evaluation(y_test, y_pred, type, message):

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    if type == 'BINARY':
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
    
    elif type == 'MULTICLASS':
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"{message}:")
    print("Accuracy: {:.4f}".format(accuracy))
    print("Precision: {:.4f}".format(precision))
    print("Recall: {:.4f}".format(recall))
    print("F1 Score: {:.4f}".format(f1))
    print("Confusion Matrix: \n{}\n".format(conf_matrix))


In [3]:
wwcX_train = pd.read_csv('./train-test-split/white-win-loss/wwcX_train.csv')
wwcX_test = pd.read_csv('./train-test-split/white-win-loss/wwcX_test.csv')
wwcY_train = pd.read_csv('./train-test-split/white-win-loss/wwcY_train.csv')
wwcY_test = pd.read_csv('./train-test-split/white-win-loss/wwcY_test.csv')

wtcX_train = pd.read_csv('./train-test-split/white-termination/wtcX_train.csv')
wtcX_test = pd.read_csv('./train-test-split/white-termination/wtcX_test.csv')
wtcY_train = pd.read_csv('./train-test-split/white-termination/wtcY_train.csv')
wtcY_test = pd.read_csv('./train-test-split/white-termination/wtcY_test.csv')

bwcX_train = pd.read_csv('./train-test-split/black-win-loss/bwcX_train.csv')
bwcX_test = pd.read_csv('./train-test-split/black-win-loss/bwcX_test.csv')
bwcY_train = pd.read_csv('./train-test-split/black-win-loss/bwcY_train.csv')
bwcY_test = pd.read_csv('./train-test-split/black-win-loss/bwcY_test.csv')

btcX_train = pd.read_csv('./train-test-split/black-termination/btcX_train.csv')
btcX_test = pd.read_csv('./train-test-split/black-termination/btcX_test.csv')
btcY_train = pd.read_csv('./train-test-split/black-termination/btcY_train.csv')
btcY_test = pd.read_csv('./train-test-split/black-termination/btcY_test.csv')

## **White**

In [None]:
### White Win/Loss Classification
rf_wwc = RandomForestClassifier(n_estimators=100, random_state=42)
rf_wwc.fit(wwcX_train, wwcY_train)
wwcY_pred = rf_wwc.predict(wwcX_test)

print("White Win/Loss Classification Report:")
print(classification_report(wwcY_test, wwcY_pred))

### White Termination Classification
rf_wtc = RandomForestClassifier(n_estimators=100, random_state=42)
rf_wtc.fit(wtcX_train, wtcY_train)
wtcY_pred = rf_wtc.predict(wtcX_test)

print("White Termination Classification Report:")
print(classification_report(wtcY_test, wtcY_pred))

White Win/Loss Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81     79792
           1       0.81      0.82      0.81     82918

    accuracy                           0.81    162710
   macro avg       0.81      0.81      0.81    162710
weighted avg       0.81      0.81      0.81    162710

White Termination Classification Report:
              precision    recall  f1-score   support

           0       0.07      0.07      0.07     53123
           1       0.07      0.07      0.07     53396
           2       0.73      0.60      0.66     56191

    accuracy                           0.26    162710
   macro avg       0.29      0.25      0.27    162710
weighted avg       0.29      0.26      0.27    162710



In [None]:
### White Win/Loss Classification
evaluation(wwcY_test, wwcY_pred, "BINARY", "White Win/Loss Classification Report")

### White Termination Classification
evaluation(wtcY_test, wtcY_pred, "MULTICLASS", "White Termination Classification Report")

White Win/Loss Classification Report:
Accuracy: 0.8107
Precision: 0.8117
Recall: 0.8183
F1 Score: 0.8150
Confusion Matrix: 
[[64049 15743]
 [15065 67853]]

White Termination Classification Report:
Accuracy: 0.2555
Precision: 0.2949
Recall: 0.2555
F1 Score: 0.2730
Confusion Matrix: 
[[ 3925 42872  6326]
 [43286  3790  6320]
 [11158 11180 33853]]



## **Black**

In [None]:
### Black Win/Loss Classification
rf_bwc = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bwc.fit(bwcX_train, bwcY_train)
bwcY_pred = rf_bwc.predict(bwcX_test)

print("Win/Loss Classification Report:")
print(classification_report(bwcY_test, bwcY_pred))

### Black Termination Classification
rf_btc = RandomForestClassifier(n_estimators=100, random_state=42)
rf_btc.fit(btcX_train, btcY_train)
btcY_pred = rf_btc.predict(btcX_test)

print("Termination Classification Report:")
print(classification_report(btcY_test, btcY_pred))

Win/Loss Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.82      0.81     83059
           1       0.81      0.80      0.80     79651

    accuracy                           0.81    162710
   macro avg       0.81      0.81      0.81    162710
weighted avg       0.81      0.81      0.81    162710

Termination Classification Report:
              precision    recall  f1-score   support

           0       0.07      0.07      0.07     53123
           1       0.07      0.07      0.07     53396
           2       0.72      0.60      0.65     56191

    accuracy                           0.25    162710
   macro avg       0.29      0.25      0.27    162710
weighted avg       0.29      0.25      0.27    162710



In [None]:
### Black Win/Loss Classification
evaluation(bwcY_test, bwcY_pred, "BINARY", "Black Win/Loss Classification Report")

### Black Termination Classification
evaluation(btcY_test, btcY_pred, "MULTICLASS", "Black Termination Classification Report")

Black Win/Loss Classification Report:
Accuracy: 0.8085
Precision: 0.8079
Recall: 0.7987
F1 Score: 0.8033
Confusion Matrix: 
[[67933 15126]
 [16030 63621]]

Black Termination Classification Report:
Accuracy: 0.2544
Precision: 0.2940
Recall: 0.2544
F1 Score: 0.2721
Confusion Matrix: 
[[ 3919 42789  6415]
 [43093  3906  6397]
 [11249 11366 33576]]



# Hyperparameter Tuning for Random Forests

## **White**

### 1) White Win/Loss Classification

In [4]:
print("White Win-Loss Classification")
rf_classification(wwcX_train, wwcY_train['W'], wwcX_test, wwcY_test['W'], "BINARY")

White Win-Loss Classification
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END ....max_depth=20, n_estimators=100;, score=0.738 total time=  14.5s
[CV 2/3] END ....max_depth=20, n_estimators=100;, score=0.736 total time=  13.5s
[CV 3/3] END ....max_depth=20, n_estimators=100;, score=0.737 total time=  13.5s
[CV 1/3] END ....max_depth=20, n_estimators=200;, score=0.739 total time=  26.8s
[CV 2/3] END ....max_depth=20, n_estimators=200;, score=0.736 total time=  26.8s
[CV 3/3] END ....max_depth=20, n_estimators=200;, score=0.738 total time=  26.9s
[CV 1/3] END ....max_depth=30, n_estimators=100;, score=0.746 total time=  16.2s
[CV 2/3] END ....max_depth=30, n_estimators=100;, score=0.745 total time=  16.3s
[CV 3/3] END ....max_depth=30, n_estimators=100;, score=0.747 total time=  16.4s
[CV 1/3] END ....max_depth=30, n_estimators=200;, score=0.747 total time=  32.1s
[CV 2/3] END ....max_depth=30, n_estimators=200;, score=0.745 total time=  31.8s
[CV 3/3] END ....ma

### 2) White Termination Classification

In [5]:
print("White Termination Classification")
rf_classification(wtcX_train, wtcY_train['Termination'], wtcX_test, wtcY_test['Termination'], "MULTICLASS")

White Termination Classification
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END ....max_depth=20, n_estimators=100;, score=0.354 total time=  13.9s
[CV 2/3] END ....max_depth=20, n_estimators=100;, score=0.351 total time=  13.9s
[CV 3/3] END ....max_depth=20, n_estimators=100;, score=0.355 total time=  13.9s
[CV 1/3] END ....max_depth=20, n_estimators=200;, score=0.352 total time=  27.6s
[CV 2/3] END ....max_depth=20, n_estimators=200;, score=0.350 total time=  27.5s
[CV 3/3] END ....max_depth=20, n_estimators=200;, score=0.352 total time=  27.7s
[CV 1/3] END ....max_depth=30, n_estimators=100;, score=0.321 total time=  19.6s
[CV 2/3] END ....max_depth=30, n_estimators=100;, score=0.320 total time=  19.5s
[CV 3/3] END ....max_depth=30, n_estimators=100;, score=0.320 total time=  19.6s
[CV 1/3] END ....max_depth=30, n_estimators=200;, score=0.322 total time=  38.7s
[CV 2/3] END ....max_depth=30, n_estimators=200;, score=0.321 total time=  38.9s
[CV 3/3] END ...

## **Black**

### 1) Black Win/Loss Classification

In [6]:
print("Black Win-Loss Classification")
rf_classification(bwcX_train, bwcY_train['B'], bwcX_test, bwcY_test['B'], "BINARY")

Black Win-Loss Classification
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END ....max_depth=20, n_estimators=100;, score=0.718 total time=  13.0s
[CV 2/3] END ....max_depth=20, n_estimators=100;, score=0.719 total time=  13.2s
[CV 3/3] END ....max_depth=20, n_estimators=100;, score=0.717 total time=  13.2s
[CV 1/3] END ....max_depth=20, n_estimators=200;, score=0.719 total time=  25.9s
[CV 2/3] END ....max_depth=20, n_estimators=200;, score=0.719 total time=  25.8s
[CV 3/3] END ....max_depth=20, n_estimators=200;, score=0.719 total time=  25.9s
[CV 1/3] END ....max_depth=30, n_estimators=100;, score=0.728 total time=  15.9s
[CV 2/3] END ....max_depth=30, n_estimators=100;, score=0.730 total time=  16.0s
[CV 3/3] END ....max_depth=30, n_estimators=100;, score=0.732 total time=  16.0s
[CV 1/3] END ....max_depth=30, n_estimators=200;, score=0.727 total time=  31.3s
[CV 2/3] END ....max_depth=30, n_estimators=200;, score=0.732 total time=  31.5s
[CV 3/3] END ....ma

### 2) Black Termination Classification

In [7]:
print("Black Termination Classification")
rf_classification(btcX_train, btcY_train['Termination'], btcX_test, btcY_test['Termination'], "MULTICLASS")

Black Termination Classification
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END ....max_depth=20, n_estimators=100;, score=0.353 total time=  14.1s
[CV 2/3] END ....max_depth=20, n_estimators=100;, score=0.352 total time=  13.8s
[CV 3/3] END ....max_depth=20, n_estimators=100;, score=0.354 total time=  13.9s
[CV 1/3] END ....max_depth=20, n_estimators=200;, score=0.352 total time=  27.3s
[CV 2/3] END ....max_depth=20, n_estimators=200;, score=0.352 total time=  27.8s
[CV 3/3] END ....max_depth=20, n_estimators=200;, score=0.352 total time=  27.7s
[CV 1/3] END ....max_depth=30, n_estimators=100;, score=0.320 total time=  19.5s
[CV 2/3] END ....max_depth=30, n_estimators=100;, score=0.320 total time=  19.7s
[CV 3/3] END ....max_depth=30, n_estimators=100;, score=0.320 total time=  19.6s
[CV 1/3] END ....max_depth=30, n_estimators=200;, score=0.320 total time=  38.9s
[CV 2/3] END ....max_depth=30, n_estimators=200;, score=0.320 total time=  38.5s
[CV 3/3] END ...

Actually Win/Loss Classification performance got lower after hyperparameter tuning, but Termination Classification performance has improved.