In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import copy
import random

# Load the NFL games table from disk
data = pd.read_csv('sample_data/Final_Data_NFL.csv')

# Columns that are left out of the feature matrix.  The list also contains the
# 'Favorite_Covered' label itself, which is kept separately as Y.
columns_to_exclude = [
    'Day',
    'Date',
    'Time',
    'Favorite',
    'FavoriteTeamYear',
    'UnderdogTeamYear',
    'Score',
    'Underdog',
    'OverUnder',
    'Season_Year',
    'Favorite_Covered',
]
Y = data["Favorite_Covered"]
X = data.drop(columns=columns_to_exclude)

# Convert the time-of-possession columns from "<a>:<b>" strings to the integer
# 60*a + b (presumably minutes:seconds -> total seconds; confirm against the CSV)
possession_columns = (
    'Favoriteaverage-time-of-possession-net-of-ot',
    'Favoriteopponent-average-time-of-possession-net-of-ot',
    'Underdogaverage-time-of-possession-net-of-ot',
    'Underdogopponent-average-time-of-possession-net-of-ot',
)
for col_name in possession_columns:
  # Only the first two ':'-separated fields are used; any further fields are ignored
  X[col_name] = [
      60*int(fields[0]) + int(fields[1])
      for fields in (clock.split(':') for clock in X[col_name])
  ]

# Convert percentage strings (e.g. "61.5%") to floats (61.5).
# The previous version only sliced off the last character, which left every
# value as a string and silently dropped a digit from any value that had no
# trailing '%'.  Stripping the sign explicitly and calling float() makes these
# columns genuinely numeric.
for col_name in [
    "Favoritered-zone-scoring-pct", "Favoritethird-down-conversion-pct", "Favoritefield-goal-conversion-pct",
    "Favoriteopponent-red-zone-scoring-pct", "Favoriteopponent-third-down-conversion-pct", "Favoriteopponent-completion-pct",
    "Favoriteopponent-field-goal-conversion-pct", "Underdogred-zone-scoring-pct", "Underdogthird-down-conversion-pct",
    "Underdogfield-goal-conversion-pct", "Underdogopponent-red-zone-scoring-pct", "Underdogopponent-third-down-conversion-pct",
    "Underdogopponent-completion-pct", "Underdogopponent-field-goal-conversion-pct"
]:
  X[col_name] = [float(pct_text.rstrip('%')) for pct_text in X[col_name]]

# If there is not a reported spread, get rid of the row of data.
# A boolean mask is used instead of collecting enumerate() positions and
# passing them to drop(): drop() works on index *labels*, so the positional
# approach is only correct while the frame still has its default 0..n-1 index.
# X and Y share the index of `data`, so one mask filters both consistently.
has_spread = X['Spread'].notna()
X = X[has_spread]
Y = Y[has_spread]


# Normalise the labels to exactly 'T' / 'F': any other value (the original note
# describes these as pushes, where both sides get their money back) is recorded
# as 'T'.
# NOTE(review): the earlier comment said such rows get the label 'P', but the
# code has always assigned 'T' - confirm which one is intended.
# Building a new Series from a list also gives Y a fresh 0..n-1 index.
Y = pd.Series([label if label in ('T', 'F') else 'T' for label in Y])

# Hold out 15% of the rows as the test set (fixed seed), then carve the
# remaining 85% into train/valid five times with seeds 1-5.  A validation share
# of 0.176 of that remainder is roughly 15% of all rows, giving ~70/15/15.
# One random forest is later fitted per train/valid split and their predictions
# are pooled by majority vote.
X_temp1, X_test, y_temp1, y_test = train_test_split(X, Y, test_size=0.15, random_state=1)
(
    (X_train1, X_valid1, y_train1, y_valid1),
    (X_train2, X_valid2, y_train2, y_valid2),
    (X_train3, X_valid3, y_train3, y_valid3),
    (X_train4, X_valid4, y_train4, y_valid4),
    (X_train5, X_valid5, y_train5, y_valid5),
) = [
    train_test_split(X_temp1, y_temp1, test_size=0.176, random_state=seed)
    for seed in range(1, 6)
]

# Search state: the best mean validation accuracy seen so far, the five forests
# that achieved it and the hyper-parameters they were built with
best_valid_accuracy = 0
best_classifier1 = best_classifier2 = best_classifier3 = best_classifier4 = best_classifier5 = None
best_criterion = best_min_samples_split = best_max_features = None

# Grid search over criterion x min_samples_split x max_features.  For each
# combination five random forests of 20 trees each are fitted, one per
# train/valid split (forest k uses random_state=k), and the combination is
# scored by the mean of the five validation accuracies.
for criterion_iter, criterion in enumerate(["gini", "entropy"]):
  for min_samples_split in range(2, 500, 5):
    # Each criterion accounts for half of the reported progress
    print("Progress =", round(min_samples_split/1000 * 100 + (criterion_iter*50),2), "%")
    for max_features in [10, 20, 30, 40, 50, "sqrt"]:
      forest_params = {
          "n_estimators": 20,
          "criterion": criterion,
          "min_samples_split": min_samples_split,
          "max_features": max_features,
      }
      forests = [RandomForestClassifier(random_state=seed, **forest_params) for seed in (1, 2, 3, 4, 5)]
      rf_classifier1, rf_classifier2, rf_classifier3, rf_classifier4, rf_classifier5 = forests

      # Training each forest on its own split
      for forest, features, labels in zip(
          forests,
          (X_train1, X_train2, X_train3, X_train4, X_train5),
          (y_train1, y_train2, y_train3, y_train4, y_train5),
      ):
        forest.fit(features, labels)

      # Predicting on the matching validation split
      valid_predictions = [
          forest.predict(features)
          for forest, features in zip(forests, (X_valid1, X_valid2, X_valid3, X_valid4, X_valid5))
      ]
      predictions_valid1, predictions_valid2, predictions_valid3, predictions_valid4, predictions_valid5 = valid_predictions

      # Calculating accuracy - valid
      valid_accuracy1, valid_accuracy2, valid_accuracy3, valid_accuracy4, valid_accuracy5 = [
          accuracy_score(labels, predictions)
          for labels, predictions in zip((y_valid1, y_valid2, y_valid3, y_valid4, y_valid5), valid_predictions)
      ]
      valid_accuracy = (valid_accuracy1 + valid_accuracy2 + valid_accuracy3 + valid_accuracy4 + valid_accuracy5)/5

      # Keep deep copies of the winners: the rf_classifier* names are rebound on the next iteration
      if valid_accuracy > best_valid_accuracy:
        best_valid_accuracy = valid_accuracy
        (best_classifier1, best_classifier2, best_classifier3,
         best_classifier4, best_classifier5) = [copy.deepcopy(forest) for forest in forests]
        best_criterion, best_min_samples_split, best_max_features = criterion, min_samples_split, max_features

# Report train / valid / test accuracy for the ensemble selected by the grid
# search.  rf_classifier1..5 only hold the forests from the *final* grid
# iteration, so the deep-copied best_classifier1..5 are the models scored here;
# this keeps the numbers consistent with best_criterion / best_min_samples_split
# / best_max_features.
best_classifiers = (best_classifier1, best_classifier2, best_classifier3, best_classifier4, best_classifier5)

# Predicting on the train set (each forest on the split it was fitted on)
train_predictions = [
    forest.predict(features)
    for forest, features in zip(best_classifiers, (X_train1, X_train2, X_train3, X_train4, X_train5))
]
predictions_train1, predictions_train2, predictions_train3, predictions_train4, predictions_train5 = train_predictions

train_accuracies = [
    accuracy_score(labels, predictions)
    for labels, predictions in zip((y_train1, y_train2, y_train3, y_train4, y_train5), train_predictions)
]
train_accuracy1, train_accuracy2, train_accuracy3, train_accuracy4, train_accuracy5 = train_accuracies
train_accuracy = sum(train_accuracies)/len(train_accuracies)
print(f"Avg Train Accuracy: {train_accuracy}")

# Predicting on the valid set (each forest on its own validation split)
valid_predictions = [
    forest.predict(features)
    for forest, features in zip(best_classifiers, (X_valid1, X_valid2, X_valid3, X_valid4, X_valid5))
]
predictions_valid1, predictions_valid2, predictions_valid3, predictions_valid4, predictions_valid5 = valid_predictions

valid_accuracies = [
    accuracy_score(labels, predictions)
    for labels, predictions in zip((y_valid1, y_valid2, y_valid3, y_valid4, y_valid5), valid_predictions)
]
valid_accuracy1, valid_accuracy2, valid_accuracy3, valid_accuracy4, valid_accuracy5 = valid_accuracies
valid_accuracy = sum(valid_accuracies)/len(valid_accuracies)
print(f"Avg Valid Accuracy: {valid_accuracy}")

# Predicting on the test set (every forest sees the same held-out rows)
test_predictions = [forest.predict(X_test) for forest in best_classifiers]
predictions_test1, predictions_test2, predictions_test3, predictions_test4, predictions_test5 = test_predictions

test_accuracies = [accuracy_score(y_test, predictions) for predictions in test_predictions]
test_accuracy1, test_accuracy2, test_accuracy3, test_accuracy4, test_accuracy5 = test_accuracies
# Average the five *test* accuracies (the old expression mixed in valid_accuracy2..5)
test_accuracy = sum(test_accuracies)/len(test_accuracies)
print(f"Avg Test Accuracy: {test_accuracy}")

Progress = 0.2 %
Progress = 0.7 %
Progress = 1.2 %
Progress = 1.7 %
Progress = 2.2 %
Progress = 2.7 %
Progress = 3.2 %
Progress = 3.7 %
Progress = 4.2 %
Progress = 4.7 %
Progress = 5.2 %
Progress = 5.7 %
Progress = 6.2 %
Progress = 6.7 %
Progress = 7.2 %
Progress = 7.7 %
Progress = 8.2 %
Progress = 8.7 %
Progress = 9.2 %
Progress = 9.7 %
Progress = 10.2 %
Progress = 10.7 %
Progress = 11.2 %
Progress = 11.7 %
Progress = 12.2 %
Progress = 12.7 %
Progress = 13.2 %
Progress = 13.7 %
Progress = 14.2 %
Progress = 14.7 %
Progress = 15.2 %
Progress = 15.7 %
Progress = 16.2 %
Progress = 16.7 %
Progress = 17.2 %
Progress = 17.7 %
Progress = 18.2 %
Progress = 18.7 %
Progress = 19.2 %
Progress = 19.7 %
Progress = 20.2 %
Progress = 20.7 %
Progress = 21.2 %
Progress = 21.7 %
Progress = 22.2 %
Progress = 22.7 %
Progress = 23.2 %
Progress = 23.7 %
Progress = 24.2 %
Progress = 24.7 %
Progress = 25.2 %
Progress = 25.7 %
Progress = 26.2 %
Progress = 26.7 %
Progress = 27.2 %
Progress = 27.7 %
Progress = 2

TypeError: ignored

In [None]:
# Majority vote across the five forests for every test row: the pooled label is
# 'T' only when strictly more forests predict 'T' than anything else.
pooled_predictions = []
for votes in zip(predictions_test1, predictions_test2, predictions_test3, predictions_test4, predictions_test5):
  predicted_true = sum(1 for vote in votes if vote == 'T')
  predicted_false = len(votes) - predicted_true
  pooled_predictions.append('T' if predicted_true > predicted_false else 'F')

pooled_test_accuracy = accuracy_score(y_test, pooled_predictions)
print(f"Test Accuracy With Pooling: {pooled_test_accuracy}")

# Hyper-parameters recorded by the grid search for its best combination
print("Criterion:", best_criterion)
print("Min Samples To Split:", best_min_samples_split)
print("Max Features:", best_max_features)

Test Accuracy With Pooling: 0.5721925133689839
Criterion: gini
Min Samples To Split: 77
Max Features: 50


In [None]:
# Baseline: guess 'T' or 'F' uniformly at random for every test row and score
# the guesses against the true labels
n_test_rows = len(y_test)
random_list = [random.choice(("T", "F")) for _ in range(n_test_rows)]
test_accuracy = accuracy_score(y_test, random_list)
print(f"Accuracy: {test_accuracy}")

Accuracy: 0.48128342245989303


In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import copy

# Load the NFL games table from disk
data = pd.read_csv('sample_data/Final_Data_NFL.csv')

# Columns that are left out of the feature matrix.  The list also contains the
# 'Favorite_Covered' label itself, which is kept separately as Y.
columns_to_exclude = [
    'Day',
    'Date',
    'Time',
    'Favorite',
    'FavoriteTeamYear',
    'UnderdogTeamYear',
    'Score',
    'Underdog',
    'OverUnder',
    'Season_Year',
    'Favorite_Covered',
]
Y = data["Favorite_Covered"]
X = data.drop(columns=columns_to_exclude)

# Convert the time-of-possession columns from "<a>:<b>" strings to the integer
# 60*a + b (presumably minutes:seconds -> total seconds; confirm against the CSV)
possession_columns = (
    'Favoriteaverage-time-of-possession-net-of-ot',
    'Favoriteopponent-average-time-of-possession-net-of-ot',
    'Underdogaverage-time-of-possession-net-of-ot',
    'Underdogopponent-average-time-of-possession-net-of-ot',
)
for col_name in possession_columns:
  # Only the first two ':'-separated fields are used; any further fields are ignored
  X[col_name] = [
      60*int(fields[0]) + int(fields[1])
      for fields in (clock.split(':') for clock in X[col_name])
  ]

# Convert percentage strings (e.g. "61.5%") to floats (61.5).
# The previous version only sliced off the last character, which left every
# value as a string and silently dropped a digit from any value that had no
# trailing '%'.  Stripping the sign explicitly and calling float() makes these
# columns genuinely numeric.
for col_name in [
    "Favoritered-zone-scoring-pct", "Favoritethird-down-conversion-pct", "Favoritefield-goal-conversion-pct",
    "Favoriteopponent-red-zone-scoring-pct", "Favoriteopponent-third-down-conversion-pct", "Favoriteopponent-completion-pct",
    "Favoriteopponent-field-goal-conversion-pct", "Underdogred-zone-scoring-pct", "Underdogthird-down-conversion-pct",
    "Underdogfield-goal-conversion-pct", "Underdogopponent-red-zone-scoring-pct", "Underdogopponent-third-down-conversion-pct",
    "Underdogopponent-completion-pct", "Underdogopponent-field-goal-conversion-pct"
]:
  X[col_name] = [float(pct_text.rstrip('%')) for pct_text in X[col_name]]

# If there is not a reported spread, get rid of the row of data.
# A boolean mask is used instead of collecting enumerate() positions and
# passing them to drop(): drop() works on index *labels*, so the positional
# approach is only correct while the frame still has its default 0..n-1 index.
# X and Y share the index of `data`, so one mask filters both consistently.
has_spread = X['Spread'].notna()
X = X[has_spread]
Y = Y[has_spread]


# Normalise the labels to exactly 'T' / 'F': any other value (the original note
# describes these as pushes, where both sides get their money back) is recorded
# as 'T'.
# NOTE(review): the earlier comment said such rows get the label 'P', but the
# code has always assigned 'T' - confirm which one is intended.
# Building a new Series from a list also gives Y a fresh 0..n-1 index.
Y = pd.Series([label if label in ('T', 'F') else 'T' for label in Y])

# Hold out 15% of the rows as the test set (fixed seed), then carve the
# remaining 85% into train/valid ten times with seeds 1-10.  A validation share
# of 0.176 of that remainder is roughly 15% of all rows, giving ~70/15/15.
# One random forest is later fitted per train/valid split and their predictions
# are pooled by majority vote.
X_temp1, X_test, y_temp1, y_test = train_test_split(X, Y, test_size=0.15, random_state=1)
(
    (X_train1, X_valid1, y_train1, y_valid1),
    (X_train2, X_valid2, y_train2, y_valid2),
    (X_train3, X_valid3, y_train3, y_valid3),
    (X_train4, X_valid4, y_train4, y_valid4),
    (X_train5, X_valid5, y_train5, y_valid5),
    (X_train6, X_valid6, y_train6, y_valid6),
    (X_train7, X_valid7, y_train7, y_valid7),
    (X_train8, X_valid8, y_train8, y_valid8),
    (X_train9, X_valid9, y_train9, y_valid9),
    (X_train10, X_valid10, y_train10, y_valid10),
) = [
    train_test_split(X_temp1, y_temp1, test_size=0.176, random_state=seed)
    for seed in range(1, 11)
]

# Search state: the best mean validation accuracy seen so far, the ten forests
# that achieved it and the hyper-parameters they were built with
best_valid_accuracy = 0
best_classifier1 = best_classifier2 = best_classifier3 = best_classifier4 = best_classifier5 = None
best_classifier6 = best_classifier7 = best_classifier8 = best_classifier9 = best_classifier10 = None
best_criterion = best_min_samples_split = best_max_features = None

# The ten train/valid splits, in forest order, so that forest k is always
# fitted and validated on split k
train_features = (X_train1, X_train2, X_train3, X_train4, X_train5,
                  X_train6, X_train7, X_train8, X_train9, X_train10)
train_labels = (y_train1, y_train2, y_train3, y_train4, y_train5,
                y_train6, y_train7, y_train8, y_train9, y_train10)
valid_features = (X_valid1, X_valid2, X_valid3, X_valid4, X_valid5,
                  X_valid6, X_valid7, X_valid8, X_valid9, X_valid10)
valid_labels = (y_valid1, y_valid2, y_valid3, y_valid4, y_valid5,
                y_valid6, y_valid7, y_valid8, y_valid9, y_valid10)

# Grid search over criterion x min_samples_split x max_features.  For each
# combination ten random forests of 20 trees each are fitted, one per
# train/valid split (forest k uses random_state=k), and the combination is
# scored by the mean of the ten validation accuracies.
for criterion_iter, criterion in enumerate(["gini", "entropy"]):
  for min_samples_split in range(2, 200, 5):
    # Each criterion accounts for half of the reported progress
    print("Progress =", round(min_samples_split/400 * 100 + (criterion_iter*50),2), "%")
    for max_features in [10, 20, 30, 40, 50, "sqrt"]:
      forest_params = {
          "n_estimators": 20,
          "criterion": criterion,
          "min_samples_split": min_samples_split,
          "max_features": max_features,
      }
      forests = [RandomForestClassifier(random_state=seed, **forest_params) for seed in range(1, 11)]
      (rf_classifier1, rf_classifier2, rf_classifier3, rf_classifier4, rf_classifier5,
       rf_classifier6, rf_classifier7, rf_classifier8, rf_classifier9, rf_classifier10) = forests

      # Training each forest on its own split
      for forest, features, labels in zip(forests, train_features, train_labels):
        forest.fit(features, labels)

      # Predicting on the valid set.  Pairing forests and splits with zip fixes
      # the earlier copy/paste slip where splits 6-10 were all predicted with
      # rf_classifier5, which skewed the model-selection score.
      valid_predictions = [forest.predict(features) for forest, features in zip(forests, valid_features)]
      (predictions_valid1, predictions_valid2, predictions_valid3, predictions_valid4, predictions_valid5,
       predictions_valid6, predictions_valid7, predictions_valid8, predictions_valid9, predictions_valid10) = valid_predictions

      # Calculating accuracy - valid
      valid_accuracies = [
          accuracy_score(labels, predictions)
          for labels, predictions in zip(valid_labels, valid_predictions)
      ]
      (valid_accuracy1, valid_accuracy2, valid_accuracy3, valid_accuracy4, valid_accuracy5,
       valid_accuracy6, valid_accuracy7, valid_accuracy8, valid_accuracy9, valid_accuracy10) = valid_accuracies
      valid_accuracy = sum(valid_accuracies)/len(valid_accuracies)

      # Keep deep copies of the winners: the rf_classifier* names are rebound on the next iteration
      if valid_accuracy > best_valid_accuracy:
        best_valid_accuracy = valid_accuracy
        (best_classifier1, best_classifier2, best_classifier3, best_classifier4, best_classifier5,
         best_classifier6, best_classifier7, best_classifier8, best_classifier9, best_classifier10) = [
            copy.deepcopy(forest) for forest in forests
        ]

        best_criterion = criterion
        best_min_samples_split = min_samples_split
        best_max_features = max_features

# Report train / valid / test accuracy for the ensemble selected by the grid
# search.  rf_classifier1..10 only hold the forests from the *final* grid
# iteration, so the deep-copied best_classifier1..10 are the models scored
# here; this keeps the numbers consistent with best_criterion /
# best_min_samples_split / best_max_features.
best_classifiers = (
    best_classifier1, best_classifier2, best_classifier3, best_classifier4, best_classifier5,
    best_classifier6, best_classifier7, best_classifier8, best_classifier9, best_classifier10,
)

# Predicting on the train set (each forest on the split it was fitted on)
train_predictions = [
    forest.predict(features)
    for forest, features in zip(best_classifiers, (
        X_train1, X_train2, X_train3, X_train4, X_train5,
        X_train6, X_train7, X_train8, X_train9, X_train10,
    ))
]
(predictions_train1, predictions_train2, predictions_train3, predictions_train4, predictions_train5,
 predictions_train6, predictions_train7, predictions_train8, predictions_train9, predictions_train10) = train_predictions

train_accuracies = [
    accuracy_score(labels, predictions)
    for labels, predictions in zip((
        y_train1, y_train2, y_train3, y_train4, y_train5,
        y_train6, y_train7, y_train8, y_train9, y_train10,
    ), train_predictions)
]
(train_accuracy1, train_accuracy2, train_accuracy3, train_accuracy4, train_accuracy5,
 train_accuracy6, train_accuracy7, train_accuracy8, train_accuracy9, train_accuracy10) = train_accuracies
train_accuracy = sum(train_accuracies)/len(train_accuracies)
print(f"Avg Train Accuracy: {train_accuracy}")

# Predicting on the valid set.  Each forest is scored on its own validation
# split; previously splits 6-10 were all predicted with forest 5.
valid_predictions = [
    forest.predict(features)
    for forest, features in zip(best_classifiers, (
        X_valid1, X_valid2, X_valid3, X_valid4, X_valid5,
        X_valid6, X_valid7, X_valid8, X_valid9, X_valid10,
    ))
]
(predictions_valid1, predictions_valid2, predictions_valid3, predictions_valid4, predictions_valid5,
 predictions_valid6, predictions_valid7, predictions_valid8, predictions_valid9, predictions_valid10) = valid_predictions

valid_accuracies = [
    accuracy_score(labels, predictions)
    for labels, predictions in zip((
        y_valid1, y_valid2, y_valid3, y_valid4, y_valid5,
        y_valid6, y_valid7, y_valid8, y_valid9, y_valid10,
    ), valid_predictions)
]
(valid_accuracy1, valid_accuracy2, valid_accuracy3, valid_accuracy4, valid_accuracy5,
 valid_accuracy6, valid_accuracy7, valid_accuracy8, valid_accuracy9, valid_accuracy10) = valid_accuracies
valid_accuracy = sum(valid_accuracies)/len(valid_accuracies)
print(f"Avg Valid Accuracy: {valid_accuracy}")

# Predicting on the test set (every forest sees the same held-out rows)
test_predictions = [forest.predict(X_test) for forest in best_classifiers]
(predictions_test1, predictions_test2, predictions_test3, predictions_test4, predictions_test5,
 predictions_test6, predictions_test7, predictions_test8, predictions_test9, predictions_test10) = test_predictions

test_accuracies = [accuracy_score(y_test, predictions) for predictions in test_predictions]
(test_accuracy1, test_accuracy2, test_accuracy3, test_accuracy4, test_accuracy5,
 test_accuracy6, test_accuracy7, test_accuracy8, test_accuracy9, test_accuracy10) = test_accuracies
test_accuracy = sum(test_accuracies)/len(test_accuracies)
print(f"Avg Test Accuracy: {test_accuracy}")

# Majority vote across the ten forests for every test row: the pooled label is
# 'T' only when strictly more forests predict 'T' than anything else, so a 5-5
# tie is recorded as 'F'.
pooled_predictions = []
for votes in zip(predictions_test1, predictions_test2, predictions_test3, predictions_test4, predictions_test5,
                 predictions_test6, predictions_test7, predictions_test8, predictions_test9, predictions_test10):
  predicted_true = sum(1 for vote in votes if vote == 'T')
  predicted_false = len(votes) - predicted_true
  pooled_predictions.append('T' if predicted_true > predicted_false else 'F')

pooled_test_accuracy = accuracy_score(y_test, pooled_predictions)
print(f"Test Accuracy With Pooling: {pooled_test_accuracy}")

# Hyper-parameters recorded by the grid search for its best combination
print("Criterion:", best_criterion)
print("Min Samples To Split:", best_min_samples_split)
print("Max Features:", best_max_features)

Progress = 0.5 %
Progress = 1.75 %
Progress = 3.0 %
Progress = 4.25 %
Progress = 5.5 %
Progress = 6.75 %
Progress = 8.0 %
Progress = 9.25 %
Progress = 10.5 %
Progress = 11.75 %
Progress = 13.0 %
Progress = 14.25 %
Progress = 15.5 %
Progress = 16.75 %
Progress = 18.0 %
Progress = 19.25 %
Progress = 20.5 %
Progress = 21.75 %
Progress = 23.0 %
Progress = 24.25 %
Progress = 25.5 %
Progress = 26.75 %
Progress = 28.0 %
Progress = 29.25 %
Progress = 30.5 %
Progress = 31.75 %
Progress = 33.0 %
Progress = 34.25 %
