In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

#Importing NFL stats dataset
file_path = '/content/drive/My Drive/Sports Modeling/nfl/imputed_data.csv'
data = pd.read_csv(file_path)

In [4]:
# Get the count of null values in each column
null_counts = data.isnull().sum()

# Filter the columns with null values
null_counts = null_counts[null_counts > 0]

# Display the columns with their respective null value counts
print(null_counts)

Series([], dtype: int64)


In [5]:
# Create a new column 'result' with default value 0 (for draws)
data['result'] = 0

# Set the result to 1 where home team wins
data.loc[data['home_points'] > data['away_points'], 'result'] = 1

# Set the result to -1 where away team wins
data.loc[data['home_points'] < data['away_points'], 'result'] = -1

In [6]:
# Display columns that are not numerical
non_numerical_data = data.select_dtypes(exclude=['int64', 'float64'])

# Display the first few rows of these non-numerical columns
print(non_numerical_data.head())

             venue_name       venue_city venue_surface venue_roof_type  \
0  Three Rivers Stadium       Pittsburgh    artificial         outdoor   
1  RingCentral Coliseum          Oakland          turf         outdoor   
2          Georgia Dome          Atlanta    artificial            dome   
3        Giants Stadium  East Rutherford    artificial         outdoor   
4     Caesars Superdome      New Orleans    artificial            dome   

  home_name  away_name home_possession_time away_possession_time  
0  Steelers     Ravens                24:53                35:07  
1   Raiders   Chargers                31:30                28:30  
2   Falcons      49ers                31:39                28:21  
3    Giants  Cardinals                31:30                28:30  
4    Saints      Lions                29:11                30:49  


In [7]:
data = data.drop(['venue_name', 'venue_city'], axis=1)  # Dropping Unnecessary Columns

In [8]:
# Convert possession time from format 'MM:SS' to total seconds
data['home_possession_time_seconds'] = data['home_possession_time'].apply(lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]))
data['away_possession_time_seconds'] = data['away_possession_time'].apply(lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]))

In [9]:
# Drop the original time columns
data = data.drop(['home_possession_time', 'away_possession_time'], axis=1)

# Now check for non-numerical columns again
non_numerical_data = data.select_dtypes(exclude=['int64', 'float64'])
print(non_numerical_data.head())


  venue_surface venue_roof_type home_name  away_name
0    artificial         outdoor  Steelers     Ravens
1          turf         outdoor   Raiders   Chargers
2    artificial            dome   Falcons      49ers
3    artificial         outdoor    Giants  Cardinals
4    artificial            dome    Saints      Lions


In [10]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding to each categorical column
data['venue_surface_encoded'] = label_encoder.fit_transform(data['venue_surface'])
data['venue_roof_type_encoded'] = label_encoder.fit_transform(data['venue_roof_type'])
data['home_name_encoded'] = label_encoder.fit_transform(data['home_name'])
data['away_name_encoded'] = label_encoder.fit_transform(data['away_name'])

# Drop the original columns if you no longer need them
data = data.drop(['venue_surface', 'venue_roof_type', 'home_name', 'away_name'], axis=1)

In [11]:
print(data.columns.tolist())

['attendance', 'quarter', 'season_year', 'week_sequence', 'week_title', 'venue_capacity', 'home_used_timeouts', 'home_remaining_timeouts', 'home_points', 'away_used_timeouts', 'away_remaining_timeouts', 'away_points', 'home_avg_gain', 'home_safeties', 'home_turnovers', 'home_play_count', 'home_rush_plays', 'home_total_yards', 'home_fumbles', 'home_lost_fumbles', 'home_penalties', 'home_penalty_yards', 'home_return_yards', 'home_rushing_totals_avg_yards', 'home_rushing_totals_attempts', 'home_rushing_totals_touchdowns', 'home_rushing_totals_tlost', 'home_rushing_totals_tlost_yards', 'home_rushing_totals_yards', 'home_rushing_totals_longest', 'home_rushing_totals_longest_touchdown', 'home_rushing_totals_redzone_attempts', 'home_receiving_totals_targets', 'home_receiving_totals_receptions', 'home_receiving_totals_avg_yards', 'home_receiving_totals_yards', 'home_receiving_totals_touchdowns', 'home_receiving_totals_yards_after_catch', 'home_receiving_totals_longest', 'home_receiving_totals_

### Finding most important features


> Recursive Feature Elimination (RFE) with a RandomForestClassifier



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Define the model
model = RandomForestClassifier(random_state=42)

# Define X by dropping the 'result' column
X = data.drop(columns=['result'])

# Define y as the 'result' column
y = data['result']

# Define RFE and select the top 10 features
rfe = RFE(estimator=model, n_features_to_select=50, step=1)
rfe = rfe.fit(X, y)

# Get the selected features
selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features)

# Optionally, check the ranking of all features
print("Feature Ranking:", rfe.ranking_)

Selected Features: Index(['home_used_timeouts', 'home_remaining_timeouts', 'home_points',
       'away_used_timeouts', 'away_remaining_timeouts', 'away_points',
       'home_avg_gain', 'home_rush_plays', 'home_total_yards',
       'home_rushing_totals_attempts', 'home_rushing_totals_yards',
       'home_receiving_totals_avg_yards', 'home_passing_totals_attempts',
       'home_passing_totals_cmp_pct', 'home_passing_totals_rating',
       'home_passing_totals_avg_yards', 'home_kickoffs_totals_yards',
       'home_kickoffs_totals_number', 'home_kick_returns_totals_yards',
       'home_field_goals_totals_yards', 'home_field_goals_totals_avg_yards',
       'home_extra_points_kicks_totals_attempts',
       'home_extra_points_kicks_totals_made', 'home_touchdowns_total',
       'away_avg_gain', 'away_rush_plays', 'away_total_yards',
       'away_return_yards', 'away_rushing_totals_attempts',
       'away_rushing_totals_yards', 'away_receiving_totals_avg_yards',
       'away_passing_totals_atte

In [13]:
feature_ranking = [18, 24, 100, 165, 177, 124, 1, 1, 1, 1, 1, 1, 1, 326, 35, 46, 1, 1, 187, 234, 173, 12, 19, 112, 1, 189,
                   208, 158, 1, 42, 79, 157, 27, 142, 1, 5, 83, 162, 76, 85, 207, 11, 204, 14, 68, 342, 284, 252, 98, 34,
                   55, 136, 117, 115, 105, 343, 292, 251, 223, 174, 9, 1, 119, 1, 160, 41, 1, 58, 1, 166, 99, 32, 72,
                   181, 77, 81, 301, 346, 363, 357, 362, 368, 322, 260, 274, 22, 168, 1, 310, 1, 2, 47, 1, 106, 340, 306,
                   359, 13, 182, 130, 293, 190, 198, 242, 268, 224, 226, 240, 307, 257, 360, 314, 361, 113, 31, 286, 1,
                   1, 40, 171, 183, 56, 96, 21, 145, 7, 120, 38, 236, 255, 80, 191, 156, 327, 169, 221, 299, 285, 305,
                   209, 344, 333, 279, 1, 350, 1, 200, 289, 311, 320, 348, 356, 365, 367, 161, 222, 123, 53, 111, 163,
                   126, 109, 176, 272, 1, 338, 302, 336, 329, 328, 214, 203, 225, 30, 15, 118, 159, 184, 45, 67, 270,
                   283, 10, 313, 1, 84, 1, 1, 212, 216, 179, 94, 1, 50, 1, 147, 205, 134, 1, 26, 74, 36, 33, 102, 1, 28,
                   172, 164, 89, 51, 202, 63, 210, 44, 39, 351, 280, 245, 167, 43, 57, 114, 65, 82, 133, 332, 317, 249,
                   213, 188, 17, 1, 110, 1, 107, 1, 1, 135, 1, 125, 92, 60, 75, 178, 1, 54, 303, 353, 364, 372, 369, 366,
                   312, 241, 254, 23, 193, 1, 300, 1, 66, 91, 1, 97, 354, 345, 323, 52, 170, 73, 315, 185, 186, 264, 256,
                   232, 247, 229, 308, 253, 319, 309, 347, 103, 4, 334, 1, 29, 49, 61, 215, 59, 86, 20, 155, 8, 108, 1,
                   263, 250, 140, 197, 122, 349, 150, 238, 324, 294, 296, 175, 325, 341, 275, 1, 330, 1, 269, 282, 297,
                   304, 318, 358, 370, 371, 129, 230, 104, 62, 78, 154, 152, 148, 153, 262, 1, 295, 291, 355, 335, 337,
                   199, 196, 237, 70, 25, 132, 180, 137, 1, 69, 273, 287, 121, 139, 90, 276, 206, 321, 339, 95, 271, 88,
                   144, 217, 141, 201, 131, 211, 352, 331, 3, 93, 235, 151, 195, 128, 259, 1, 261, 37, 281, 231, 228,
                   194, 266, 218, 220, 288, 138, 239, 233, 278, 1, 244, 64, 277, 243, 265, 192, 267, 227, 219, 298, 127,
                   248, 246, 101, 16, 48, 6, 87, 71, 116, 258, 143, 1, 1, 316, 290, 149, 146]

### Seleccting best features for modeling

In [14]:
# Define X by dropping the 'result' column
X = data.drop(columns=['result'])

# Define y as the 'result' column
y = data['result']

# Step 1: Create a DataFrame with features and their rankings
feature_rankings = pd.DataFrame({
    'Feature': X.columns,  # The feature names
    'Ranking': feature_ranking  # The rankings from RFE
})

# Step 2: Sort the features by their ranking
feature_rankings = feature_rankings.sort_values(by='Ranking')

# Step 3: Select features with a ranking less than or equal to 5
important_features = feature_rankings[feature_rankings['Ranking'] <= 5]['Feature']
important_features_list = important_features.tolist()

print("Selected Important Features with Ranking <= 5:\n", important_features_list)

Selected Important Features with Ranking <= 5:
 ['home_receiving_totals_avg_yards', 'away_rushing_totals_kneel_downs', 'home_passing_totals_cmp_pct', 'home_passing_totals_rating', 'away_possession_time_seconds', 'home_extra_points_kicks_totals_made', 'away_rushing_totals_yards', 'away_receiving_totals_avg_yards', 'away_passing_totals_avg_yards', 'home_avg_gain', 'home_rush_plays', 'home_total_yards', 'away_passing_totals_rating', 'home_extra_points_kicks_totals_attempts', 'away_kickoffs_totals_yards', 'away_passing_totals_net_yards', 'away_passing_totals_cmp_pct', 'away_passing_totals_sack_yards', 'home_passing_totals_avg_yards', 'home_rushing_totals_kneel_downs', 'away_touchdowns_total', 'away_efficiency_thirddown_pct', 'away_passing_totals_attempts', 'home_touchdowns_total', 'home_kickoffs_totals_number', 'home_field_goals_totals_yards', 'home_field_goals_totals_avg_yards', 'home_kickoffs_totals_yards', 'home_points', 'home_remaining_timeouts', 'home_used_timeouts', 'away_rushing_tot

In [15]:
important_features_new = ['home_touchdowns_total', 'away_passing_totals_net_yards', 'away_efficiency_thirddown_pct', 'away_field_goals_totals_yards', 'home_receiving_totals_avg_yards',
                       'home_field_goals_totals_yards', 'home_field_goals_totals_avg_yards', 'away_receiving_totals_avg_yards', 'away_kick_returns_totals_yards',
                       'away_rushing_totals_kneel_downs', 'home_passing_totals_avg_yards', 'away_kickoffs_totals_number', 'home_kickoffs_totals_yards', 'home_passing_totals_rating',
                       'home_rushing_totals_kneel_downs', 'away_return_yards', 'home_passing_totals_cmp_pct', 'home_passing_totals_attempts', 'away_passing_totals_attempts',
                       'away_passing_totals_cmp_pct', 'away_passing_totals_sack_yards', 'away_passing_totals_rating', 'home_extra_points_kicks_totals_made',
                       'away_passing_totals_avg_yards', 'home_extra_points_kicks_totals_attempts', 'away_kickoffs_totals_yards', 'home_rushing_totals_yards', 'away_touchdowns_total',
                       'home_kickoffs_totals_number', 'away_extra_points_kicks_totals_made', 'away_rushing_totals_attempts', 'away_possession_time_seconds',
                       'home_possession_time_seconds', 'away_rushing_totals_yards', 'home_used_timeouts', 'home_remaining_timeouts', 'home_points', 'away_used_timeouts',
                       'away_remaining_timeouts', 'away_points', 'home_avg_gain', 'away_total_yards', 'away_defense_totals_passes_defended', 'away_extra_points_kicks_totals_attempts',
                       'home_total_yards', 'away_rush_plays', 'away_turnovers', 'home_kick_returns_totals_yards', 'home_rush_plays', 'home_rushing_totals_attempts',
                       'home_kickoffs_totals_total_endzone', 'home_defense_totals_def_targets', 'away_field_goals_totals_made', 'home_receiving_totals_yards']

all_features_new = ['home_touchdowns_total', 'away_passing_totals_net_yards', 'away_efficiency_thirddown_pct', 'away_field_goals_totals_yards', 'home_receiving_totals_avg_yards',
                       'home_field_goals_totals_yards', 'home_field_goals_totals_avg_yards', 'away_receiving_totals_avg_yards', 'away_kick_returns_totals_yards',
                       'away_rushing_totals_kneel_downs', 'home_passing_totals_avg_yards', 'away_kickoffs_totals_number', 'home_kickoffs_totals_yards', 'home_passing_totals_rating',
                       'home_rushing_totals_kneel_downs', 'away_return_yards', 'home_passing_totals_cmp_pct', 'home_passing_totals_attempts', 'away_passing_totals_attempts',
                       'away_passing_totals_cmp_pct', 'away_passing_totals_sack_yards', 'away_passing_totals_rating', 'home_extra_points_kicks_totals_made',
                       'away_passing_totals_avg_yards', 'home_extra_points_kicks_totals_attempts', 'away_kickoffs_totals_yards', 'home_rushing_totals_yards', 'away_touchdowns_total',
                       'home_kickoffs_totals_number', 'away_extra_points_kicks_totals_made', 'away_rushing_totals_attempts', 'away_possession_time_seconds',
                       'home_possession_time_seconds', 'away_rushing_totals_yards', 'home_used_timeouts', 'home_remaining_timeouts', 'home_points', 'away_used_timeouts',
                       'away_remaining_timeouts', 'away_points', 'home_avg_gain', 'away_total_yards', 'away_defense_totals_passes_defended', 'away_extra_points_kicks_totals_attempts',
                       'home_total_yards', 'away_rush_plays', 'away_turnovers', 'home_kick_returns_totals_yards', 'home_rush_plays', 'home_rushing_totals_attempts',
                       'home_kickoffs_totals_total_endzone', 'home_defense_totals_def_targets', 'away_field_goals_totals_made', 'home_receiving_totals_yards',
                       'away_field_goals_totals_avg_yards', 'away_kickoffs_totals_total_endzone', 'away_defense_totals_def_targets', 'away_avg_gain', 'away_receiving_totals_yards',
                       'season_year', 'week_sequence', 'week_title', 'venue_capacity', 'venue_location_lat', 'venue_location_lng',
                       'home_name_encoded', 'away_name_encoded', 'venue_surface_encoded','venue_roof_type_encoded', 'month', 'day', 'result']




In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split

# creatiing new dataframe with all new features
df_new = data[all_features_new]

# Sort the dataset by 'season_year', 'month', and 'day'
df_new_sorted = df_new.sort_values(by=['season_year', 'month', 'day'], ascending=[True, True, True])

# Split the data into training and validation sets (95% for training, 5% for testing)
train_val_data, test_data = train_test_split(df_new_sorted, test_size=0.01, shuffle=False)

# Further split the training and validation set into 70% training and 30% validation
train_data, val_data = train_test_split(train_val_data, test_size=0.2, shuffle=False)

# Step 4: Create the test dataset with only the before_game_features
Before_game_features = [
    'season_year', 'week_sequence', 'week_title', 'venue_capacity',
    'venue_location_lat', 'venue_location_lng', 'home_name_encoded', 'away_name_encoded',
    'venue_surface_encoded', 'venue_roof_type_encoded', 'month', 'day', 'result'
]

test_data = test_data[Before_game_features]

# Displaying the dataset shapes to the user for verification
print("Training Data Shape:", train_data.shape)
print("Validation Data Shape:", val_data.shape)
print("Testing Data Shape:", test_data.shape)

Training Data Shape: (4793, 72)
Validation Data Shape: (1199, 72)
Testing Data Shape: (61, 13)


In [17]:
test_data

Unnamed: 0,season_year,week_sequence,week_title,venue_capacity,venue_location_lat,venue_location_lng,home_name_encoded,away_name_encoded,venue_surface_encoded,venue_roof_type_encoded,month,day,result
5975,2023.0,14.0,14.0,73208.0,29.950928,-90.080876,28,22,0,0,12.0,10.0,1
5976,2023.0,14.0,14.0,71008.0,39.277995,-76.622592,26,25,1,1,12.0,10.0,1
5977,2023.0,14.0,14.0,65515.0,39.095413,-84.516204,2,10,0,1,12.0,10.0,1
5978,2023.0,14.0,14.0,67895.0,41.506054,-81.700004,5,18,1,1,12.0,10.0,1
5979,2023.0,14.0,14.0,61500.0,41.862498,-87.616979,1,20,1,1,12.0,10.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6031,2023.0,17.0,17.0,72220.0,29.684735,-95.410725,31,32,0,2,12.0,31.0,1
6032,2023.0,17.0,17.0,71008.0,39.277995,-76.622592,26,13,1,1,12.0,31.0,1
6033,2023.0,17.0,17.0,68740.0,47.595165,-122.331650,29,30,0,1,12.0,31.0,-1
6034,2023.0,17.0,17.0,76416.0,39.049019,-94.484140,9,2,1,1,12.0,31.0,1


In [18]:
# Extract only the float columns
non_integer_columns = df_new.select_dtypes(exclude=['int64'])

print(non_integer_columns.columns.tolist())

['home_touchdowns_total', 'away_passing_totals_net_yards', 'away_efficiency_thirddown_pct', 'away_field_goals_totals_yards', 'home_receiving_totals_avg_yards', 'home_field_goals_totals_yards', 'home_field_goals_totals_avg_yards', 'away_receiving_totals_avg_yards', 'away_kick_returns_totals_yards', 'away_rushing_totals_kneel_downs', 'home_passing_totals_avg_yards', 'away_kickoffs_totals_number', 'home_kickoffs_totals_yards', 'home_passing_totals_rating', 'home_rushing_totals_kneel_downs', 'away_return_yards', 'home_passing_totals_cmp_pct', 'home_passing_totals_attempts', 'away_passing_totals_attempts', 'away_passing_totals_cmp_pct', 'away_passing_totals_sack_yards', 'away_passing_totals_rating', 'home_extra_points_kicks_totals_made', 'away_passing_totals_avg_yards', 'home_extra_points_kicks_totals_attempts', 'away_kickoffs_totals_yards', 'home_rushing_totals_yards', 'away_touchdowns_total', 'home_kickoffs_totals_number', 'away_extra_points_kicks_totals_made', 'away_rushing_totals_attemp

In [19]:
# Save DataFrame as a CSV file
df_new.to_csv('df_new.csv', index=False)

In [20]:
import numpy as np
# X is the DataFrame containing all features
X = data[all_features_new]

# Check for infinite values in X
infinite_mask = np.isinf(X)

# Get the count of infinite values for each column
inf_counts = infinite_mask.sum(axis=0)

# Filter out columns with infinite values and get the counts
inf_columns_with_counts = inf_counts[inf_counts > 0]

# Print the results
print("Columns with infinite values and their counts in X:")
print(inf_columns_with_counts)

Columns with infinite values and their counts in X:
Series([], dtype: int64)


In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from keras.utils import to_categorical

# Features and target for training and validation
X_train = train_data[all_features_new].drop(columns=['result'])  # Drop the target column from features
y_train = to_categorical(train_data['result'] + 1, num_classes=3)  # Shift to make classes 0, 1, 2

X_val = val_data[all_features_new].drop(columns=['result'])  # Drop the target column from features
y_val = to_categorical(val_data['result'] + 1, num_classes=3)  # Shift to make classes 0, 1, 2

# Normalize the feature data
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Define focal loss function
def focal_loss(gamma=2., alpha=.25):
    def focal_loss_fixed(y_true, y_pred):
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)
        y_true = tf.cast(y_true, tf.float32)
        alpha_t = y_true * alpha + (tf.keras.backend.ones_like(y_true) - y_true) * (1 - alpha)
        p_t = y_true * y_pred + (tf.keras.backend.ones_like(y_true) - y_true) * (tf.keras.backend.ones_like(y_true) - y_pred)
        fl = - alpha_t * tf.keras.backend.pow((tf.keras.backend.ones_like(y_true) - p_t), gamma) * tf.keras.backend.log(p_t)
        return tf.keras.backend.mean(fl)
    return focal_loss_fixed

# Build the model to avoid overfitting, adding dropout, batch normalization, and regularization
model = Sequential()

# First layer with L2 regularization, BatchNormalization, and Dropout
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.02), input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())  # Added BatchNormalization
model.add(Dropout(0.4))  # Increased dropout rate to 0.4

# Second layer
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.02)))
model.add(BatchNormalization())  # Added BatchNormalization
model.add(Dropout(0.4))  # Increased dropout rate to 0.4

# Third layer
model.add(Dense(16, activation='relu', kernel_regularizer=l2(0.02)))
model.add(BatchNormalization())  # Added BatchNormalization
model.add(Dropout(0.4))  # Increased dropout rate to 0.4

# Output layer with softmax for multi-class classification
model.add(Dense(3, activation='softmax'))

# Compile the model with focal loss
model.compile(optimizer='adam', loss=focal_loss(gamma=2., alpha=.25), metrics=['accuracy'])

# Calculate class weights for imbalanced dataset
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(np.argmax(y_train, axis=1)), y=np.argmax(y_train, axis=1))
class_weights = dict(enumerate(class_weights))

# Early stopping with patience reduced to 5 to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with reduced epochs and class weights
model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping], class_weight=class_weights)

# Evaluate the model on the validation set
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {accuracy:.2f}')

# Generate confusion matrix and classification report
y_pred = model.predict(X_val)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_val, axis=1)

print("Confusion Matrix:")
print(confusion_matrix(y_true_classes, y_pred_classes))

print("\nClassification Report:")
print(classification_report(y_true_classes, y_pred_classes))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 31ms/step - accuracy: 0.4301 - loss: 2.3142 - val_accuracy: 0.8999 - val_loss: 1.1317
Epoch 2/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6922 - loss: 0.9578 - val_accuracy: 0.8999 - val_loss: 0.4750
Epoch 3/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8018 - loss: 0.3937 - val_accuracy: 0.8724 - val_loss: 0.1984
Epoch 4/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8718 - loss: 0.1682 - val_accuracy: 0.9341 - val_loss: 0.0890
Epoch 5/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8904 - loss: 0.0887 - val_accuracy: 0.9366 - val_loss: 0.0562
Epoch 6/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9085 - loss: 0.0579 - val_accuracy: 0.9324 - val_loss: 0.0339
Epoch 7/15
[1m150/150[0m

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
X_train.shape
y_train.shape
column_count = X.shape[1]
print(f'Column count: {column_count}')

Column count: 72


### Function to Retrieve Historical Data


In [23]:
import pandas as pd
import numpy as np

# List of columns that should remain float
float_columns = [
    'away_efficiency_thirddown_pct', 'home_receiving_totals_avg_yards', 'home_field_goals_totals_avg_yards',
    'away_receiving_totals_avg_yards', 'home_passing_totals_avg_yards', 'home_passing_totals_rating',
    'home_passing_totals_cmp_pct', 'away_passing_totals_cmp_pct', 'away_passing_totals_rating',
    'away_passing_totals_avg_yards', 'away_kickoffs_totals_yards', 'home_avg_gain',
    'home_defense_totals_def_targets', 'away_field_goals_totals_avg_yards', 'away_defense_totals_def_targets',
    'away_avg_gain', 'venue_location_lat', 'venue_location_lng', 'away_rushing_totals_kneel_downs'
]

def retrieve_historical_data(home_team, away_team, game_date, historical_df, all_features_new, lookback_games=6):
    # Unpack the game_date tuple
    game_year, game_month, game_day = game_date

    # Filter the dataframe to include only important features plus date and team identifiers
    important_features = [f for f in all_features_new if f not in Before_game_features]
    relevant_columns = important_features + ['home_name_encoded', 'away_name_encoded', 'season_year', 'month', 'day']

    # Select only relevant columns
    historical_df = historical_df[relevant_columns]

    # Drop rows with NaN values
    historical_df.dropna(inplace=True)

    # First, sort the data by season year, month, and day in descending order
    historical_df = historical_df.sort_values(by=['season_year', 'month', 'day'], ascending=False)

    # Now filter for home-away combinations (either home vs away or away vs home)
    past_games_combined = historical_df[
        (
            ((historical_df['home_name_encoded'] == home_team) & (historical_df['away_name_encoded'] == away_team)) |
            ((historical_df['home_name_encoded'] == away_team) & (historical_df['away_name_encoded'] == home_team))
        )
    ]

    # Further filter to ensure that the games occurred before the specific game date
    past_games_combined = past_games_combined[
        (
            (past_games_combined['season_year'] < game_year) |  # All games from previous years
            (
                (past_games_combined['season_year'] == game_year) &  # Same year, but before the specific game date
                (
                    (past_games_combined['month'] < game_month) |
                    ((past_games_combined['month'] == game_month) & (past_games_combined['day'] < game_day))
                )
            )
        )
    ]

    # Select only the most recent `lookback_games` number of games
    past_games_combined = past_games_combined.head(lookback_games)

    # Calculate historical features using the mean for each of the important features
    historical_features = {}

    for feature in important_features:
        if feature in past_games_combined.columns:
            feature_mean = past_games_combined[feature].mean()
            if feature not in float_columns:
                feature_mean = np.round(feature_mean, 0)  # Round to nearest integer if not a float column
            historical_features[feature] = feature_mean

    return historical_features

In [24]:
# Creating an empty list to store the results
retrieved_data = []

# Iterate over each row in test_data
for index, row in test_data.iterrows():
    # Extract the relevant features for the prediction
    before_game_features = {
        'season_year': row['season_year'],
        'week_sequence': row['week_sequence'],
        'week_title': row['week_title'],
        'venue_capacity': row['venue_capacity'],
        'venue_location_lat': row['venue_location_lat'],
        'venue_location_lng': row['venue_location_lng'],
        'home_name_encoded': row['home_name_encoded'],
        'away_name_encoded': row['away_name_encoded'],
        'venue_surface_encoded': row['venue_surface_encoded'],
        'venue_roof_type_encoded': row['venue_roof_type_encoded'],
        'month': row['month'],
        'day': row['day']
    }

    # Construct the game_date from year, month, and day
    game_date = (row['season_year'], row['month'], row['day'])

    # Print the extracted before_game_features to verify the correct data
    #print(f"Before game features for index {index}: {before_game_features}")

    # Retrieve historical data using the retrieve_historical_data function
    historical_features = retrieve_historical_data(
        home_team=row['home_name_encoded'],
        away_team=row['away_name_encoded'],
        game_date=game_date,
        historical_df=data,  # Assuming you have your historical data loaded into historical_df
        all_features_new=all_features_new,  # Assuming this is the list of features you're interested in
        lookback_games=5  # Modify the number of games you want to look back
    )
    # Combine the before_game_features with the historical features
    combined_data = {**before_game_features, **historical_features}
    retrieved_data.append(combined_data)

# Convert the retrieved data into a DataFrame for display
retrieved_df = pd.DataFrame(retrieved_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set 

In [25]:
retrieved_df

Unnamed: 0,season_year,week_sequence,week_title,venue_capacity,venue_location_lat,venue_location_lng,home_name_encoded,away_name_encoded,venue_surface_encoded,venue_roof_type_encoded,...,home_rushing_totals_attempts,home_kickoffs_totals_total_endzone,home_defense_totals_def_targets,away_field_goals_totals_made,home_receiving_totals_yards,away_field_goals_totals_avg_yards,away_kickoffs_totals_total_endzone,away_defense_totals_def_targets,away_avg_gain,away_receiving_totals_yards
0,2023.0,14.0,14.0,73208.0,29.950928,-90.080876,28.0,22.0,0.0,0.0,...,29.0,3.0,30.4,1.0,204.0,16.7000,3.0,30.2,4.14,175.0
1,2023.0,14.0,14.0,71008.0,39.277995,-76.622592,26.0,25.0,1.0,1.0,...,25.0,3.0,34.6,1.0,212.0,13.6000,4.0,33.6,5.64,250.0
2,2023.0,14.0,14.0,65515.0,39.095413,-84.516204,2.0,10.0,0.0,1.0,...,25.0,4.0,36.0,1.0,310.0,20.9666,4.0,39.0,5.20,248.0
3,2023.0,14.0,14.0,67895.0,41.506054,-81.700004,5.0,18.0,1.0,1.0,...,28.0,3.0,34.4,2.0,232.0,30.6834,3.0,32.8,4.14,205.0
4,2023.0,14.0,14.0,61500.0,41.862498,-87.616979,1.0,20.0,1.0,1.0,...,31.0,4.0,29.6,2.0,208.0,22.7000,4.0,26.2,5.10,219.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,2023.0,17.0,17.0,72220.0,29.684735,-95.410725,31.0,32.0,0.0,2.0,...,25.0,2.0,26.4,2.0,215.0,22.3834,2.0,32.4,4.90,179.0
57,2023.0,17.0,17.0,71008.0,39.277995,-76.622592,26.0,13.0,1.0,1.0,...,24.0,6.0,42.6,0.0,266.0,11.4000,4.0,34.6,5.76,298.0
58,2023.0,17.0,17.0,68740.0,47.595165,-122.331650,29.0,30.0,0.0,1.0,...,30.0,4.0,37.8,1.0,256.0,14.3666,3.0,32.4,4.90,256.0
59,2023.0,17.0,17.0,76416.0,39.049019,-94.484140,9.0,2.0,1.0,1.0,...,27.0,3.0,32.4,2.0,320.0,23.6286,4.0,32.2,6.04,257.0


In [26]:
# Save DataFrame as a CSV file
retrieved_df.to_csv('retrieved_game_data.csv', index=False)

In [27]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

main_df_columns = [
    'home_touchdowns_total', 'away_passing_totals_net_yards', 'away_efficiency_thirddown_pct',
    'away_field_goals_totals_yards', 'home_receiving_totals_avg_yards', 'home_field_goals_totals_yards',
    'home_field_goals_totals_avg_yards', 'away_receiving_totals_avg_yards', 'away_kick_returns_totals_yards',
    'away_rushing_totals_kneel_downs', 'home_passing_totals_avg_yards', 'away_kickoffs_totals_number',
    'home_kickoffs_totals_yards', 'home_passing_totals_rating', 'home_rushing_totals_kneel_downs',
    'away_return_yards', 'home_passing_totals_cmp_pct', 'home_passing_totals_attempts', 'away_passing_totals_attempts',
    'away_passing_totals_cmp_pct', 'away_passing_totals_sack_yards', 'away_passing_totals_rating',
    'home_extra_points_kicks_totals_made', 'away_passing_totals_avg_yards', 'home_extra_points_kicks_totals_attempts',
    'away_kickoffs_totals_yards', 'home_rushing_totals_yards', 'away_touchdowns_total', 'home_kickoffs_totals_number',
    'away_extra_points_kicks_totals_made', 'away_rushing_totals_attempts', 'away_possession_time_seconds',
    'home_possession_time_seconds', 'away_rushing_totals_yards', 'home_used_timeouts', 'home_remaining_timeouts',
    'home_points', 'away_used_timeouts', 'away_remaining_timeouts', 'away_points', 'home_avg_gain', 'away_total_yards',
    'away_defense_totals_passes_defended', 'away_extra_points_kicks_totals_attempts', 'home_total_yards',
    'away_rush_plays', 'away_turnovers', 'home_kick_returns_totals_yards', 'home_rush_plays', 'home_rushing_totals_attempts',
    'home_kickoffs_totals_total_endzone', 'home_defense_totals_def_targets', 'away_field_goals_totals_made',
    'home_receiving_totals_yards', 'away_field_goals_totals_avg_yards', 'away_kickoffs_totals_total_endzone',
    'away_defense_totals_def_targets', 'away_avg_gain', 'away_receiving_totals_yards', 'season_year', 'week_sequence',
    'week_title', 'venue_capacity', 'venue_location_lat', 'venue_location_lng', 'home_name_encoded', 'away_name_encoded',
    'venue_surface_encoded', 'venue_roof_type_encoded', 'month', 'day'
]

def create_feature_vector(home_team, away_team, before_game_features, game_date, historical_df, all_features_new, scaler):
    # Ensure historical_df is not modified in place (avoid SettingWithCopyWarning)
    historical_df = historical_df.copy()

    # Drop missing values (if required)
    historical_df.dropna(inplace=True)

    # Combine before-game features with historical features
    historical_features = retrieve_historical_data(home_team, away_team, game_date, historical_df, all_features_new)
    feature_vector = {**historical_features, **before_game_features}

    # Convert to DataFrame for model input
    feature_vector_df = pd.DataFrame([feature_vector])

    # Reorder the columns to match the required order (main_df_columns)
    feature_vector_df = feature_vector_df.reindex(columns=main_df_columns)

    # Ensure no missing columns in the feature vector
    missing_cols = set(main_df_columns) - set(feature_vector_df.columns)
    if missing_cols:
        raise ValueError(f"Missing columns in feature vector: {missing_cols}")

    # Scale the features using the provided scaler
    feature_vector_scaled = pd.DataFrame(scaler.transform(feature_vector_df), columns=feature_vector_df.columns)

    # Return the scaled feature vector
    return feature_vector_scaled


In [28]:
import pandas as pd
from datetime import datetime

# Assuming historical_df, scaler, important_features, and main_df_columns are already defined and available

# List to store the scaled feature vectors
scaled_test_vectors = []

# Iterate over each row in test_data
for index, row in test_data.iterrows():
    # Extract the relevant features for the prediction
    before_game_features = {
        'season_year': row['season_year'],
        'week_sequence': row['week_sequence'],
        'week_title': row['week_title'],
        'venue_capacity': row['venue_capacity'],
        'venue_location_lat': row['venue_location_lat'],
        'venue_location_lng': row['venue_location_lng'],
        'home_name_encoded': row['home_name_encoded'],
        'away_name_encoded': row['away_name_encoded'],
        'venue_surface_encoded': row['venue_surface_encoded'],
        'venue_roof_type_encoded': row['venue_roof_type_encoded'],
        'month': row['month'],
        'day': row['day']
    }

    # Construct the game_date from year, month, and day
    game_date = (row['season_year'], row['month'], row['day'])

    # Call the function to create and scale the feature vector
    feature_vector_scaled = create_feature_vector(
        row['home_name_encoded'],
        row['away_name_encoded'],
        before_game_features,
        game_date,
        data,  # This is your historical data DataFrame
        important_features_new,  # Pass the list of important features here
        scaler,  # Assuming you have a fitted scaler
    )

    # Append the scaled feature vector to the list
    scaled_test_vectors.append(feature_vector_scaled)

# Convert list of scaled vectors to a DataFrame
scaled_test_vectors_df = pd.concat(scaled_test_vectors, ignore_index=True)

# Convert the scaled test DataFrame to NumPy array
scaled_test_array = scaled_test_vectors_df.to_numpy()

# Print test data as a NumPy array (similar format to X_train)
print("Test Data (scaled):")
print(scaled_test_array[:5])  # Print the first 5 rows for comparison

# You can compare it to the scaled training data as follows
print("Train Data (scaled):")
print(X_train[:5])  # Assuming X_train is already scaled

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set 

Test Data (scaled):
[[ 0.00000000e+00 -4.40366972e-01  3.61460589e-01 -3.15789474e-01
  -8.65685661e-02  7.28813559e-01  2.01383333e-01 -2.54358655e-01
  -2.38805970e-01  6.66666667e-01 -2.66025641e-02 -6.66666667e-01
  -5.84795322e-02  2.60614273e-01  1.00000000e+00 -4.32432432e-01
   1.93271605e-01 -1.66666667e-01 -4.54545455e-01 -1.54251674e-01
   5.29411765e-01 -5.77777778e-01 -5.00000000e-01 -3.86025641e-01
   0.00000000e+00 -3.71086556e-01  4.47761194e-02  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00 -2.22811671e-01
   1.62601626e-01 -1.34328358e-01  0.00000000e+00  0.00000000e+00
  -2.14285714e-01  5.00000000e-01 -5.00000000e-01 -4.28571429e-01
  -1.22222222e-01 -5.58333333e-01 -6.66666667e-01  0.00000000e+00
  -1.92982456e-01  0.00000000e+00  1.00000000e+00 -6.06060606e-01
   9.09090909e-02  9.09090909e-02  5.00000000e-01 -1.96969697e-01
   0.00000000e+00 -1.45631068e-01             nan             nan
              nan             nan             nan  1.400

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set 

In [29]:
def predict_game_outcome(home_team, away_team, before_game_features, game_date, historical_df, important_features, model, scaler):
    # Create the feature vector
    feature_vector_df = create_feature_vector(home_team, away_team, before_game_features, game_date, historical_df, important_features, scaler)

    # Make the prediction
    prediction = model.predict(feature_vector_df)

    # Get the index of the highest probability class
    predicted_class = np.argmax(prediction, axis=1)[0]  # Extract the class index (0, 1, or 2)

    # Map the predicted class index to the corresponding label
    prediction_label = "Home Win" if predicted_class == 2 else "Away Win" if predicted_class == 0 else "Draw"
    return prediction_label

In [30]:

import pandas as pd
from datetime import datetime

# Assuming historical_df, model, scaler, and important_features are already defined and available

# List to store the predictions
predictions = []

# Iterate over each row in X_test
for index, row in test_data.iterrows():
    # Extract the relevant features for the prediction
    before_game_features = {
        'season_year': row['season_year'],
        'week_sequence': row['week_sequence'],
        'week_title': row['week_title'],
        'venue_capacity': row['venue_capacity'],
        'venue_location_lat': row['venue_location_lat'],
        'venue_location_lng': row['venue_location_lng'],
        'home_name_encoded': row['home_name_encoded'],
        'away_name_encoded': row['away_name_encoded'],
        'venue_surface_encoded': row['venue_surface_encoded'],
        'venue_roof_type_encoded': row['venue_roof_type_encoded'],
        'month': row['month'],
        'day': row['day']
    }

    # Construct the game_date from year, month, and day
    game_date = (row['season_year'], row['month'], row['day'])

    # Predict the game outcome using the extracted features
    prediction_label = predict_game_outcome(
        home_team = row['home_name_encoded'],
        away_team = row['away_name_encoded'],
        before_game_features = before_game_features,
        game_date = game_date,
        historical_df = data,  # This should be your historical data DataFrame
        important_features = all_features_new,  # Pass the list of important features here
        model = model,
        scaler=scaler,  # Assuming you have a scaler, pass None if no scaling is needed
    )

    # Append the prediction to the list
    predictions.append(prediction_label)

# Add the predictions to X_test DataFrame for comparison
test_data['predicted_outcome'] = predictions

# Optionally, print the DataFrame to see the predictions
print(test_data[['home_name_encoded', 'away_name_encoded', 'predicted_outcome', 'result']])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
      home_name_encoded  away_name_encoded predicted_outcome  result
5975                 28                 22          Home Win       1
5976                 26                 25          Away Win       1
5977                  2                 10          Home Win       1
5978                  5                 18          Home Win       1
5979                  1                 20          Home Win       1
...                 ...                ...               ...     ...
6031                 31                 32          Away Win       1
6032                 26                 13          Home Win       1
6033                 29                 30          Home Win      -1
6034                  9                  2          Home Win       1
6035                  4                  8          Home Win       1

[61 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_df.dropna(inplace=True)


In [31]:
from sklearn.metrics import accuracy_score

# Step 1: Adjust the 'result' column to match the labels
# Convert -1 to 0, 0 stays the same, and 1 to 2
test_data['result'] = test_data['result'].map({-1: 0, 0: 1, 1: 2})

# Step 2: Print the DataFrame to verify the conversion and predictions
print(test_data[['home_name_encoded', 'away_name_encoded', 'predicted_outcome', 'result']])

# Step 3: Calculate the accuracy score
# Convert 'predicted_outcome' from labels ("Home Win", "Away Win", "Draw") to numerical values (2, 0, 1)
test_data['predicted_outcome_num'] = test_data['predicted_outcome'].map({"Home Win": 2, "Away Win": 0, "Draw": 1})

# Calculate accuracy
accuracy = accuracy_score(test_data['result'], test_data['predicted_outcome_num'])

print(f"Accuracy: {accuracy:.2f}")


      home_name_encoded  away_name_encoded predicted_outcome  result
5975                 28                 22          Home Win       2
5976                 26                 25          Away Win       2
5977                  2                 10          Home Win       2
5978                  5                 18          Home Win       2
5979                  1                 20          Home Win       2
...                 ...                ...               ...     ...
6031                 31                 32          Away Win       2
6032                 26                 13          Home Win       2
6033                 29                 30          Home Win       0
6034                  9                  2          Home Win       2
6035                  4                  8          Home Win       2

[61 rows x 4 columns]
Accuracy: 0.59


In [32]:
print(test_data['predicted_outcome_num'].value_counts())


predicted_outcome_num
2    45
0    16
Name: count, dtype: int64
