In [1]:
import pandas as pd
import numpy as np

pd.set_option("mode.copy_on_write", True)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
data = pd.read_csv("final_opponent_and_team_data.csv")
data.head()

Unnamed: 0,Date,Team,Opponent,Location,WAB,ADJO,ADJD,EFF,EFG%,TO%,OR%,FTR,2P,3P,Opp EFF,Opp EFG%,Opp TO%,Opp OR%,Opp FTR,Opp 2P,Opp 3P,Opponent_score,Team_score,opp_adj_o,opp_adj_d
0,2024-11-04,Duke,Maine,H,0.1,125.3,95.2,130.6,64.3,17.7,35.5,31.7,24-34,11-29,84.3,39.5,19.0,19.4,31.6,18-43,3-14,62,96,108.9,110.8
1,2024-11-08,Duke,Army,H,0.1,124.7,92.3,141.0,61.3,11.3,43.6,25.4,18-33,17-38,81.8,39.1,18.3,23.3,14.1,13-35,8-29,58,100,107.2,119.0
2,2024-11-12,Duke,Kentucky,N,-0.2,106.8,86.4,95.7,42.3,9.3,25.0,23.9,24-47,4-24,102.3,47.6,14.6,23.3,38.1,15-38,10-25,77,72,119.6,88.1
3,2024-11-16,Duke,Wofford,H,-0.1,124.7,56.9,133.7,61.3,15.5,45.2,17.7,14-24,16-38,54.4,28.9,29.5,29.3,5.3,9-24,5-33,35,86,69.8,115.5
4,2024-11-22,Duke,Arizona,A,0.6,111.7,75.9,101.9,50.0,20.7,35.1,21.3,17-36,9-25,81.2,45.3,22.2,16.7,20.8,15-30,6-23,55,69,98.9,90.4


In [3]:
data.dtypes

Date               object
Team               object
Opponent           object
Location           object
WAB               float64
ADJO              float64
ADJD              float64
EFF               float64
EFG%              float64
TO%               float64
OR%               float64
FTR               float64
2P                 object
3P                 object
Opp EFF           float64
Opp EFG%          float64
Opp TO%           float64
Opp OR%           float64
Opp FTR           float64
Opp 2P             object
Opp 3P             object
Opponent_score      int64
Team_score          int64
opp_adj_o         float64
opp_adj_d         float64
dtype: object

In [4]:
data.isnull().sum()

Date              0
Team              0
Opponent          0
Location          0
WAB               0
ADJO              0
ADJD              0
EFF               0
EFG%              0
TO%               0
OR%               0
FTR               0
2P                0
3P                0
Opp EFF           0
Opp EFG%          0
Opp TO%           0
Opp OR%           0
Opp FTR           0
Opp 2P            0
Opp 3P            0
Opponent_score    0
Team_score        0
opp_adj_o         0
opp_adj_d         0
dtype: int64

In [5]:
data[data.isnull().any(axis=1)]

Unnamed: 0,Date,Team,Opponent,Location,WAB,ADJO,ADJD,EFF,EFG%,TO%,OR%,FTR,2P,3P,Opp EFF,Opp EFG%,Opp TO%,Opp OR%,Opp FTR,Opp 2P,Opp 3P,Opponent_score,Team_score,opp_adj_o,opp_adj_d


In [6]:
data = data.dropna()

In [7]:
def preprocess(data):
    data_clean = data.dropna()
    columns_to_convert = [
        "ADJO",
        "ADJD",
        "EFG%",
        "TO%",
        "OR%",
        "FTR",
        "Opp EFG%",
        "Opp TO%",
        "Opp OR%",
        "Opp FTR",
    ]
    for col in columns_to_convert:
        data[col] = pd.to_numeric(data[col], errors="coerce")

    ratio_columns = ["2P", "3P", "Opp 2P", "Opp 3P"]

    # Convert existing ratio columns to numerical percentages
    for column in ratio_columns:
        if column in data.columns:  # Check if the column exists in the dataset
            data[column] = (
                data[column]
                .str.split("-")
                .apply(
                    lambda x: (
                        int(x[0]) / int(x[1]) if len(x) == 2 and int(x[1]) != 0 else 0
                    )
                )
            )
    categorical_columns = ["Location"]
    data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

    data_encoded = data_encoded.apply(
        lambda x: x.astype(int) if x.dtype == "bool" else x
    )

    return data_encoded

In [8]:
preprocessed_data = preprocess(data)

In [9]:
# numeric_columns = ["Team_score", "Opponent_score"]

# # Initialize scaler
# scaler = StandardScaler()

# # Scale numeric columns
# preprocessed_data[numeric_columns] = scaler.fit_transform(preprocessed_data[numeric_columns])

In [10]:
X = preprocessed_data.drop(
    columns=[
        "Team_score",
        "Opponent_score",
        "Team",
        "Opponent",
        "2P",
        "3P",
        "Opp 2P",
        "Opp 3P",
        "WAB",
        "EFF",
        "Opp EFF",
        "Date",
    ]
)
y = preprocessed_data[["Team_score", "Opponent_score"]]

X = X.fillna(0)

In [11]:
X.dtypes

ADJO          float64
ADJD          float64
EFG%          float64
TO%           float64
OR%           float64
FTR           float64
Opp EFG%      float64
Opp TO%       float64
Opp OR%       float64
Opp FTR       float64
opp_adj_o     float64
opp_adj_d     float64
Location_H      int64
Location_N      int64
dtype: object

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)
y_train = scaler_y.fit_transform(y_train)
y_test = scaler_y.transform(y_test)

In [14]:
model = MLPRegressor(
    hidden_layer_sizes=(32, 16), activation="relu", max_iter=500, random_state=42
)

In [15]:
model.fit(X_train, y_train)



In [16]:
y_pred_scaled = model.predict(X_test)

In [17]:
y_pred = scaler_y.inverse_transform(y_pred_scaled)

In [18]:
y_pred.shape

(61, 2)

In [19]:
y_test.shape  # Convert y_pred to a numpy array if it's not already

(61, 2)

In [20]:
# Ensure that y_test is 2D
y_test = np.array(y_test)

# Check if y_test and y_pred have matching shapes
print(f"Shape of y_test: {y_test.shape}")
print(f"Shape of y_pred: {y_pred.shape}")

# If y_test is 2D (it should be with shape (61, 2)), proceed with the metrics
if y_test.ndim == 2 and y_pred.ndim == 2:
    mae_team = mean_absolute_error(y_test[:, 0], y_pred[:, 0])
    mae_opponent = mean_absolute_error(y_test[:, 1], y_pred[:, 1])

    mse_team = mean_squared_error(y_test[:, 0], y_pred[:, 0])
    mse_opponent = mean_squared_error(y_test[:, 1], y_pred[:, 1])

    r2_team = r2_score(y_test[:, 0], y_pred[:, 0])
    r2_opponent = r2_score(y_test[:, 1], y_pred[:, 1])

    print("Team Score Prediction Metrics:")
    print(f"MAE: {mae_team}")
    print(f"MSE: {mse_team}")
    print(f"R² Score: {r2_team}")

    print("\nOpponent Score Prediction Metrics:")
    print(f"MAE: {mae_opponent}")
    print(f"MSE: {mse_opponent}")
    print(f"R² Score: {r2_opponent}")
else:
    print("The shapes of y_test and y_pred do not match.")

Shape of y_test: (61, 2)
Shape of y_pred: (61, 2)
Team Score Prediction Metrics:
MAE: 78.97570822060536
MSE: 6377.666389291225
R² Score: -6833.628915042071

Opponent Score Prediction Metrics:
MAE: 67.47908325493877
MSE: 4677.363016395422
R² Score: -6199.785725916307


### Predicting

In [21]:
predict_data = pd.read_csv("basketball_games_data.csv")
predict_data

Unnamed: 0,Location,Team,Opponent,ADJO,ADJD,EFG%,TO%,OR%,FTR,Opp EFG%,Opp TO%,Opp OR%,Opp FTR,opp_adj_o,opp_adj_d
0,Neutral,Auburn,Purdue,131.0,95.6,59.4,12.9,35.4,29.6,57.3,18.4,29.5,41.4,120.5,100.0
1,Home,Houston,Texas A&M Corpus Christi,119.7,88.6,52.1,14.6,38.0,27.4,54.9,17.8,30.9,40.5,107.0,106.4
2,Away,Duke,Georgia Tech,118.9,88.5,54.0,15.5,34.4,29.9,49.3,16.0,30.3,33.0,107.6,102.3
3,Home,Gonzaga,Bucknell,124.4,95.8,55.7,13.2,33.6,34.9,49.4,20.1,25.9,41.1,97.0,106.9
4,Home,Florida,North Florida,122.6,97.8,54.7,15.2,40.3,32.3,53.6,13.6,29.1,23.3,112.3,114.7
5,Neutral,Kentucky,Ohio State,122.7,98.2,57.2,13.1,31.5,31.6,57.1,15.7,28.2,34.0,114.6,99.7
6,Away,Marquette,Xavier,118.8,96.9,54.1,13.1,31.9,27.2,54.9,16.4,29.2,35.9,113.5,101.5
7,Neutral,UCLA,North Carolina,113.1,92.3,54.0,16.0,35.2,34.4,52.6,14.2,28.6,37.5,116.8,99.6
8,Neutral,Maryland,Syracuse,115.9,95.1,57.4,13.1,33.7,31.4,50.8,14.2,29.4,38.0,112.6,107.3
9,Away,Connecticut,Butler,123.4,101.5,57.9,14.7,35.7,33.5,53.6,18.9,29.9,48.4,113.4,102.9


In [22]:
predict_data["Location"] = predict_data["Location"].replace(
    {"Home": "H", "Away": "A", "Neutral": "N"}
)

In [23]:
predict_data_process = preprocess(predict_data)
predict_data_process = predict_data_process.drop(columns=["Team", "Opponent"])
predict_data_process = predict_data_process.fillna(0)
predict_data_scaled = scaler_X.transform(predict_data_process)

In [24]:
predictions_scaled = model.predict(predict_data_scaled)
predictions = scaler_y.inverse_transform(predictions_scaled)

# Add predictions to the original dataset
predict_data["Predicted_Team_Score"] = predictions[:, 0]
predict_data["Predicted_Opponent_Score"] = predictions[:, 1]

# # Save the results
# new_data.to_csv("new_data_with_predictions.csv", index=False)
# print(new_data.head())

In [26]:
predict_data[["Team", "Opponent", "Predicted_Team_Score", "Predicted_Opponent_Score"]]

Unnamed: 0,Team,Opponent,Predicted_Team_Score,Predicted_Opponent_Score
0,Auburn,Purdue,87.507139,84.427364
1,Houston,Texas A&M Corpus Christi,80.711696,70.778477
2,Duke,Georgia Tech,81.396355,75.362344
3,Gonzaga,Bucknell,86.107843,65.654251
4,Florida,North Florida,80.963625,71.201027
5,Kentucky,Ohio State,88.175545,89.449107
6,Marquette,Xavier,81.195026,80.476873
7,UCLA,North Carolina,84.4509,85.381658
8,Maryland,Syracuse,85.962079,78.587272
9,Connecticut,Butler,83.794466,78.759116
