In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [5]:
# Read the team-level table from parquet (per its file name, missing values
# were already handled upstream) and preview the first rows.
team_parquet_path = 'parquet_data/team_df_missing_handled.parquet'
team_df_dropped_row = pd.read_parquet(team_parquet_path)
team_df_dropped_row.head()

Unnamed: 0,Date,Opponent,Result,S,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Assists,PTS,name
0,08/26/2016,"Prairie View @ Waco, Texas",W 3 - 0,3.0,46.0,13.0,107.0,0.308,40.0,6.0,11.0,51.0,3.0,2.0,53.0,A&M-Corpus Christi (Southland)
1,08/27/2016,@ Baylor,L 1 - 3,4.0,37.0,20.0,119.0,0.143,32.0,5.0,6.0,46.0,5.0,18.0,53.0,A&M-Corpus Christi (Southland)
2,09/03/2016,"San Diego @ Madison, Wis.",L 0 - 3,3.0,30.0,16.0,94.0,0.149,29.0,1.0,3.0,32.0,1.0,15.0,42.5,A&M-Corpus Christi (Southland)
3,09/06/2016,UTRGV,W 3 - 1,4.0,53.0,20.0,158.0,0.209,49.0,3.0,9.0,66.0,2.0,16.0,66.0,A&M-Corpus Christi (Southland)
4,09/09/2016,UNLV,L 1 - 3,4.0,44.0,30.0,166.0,0.084,42.0,1.0,3.0,66.0,2.0,8.0,52.0,A&M-Corpus Christi (Southland)


In [6]:
def _team_result_to_success(result):
    """Map a match result value to a binary label.

    Returns 1 when ``result`` is a string starting with 'W', 0 when it is a
    string starting with 'L', and None for anything else (non-string values
    or strings with any other prefix).
    """
    if not isinstance(result, str):
        return None
    if result.startswith('W'):
        return 1
    if result.startswith('L'):
        return 0
    return None


# Derive the binary target from the 'Result' column (e.g. "W 3 - 0" -> 1).
team_df_dropped_row['Success'] = team_df_dropped_row['Result'].apply(_team_result_to_success)

# Keep only the rows whose result could be labelled as a win or a loss.
team_df_dropped_row = team_df_dropped_row.dropna(subset=['Success'])

In [7]:
# Predictor columns used by the team-level model.
team_feature_columns = [
    'S', 'Kills', 'Errors', 'Total Attacks', 'Hit Pct', 'Assists',
    'Aces', 'SErr', 'Digs', 'RErr', 'Block Assists', 'PTS',
]

# Feature matrix (X): the selected columns, minus any row with a missing value.
X_team = team_df_dropped_row[team_feature_columns].dropna()

# Target (y): 'Success', restricted to the index labels that survived the
# dropna above so features and labels stay row-aligned.
y_team = team_df_dropped_row['Success'][X_team.index]


In [8]:
# Split the *unscaled* team features first, so that no statistic computed
# from the held-out rows can influence preprocessing. train_test_split draws
# its shuffle from random_state and the number of rows, so this selects the
# same train/test rows as splitting an already-scaled copy would.
X_train_team_raw, X_test_team_raw, y_train_team, y_test_team = train_test_split(
    X_team, y_team, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply those training
# means/variances to the test rows. Fitting on the full dataset before the
# split leaks test-set statistics into training (data leakage) and makes the
# evaluation optimistic. Metrics may shift slightly versus earlier runs.
scaler_team = StandardScaler()
X_train_team = scaler_team.fit_transform(X_train_team_raw)
X_test_team = scaler_team.transform(X_test_team_raw)

# Full feature matrix scaled with the training-set statistics; kept under its
# original name in case other cells reference it.
X_team_scaled = scaler_team.transform(X_team)


In [9]:
# Fit a logistic-regression classifier on the team training split
# (fit() returns the estimator itself, so the call can be chained).
model_team = LogisticRegression(random_state=42).fit(X_train_team, y_train_team)

# Predicted win/loss labels for the held-out team rows.
y_pred_team = model_team.predict(X_test_team)

# Per-class precision / recall / F1 on the test split.
team_report = classification_report(y_test_team, y_pred_team)
print("Classification Report for Team Data:", team_report, sep="\n")


Classification Report for Team Data:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      3507
           1       0.87      0.87      0.87      3564

    accuracy                           0.87      7071
   macro avg       0.87      0.87      0.87      7071
weighted avg       0.87      0.87      0.87      7071



Player dataset: repeat the same logistic-regression workflow on the player-level data.

In [10]:
# Read the player-level table from parquet.
player_parquet_path = 'parquet_data/player_df_missing_handled.parquet'
player_df_dropped_row = pd.read_parquet(player_parquet_path)

# Leading character of the result string -> binary label.
player_outcome_labels = {'W': 1, 'L': 0}

# Derive the binary target: 1 for results starting with 'W', 0 for results
# starting with 'L', and None for non-strings or any other prefix
# (dict.get returns None for an unknown or empty leading character).
player_df_dropped_row['Success'] = player_df_dropped_row['Result'].apply(
    lambda result: player_outcome_labels.get(result[:1]) if isinstance(result, str) else None
)

# Keep only the rows whose result could be labelled as a win or a loss.
player_df_dropped_row = player_df_dropped_row.dropna(subset=['Success'])


In [11]:
# Predictor columns used by the player-level model.
player_feature_columns = [
    'S', 'Kills', 'Errors', 'Total Attacks', 'Hit Pct', 'Assists',
    'SErr', 'Digs', 'Block Assists', 'PTS',
]

# Feature matrix (X_player): the selected columns, minus any row with a
# missing value.
X_player = player_df_dropped_row[player_feature_columns].dropna()

# Target (y_player): 'Success', restricted to the index labels that survived
# the dropna above so features and labels stay row-aligned.
y_player = player_df_dropped_row['Success'][X_player.index]

# Standardize every feature column.
# NOTE: the scaler is fit on all rows here, i.e. before any train/test split.
scaler_player = StandardScaler()
X_player_scaled = scaler_player.fit(X_player).transform(X_player)

In [12]:
# Split the *unscaled* player features. X_player_scaled is deliberately not
# used here: it was standardized with statistics from every row, so splitting
# it would leak test-set means/variances into training. train_test_split
# draws its shuffle from random_state and the number of rows, so the same
# rows land in train and test as before.
X_train_player_raw, X_test_player_raw, y_train_player, y_test_player = train_test_split(
    X_player, y_player, test_size=0.2, random_state=42
)

# Fit a fresh scaler on the training rows only and reuse its statistics for
# the held-out rows. Metrics may shift slightly versus earlier runs.
scaler_player = StandardScaler()
X_train_player = scaler_player.fit_transform(X_train_player_raw)
X_test_player = scaler_player.transform(X_test_player_raw)

# Train Logistic Regression model for player data
model_player = LogisticRegression(random_state=42)
model_player.fit(X_train_player, y_train_player)

# Predict on test data for player data
y_pred_player = model_player.predict(X_test_player)

# Print classification report for the player dataset
print("Classification Report for Player Data:")
print(classification_report(y_test_player, y_pred_player))


Classification Report for Player Data:
              precision    recall  f1-score   support

           0       0.61      0.45      0.52      2682
           1       0.65      0.77      0.70      3479

    accuracy                           0.63      6161
   macro avg       0.63      0.61      0.61      6161
weighted avg       0.63      0.63      0.62      6161

