## Data Collection and Cleaning

In [3]:
!pip install numpy



First we get a collection of the last years worth of player injuries 

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

def nba_injuries(start_date="2022-01-01", end_date="2023-01-01", player="", team=""):
    try:
        player = player.replace(" ", "+")
        base_url = "https://www.prosportstransactions.com/basketball/Search/SearchResults.php"
        css_selector = ".datatable"
        start = 0
        all_pages_data = []

        while True:
            params = {
                "Player": player,
                "Team": team,
                "BeginDate": start_date,
                "EndDate": end_date,
                "ILChkBx": "yes",
                "InjuriesChkBx": "yes",
                "PersonalChkBx": "yes",
                "Submit": "Search",
                "start": start
            }

            response = requests.get(base_url, params=params)
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.select_one(css_selector)

            if table:
                df = pd.read_html(str(table))[0]
                df.columns = df.iloc[0]  # Set the first row as column names
                df = df[1:]  # Remove the first row
                all_pages_data.append(df)

                # Check for the 'Next' link - if it doesn't exist, break the loop
                if not soup.find('a', text='Next'):
                    break
                
                start += 25  # Update start for next page
            else:
                break

        # Combine all pages data into one DataFrame
        if all_pages_data:
            final_df = pd.concat(all_pages_data, ignore_index=True)
            final_df['Date'] = pd.to_datetime(final_df['Date'])
            final_df['Acquired'] = final_df['Acquired'].str.replace("… ", "")
            final_df['Relinquished'] = final_df['Relinquished'].str.replace("… ", "")
            return final_df
        else:
            print("No data found for the given parameters.")
            return pd.DataFrame()  # Return empty DataFrame if no data found

    except Exception as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return empty DataFrame in case of error

# Example usage
df = nba_injuries()


  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.find('a', text='Next'):
  df = pd.read_html(str(table))[0]
  if not soup.fi

In [5]:
df.to_csv('out.csv', index=False) 

In [6]:
# Convert 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])
current_date = pd.to_datetime('2023-01-01')

# Melt the DataFrame to unify 'Relinquished' and 'Acquired' under a single 'Player' column
latest_dates = df.melt(id_vars='Date', value_vars=['Relinquished', 'Acquired'], var_name='Status', value_name='Player')
latest_dates = latest_dates.dropna(subset=['Player']).sort_values(by='Date', ascending=False)
latest_dates = latest_dates.drop_duplicates(subset=['Player'], keep='first')

# Create a 'Got Injured' column: 1 if 'Relinquished', 0 if 'Acquired'
latest_dates['Got Injured'] = (latest_dates['Status'] == 'Relinquished').astype(int)

# Aggregate original data for players listed in 'Relinquished'
grouped_df = df.dropna(subset=['Relinquished']).groupby('Relinquished').agg({
    'Date': lambda x: (current_date - max(x)).days,
    'Notes': lambda x: list(x.dropna())
}).reset_index()

# Merge the aggregated data with the latest_dates to include the 'Got Injured' status
grouped_df = grouped_df.merge(latest_dates[['Player', 'Got Injured']], left_on='Relinquished', right_on='Player', how='left')

# Rename columns for clarity and avoid deleting necessary columns
grouped_df = grouped_df.rename(columns={'Relinquished': 'Player', 'Date': 'Days Since Last Injury', 'Notes': 'List of Past Injuries','Player':'DROP'})
grouped_df.drop(columns = ["DROP"],inplace = True)
grouped_df

Unnamed: 0,Player,Days Since Last Injury,List of Past Injuries,Got Injured
0,• (Sean) Chris Smith,284,"[sprained left knee (DTD), surgery on left kne...",0
1,• A.J. Green,64,[placed on IL with fractured nose],0
2,• Aaron Gordon,5,"[placed on IL with left hamstring injury, plac...",0
3,• Aaron Holiday,310,[placed on IL with sore right ankle],0
4,• Aaron Nesmith,9,"[placed on IL with sprained right ankle, place...",0
...,...,...,...,...
483,• Zach Collins,323,[placed on IL with left ankle injury],0
484,• Zach LaVine,65,"[placed on IL with left knee injury, placed on...",0
485,• Zeke Nnaji,53,"[placed on IL with right hamstring injury, pla...",0
486,• Ziaire Williams,74,"[placed on IL with sore left knee, sore right ...",1


In [7]:
injury_types = ["core", "upper body", "ligament", "foot", "back"]

def filter_injury_types(injuries_list):
    """Filters a list of injuries to include only predefined injury types."""
    filtered_types = []
    for injury in injuries_list:
        for injury_type in injury_types:
            if injury_type in injury.lower():
                filtered_types.append(injury_type)
    return filtered_types

#grouped_df['Injury Types'] = grouped_df['List of Past Injuries'].apply(has_injury_type)

# Apply the function to filter and extract relevant injury types
grouped_df['Filtered Injuries'] = grouped_df['List of Past Injuries'].apply(filter_injury_types)

# Explode the list of filtered injuries into separate rows
exploded_df = grouped_df.explode('Filtered Injuries')

# Create dummy variables for each injury type
injury_dummies = pd.get_dummies(exploded_df['Filtered Injuries'])
grouped_df = grouped_df.join(injury_dummies.groupby(level=0).sum())

# Ensure all injury types are represented in the DataFrame
for injury_type in injury_types:
    if injury_type not in grouped_df.columns:
        grouped_df[injury_type] = 0

# Optionally, select only the injury type columns
grouped_df = grouped_df[['Player', 'Days Since Last Injury',"Got Injured"] + injury_types]


# Display the updated DataFrame
print(grouped_df)

                   Player  Days Since Last Injury  Got Injured  core  \
0    • (Sean) Chris Smith                     284            0     0   
1            • A.J. Green                      64            0     0   
2          • Aaron Gordon                       5            0     0   
3         • Aaron Holiday                     310            0     0   
4         • Aaron Nesmith                       9            0     0   
..                    ...                     ...          ...   ...   
483        • Zach Collins                     323            0     0   
484         • Zach LaVine                      65            0     0   
485          • Zeke Nnaji                      53            0     0   
486     • Ziaire Williams                      74            1     0   
487     • Zion Williamson                      10            0     0   

     upper body  ligament  foot  back  
0             0         0     0     0  
1             0         0     0     0  
2             0

In [8]:
grouped_df['Player'] = grouped_df['Player'].str.replace('•', '')
#grouped_df["Got Injured"] = grouped_df['Days Since Last Injury'] <= 3
#grouped_df["Got Injured"] = grouped_df["Got Injured"].astype(int)
grouped_df.head()

Unnamed: 0,Player,Days Since Last Injury,Got Injured,core,upper body,ligament,foot,back
0,(Sean) Chris Smith,284,0,0,0,0,0,0
1,A.J. Green,64,0,0,0,0,0,0
2,Aaron Gordon,5,0,0,0,0,1,0
3,Aaron Holiday,310,0,0,0,0,0,0
4,Aaron Nesmith,9,0,0,0,0,1,0


In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def fetch_last_5_games_data(player_url):
    response = requests.get(player_url)
    if response.status_code != 200:
        print(f"Failed to retrieve the web page. Status code: {response.status_code}")
        return None
    if response.status_code == 429:
            retry_after = int(response.headers.get("Retry-After", 10))  # Default to 30 seconds if header is missing
            print(f"Rate limit reached, retrying after {retry_after} seconds")
            print("Timer:", retry_after)
            time.sleep(retry_after)
            

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the div containing the last 5 games data
    last5_div = soup.find('div', id='all_last5')
    if not last5_div:
        print("Last 5 games div not found.")
        return None

    # The table might be directly inside the div
    table = last5_div.find('table')
    if not table:
        print("Last 5 games table not found.")
        return None

    # Parse the table with pandas
    df = pd.read_html(str(table))[0]
    
    # Clean and process the DataFrame to have the desired columns
    
    df['GameLocation'] = df.apply(lambda row: row['Opp'].replace('@', '').strip() 
                                  if '@' in row['Opp'] else row['Team'], axis=1)

    # Clean the 'MP' column, which represents minutes played
    df['MP'] = df['MP']

    # Select and rename the relevant columns
    relevant_data = df[['Date', 'MP', 'GameLocation']].rename(columns={
        'MP': 'TimePlayed'
    })

    return relevant_data

# Example usage
player_url = "https://www.basketball-reference.com/players/g/greenje02.html"
last_5_games_df = fetch_last_5_games_data(player_url)
print(last_5_games_df)



         Date  TimePlayed GameLocation
0  2024-04-14          12          HOU
1  2024-04-12          17          HOU
2  2024-04-11          22          HOU
3  2024-04-09          17          HOU
4  2024-04-07          17          HOU


  df = pd.read_html(str(table))[0]


In [10]:
import requests
from bs4 import BeautifulSoup

def search_for_player_page(player_name):
    # Encode the player's name for use in a URL
    query = player_name.replace(' ', '+')

    # The base URL for the basketball-reference search
    search_url = f'https://www.basketball-reference.com/search/search.fcgi?search={query}'
    
    # Perform the search request
    response = requests.get(search_url)
    
    # If the response URL is a player page, return it directly
    if 'players' in response.url:
        return response.url
    
    if response.status_code != 200:
        print(f"Failed to retrieve the search page. Status code: {response.status_code}")
        return None

    # Parse the search results page to find links
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Look for the div that contains the search results
    search_results = soup.find('div', id='players')
    
    if search_results:
        # Attempt to find an anchor element within the search results
        player_link = search_results.find('a')
        
        if player_link and 'href' in player_link.attrs:
            # Construct the full URL to the player's page
            player_page_url = 'https://www.basketball-reference.com' + player_link['href']
            return player_page_url
        else:
            print("No player link found in the search results.")
            return None
    else:
        print("No search results found for the player.")
        return None

# Example usage
player_name = "LeBron James"
player_page_url = search_for_player_page(player_name)
print(f"Player page URL: {player_page_url}")

Player page URL: https://www.basketball-reference.com/players/j/jamesle01.html


In [10]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

def fetch_player_info(player_url):
    response = requests.get(player_url)
    if response.status_code != 200:
        return f"Failed to retrieve the web page. Status code: {response.status_code}"

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extracting the player's name from the <h1> tag
    player_name = soup.select_one('.players h1').text.strip() if soup.select_one('.players h1') else "Unknown"

    # Extracting the position, height, weight, and birth date from the player's meta information
    meta_info = soup.find('div', id='meta')

    # Position
    # Position
    position_tag = meta_info.find('strong', text=lambda x: x and 'Position:' in x)
    position = position_tag.next_sibling.strip() if position_tag and position_tag.next_sibling else "Unknown"


    # Height and Weight
    
    height_weight_text = meta_info.find('span', text=lambda x: x and 'lb' in x)
    height = height_weight_text.next_sibling.strip() if height_weight_text else "Unknown"
    weight = height_weight_text.text.strip() if height_weight_text else "Unknown"
    
    # Birth date for age calculation
    birth_date_tag = meta_info.find('span', id='necro-birth')
    birth_date_str = birth_date_tag['data-birth'] if birth_date_tag else "Unknown"

    if birth_date_str != "Unknown":
        birth_date = datetime.strptime(birth_date_str, '%Y-%m-%d').date()
        today = datetime.now().date()
        age = today.year - birth_date.year - ((today.month, today.day) < (birth_date.month, birth_date.day))
    else:
        age = "Unknown"

    # Weight
    weight_match = re.search(r"\d+", weight)
    weight = int(weight_match.group()) if weight_match else "Unknown"

    # Experience
    experience = meta_info.find('strong', text='Experience:').next_sibling.strip() if meta_info.find('strong', text='Experience:') else "Unknown"
    experience_match = re.search(r"\d+", experience)
    experience = int(experience_match.group()) if experience_match else "Unknown"

    player_data = {
        'Name': player_name,
        'Position': position.split(",")[0],
        'Height': int(re.search(r"(\d+)cm", height).group(1)),
        'Weight': weight,
        'Age': age,
        'Experience': experience
    }

    return player_data

# Example usage
player_url = "https://www.basketball-reference.com/players/j/jamesle01.html"
player_info = fetch_player_info(player_url)
print(player_info)
print(type(player_info))


{'Name': 'LeBron James', 'Position': 'Small Forward', 'Height': 206, 'Weight': 250, 'Age': 39, 'Experience': 20}
<class 'dict'>


  position_tag = meta_info.find('strong', text=lambda x: x and 'Position:' in x)
  height_weight_text = meta_info.find('span', text=lambda x: x and 'lb' in x)
  experience = meta_info.find('strong', text='Experience:').next_sibling.strip() if meta_info.find('strong', text='Experience:') else "Unknown"


In [9]:
import time
import pandas as pd

basic_info = pd.DataFrame(columns=['Name', 'Position', 'Height', 'Weight', 'Age', 'Experience'])
last_five_games = pd.DataFrame(columns=['Name', 'Last_5_Games'])

# Default values for last 5 games, assuming you expect 5 entries
default_game_data = [None] * 5  # Replace None with a more appropriate default if needed

for name in grouped_df["Player"]:
    time.sleep(10)  
    print(f"Processing {name}")
    my_url = search_for_player_page(name)
    if not my_url:
        continue
    my_basic_info = fetch_player_info(my_url)
    my_five_games = fetch_last_5_games_data(my_url)

    # If data is fetched, use it, otherwise stick with defaults
    actual_game_data = list(my_five_games["TimePlayed"]) if my_five_games is not None else default_game_data

    # Ensure the list is always of length 5
    actual_game_data = actual_game_data[:5] + [None] * (5 - len(actual_game_data))

    # Dictionary for DataFrame
    # Proposed addendum: five_games_df = {"Name": [name], "Last_5_Games": [actual_game_data], "Location of Games": [my_five_games["Location"]]}
    five_games_df = {"Name": [name], "Last_5_Games": [actual_game_data]}
    my_basic_info = {key: [value] for key, value in my_basic_info.items()}
    
    # Create and append to DataFrame as before
    basic_info = pd.concat([basic_info, pd.DataFrame(my_basic_info)], ignore_index=True)
    last_five_games = pd.concat([last_five_games, pd.DataFrame(five_games_df)], ignore_index=True)

# Save DataFrames to CSV
basic_info.to_csv("basic_info.csv", encoding='utf-8', index=False)
last_five_games.to_csv("last_five_games.csv", encoding='utf-8', index=False)


Processing  (Sean) Chris Smith
No search results found for the player.
Processing  A.J. Green


  position_tag = meta_info.find('strong', text=lambda x: x and 'Position:' in x)
  height_weight_text = meta_info.find('span', text=lambda x: x and 'lb' in x)
  experience = meta_info.find('strong', text='Experience:').next_sibling.strip() if meta_info.find('strong', text='Experience:') else "Unknown"
  df = pd.read_html(str(table))[0]


KeyboardInterrupt: 

In [11]:
basic_info = pd.read_csv("basic_info.csv") # run if you accidentally  closed the editor lol
last_5_games_df = pd.read_csv("last_five_games.csv")
basic_info

Unnamed: 0,Name,Position,Height,Weight,Age,Experience
0,A.J. Green,Shooting Guard\n\n\n \n ▪,193,200,24,1
1,Aaron Gordon,Power Forward and Small Forward\n\n\n \n ▪,203,235,28,9
2,Aaron Holiday,Point Guard\n\n\n \n ▪,183,185,27,5
3,Aaron Nesmith,Small Forward\n\n\n \n ▪,196,215,24,3
4,Aaron Wiggins,Shooting Guard\n\n\n \n ▪,198,200,25,2
...,...,...,...,...,...,...
448,Zach Collins,Center and Power Forward\n\n\n \n ▪,211,250,26,5
449,Zach LaVine,Shooting Guard,196,200,29,9
450,Zeke Nnaji,Power Forward\n\n\n \n ▪,206,240,23,3
451,Ziaire Williams,Small Forward\n\n\n \n ▪,206,215,22,2


In [12]:

# Normalize and clean the position strings if they are not lists yet
basic_info['Position'] = basic_info['Position'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# Specify the positions you are interested in
positions = ['Point Guard', 'Shooting Guard', 'Power Forward', 'Small Forward', 'Center']

# Create dummy variables for each position
for position in positions:
    basic_info[position] = basic_info['Position'].apply(lambda x: 1 if position in x else 0)

# Display the result to verify
print(basic_info.head())


            Name                                      Position  Height  \
0     A.J. Green                   Shooting Guard\n\n\n  \n  ▪     193   
1   Aaron Gordon  Power Forward and Small Forward\n\n\n  \n  ▪     203   
2  Aaron Holiday                      Point Guard\n\n\n  \n  ▪     183   
3  Aaron Nesmith                    Small Forward\n\n\n  \n  ▪     196   
4  Aaron Wiggins                   Shooting Guard\n\n\n  \n  ▪     198   

   Weight Age Experience  Point Guard  Shooting Guard  Power Forward  \
0     200  24          1            0               1              0   
1     235  28          9            0               0              1   
2     185  27          5            1               0              0   
3     215  24          3            0               0              0   
4     200  25          2            0               1              0   

   Small Forward  Center  
0              0       0  
1              1       0  
2              0       0  
3             

In [13]:
basic_info

Unnamed: 0,Name,Position,Height,Weight,Age,Experience,Point Guard,Shooting Guard,Power Forward,Small Forward,Center
0,A.J. Green,Shooting Guard\n\n\n \n ▪,193,200,24,1,0,1,0,0,0
1,Aaron Gordon,Power Forward and Small Forward\n\n\n \n ▪,203,235,28,9,0,0,1,1,0
2,Aaron Holiday,Point Guard\n\n\n \n ▪,183,185,27,5,1,0,0,0,0
3,Aaron Nesmith,Small Forward\n\n\n \n ▪,196,215,24,3,0,0,0,1,0
4,Aaron Wiggins,Shooting Guard\n\n\n \n ▪,198,200,25,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
448,Zach Collins,Center and Power Forward\n\n\n \n ▪,211,250,26,5,0,0,1,0,1
449,Zach LaVine,Shooting Guard,196,200,29,9,0,1,0,0,0
450,Zeke Nnaji,Power Forward\n\n\n \n ▪,206,240,23,3,0,0,1,0,0
451,Ziaire Williams,Small Forward\n\n\n \n ▪,206,215,22,2,0,0,0,1,0


In [14]:
grouped_df

Unnamed: 0,Player,Days Since Last Injury,Got Injured,core,upper body,ligament,foot,back
0,(Sean) Chris Smith,284,0,0,0,0,0,0
1,A.J. Green,64,0,0,0,0,0,0
2,Aaron Gordon,5,0,0,0,0,1,0
3,Aaron Holiday,310,0,0,0,0,0,0
4,Aaron Nesmith,9,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...
483,Zach Collins,323,0,0,0,0,0,0
484,Zach LaVine,65,0,0,0,0,0,1
485,Zeke Nnaji,53,0,0,0,0,0,0
486,Ziaire Williams,74,1,0,0,0,0,0


In [15]:
# Run this if you are importing from the csv
import ast
past_games_data_temp = last_5_games_df.copy()

past_games_data_temp['Last_5_Games'] = past_games_data_temp['Last_5_Games'].apply(ast.literal_eval)

# Expand the lists into separate columns
past_games_data_temp[['Game_1', 'Game_2', 'Game_3', 'Game_4', 'Game_5']] = pd.DataFrame(past_games_data_temp['Last_5_Games'].tolist(), index=past_games_data_temp.index)

# Optionally drop the original 'Last_5_Games' column
past_games_data_temp.drop('Last_5_Games', axis=1, inplace=True)

In [16]:
basic_info

Unnamed: 0,Name,Position,Height,Weight,Age,Experience,Point Guard,Shooting Guard,Power Forward,Small Forward,Center
0,A.J. Green,Shooting Guard\n\n\n \n ▪,193,200,24,1,0,1,0,0,0
1,Aaron Gordon,Power Forward and Small Forward\n\n\n \n ▪,203,235,28,9,0,0,1,1,0
2,Aaron Holiday,Point Guard\n\n\n \n ▪,183,185,27,5,1,0,0,0,0
3,Aaron Nesmith,Small Forward\n\n\n \n ▪,196,215,24,3,0,0,0,1,0
4,Aaron Wiggins,Shooting Guard\n\n\n \n ▪,198,200,25,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
448,Zach Collins,Center and Power Forward\n\n\n \n ▪,211,250,26,5,0,0,1,0,1
449,Zach LaVine,Shooting Guard,196,200,29,9,0,1,0,0,0
450,Zeke Nnaji,Power Forward\n\n\n \n ▪,206,240,23,3,0,0,1,0,0
451,Ziaire Williams,Small Forward\n\n\n \n ▪,206,215,22,2,0,0,0,1,0


In [17]:
# Preprocessing data
import torch
import ast
from torch import nn
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import ADASYN




grouped_df['Player'] = grouped_df['Player'].str.upper().str.strip()
basic_info['Name'] = basic_info['Name'].str.upper().str.strip()

filtered_grouped_df = grouped_df[grouped_df['Player'].isin(basic_info['Name'])]
current_game_player_data = basic_info.copy()
past_injuries_data_temp = filtered_grouped_df.copy()
past_games_data_temp = last_5_games_df.copy()

past_games_data_temp['Last_5_Games'] = past_games_data_temp['Last_5_Games'].apply(ast.literal_eval)

# Expand the lists into separate columns
past_games_data_temp[['Game_1', 'Game_2', 'Game_3', 'Game_4', 'Game_5']] = pd.DataFrame(past_games_data_temp['Last_5_Games'].tolist(), index=past_games_data_temp.index)

# Optionally drop the original 'Last_5_Games' column
past_games_data_temp.drop('Last_5_Games', axis=1, inplace=True)

past_games_data = past_games_data_temp
past_injuries_data = past_injuries_data_temp.drop('Got Injured', axis=1)  # Drop the label column to create features
#past_injuries_data.drop('List of Past Injuries', axis = 1, inplace= True)
labels = past_injuries_data_temp['Got Injured']  # Isolate the label column

past_games_data.drop('Name', axis =1, inplace=True)
if past_games_data_temp.isnull().any().any():
    past_games_data_temp.fillna(0, inplace=True)  # Or another appropriate fill value

past_injuries_data.drop('Player', axis =1, inplace = True)
current_game_player_data.drop('Name', axis =1, inplace =True)
current_game_player_data.drop('Position', axis =1, inplace =True)
# Convert 'Age' and 'Experience' to numeric types, coercing errors to NaN (or a default value)
current_game_player_data['Age'] = pd.to_numeric(current_game_player_data['Age'], errors='coerce')
current_game_player_data['Experience'] = pd.to_numeric(current_game_player_data['Experience'], errors='coerce')

# Optionally fill NaNs with a default value, such as 0 or the mean/median of the column
current_game_player_data['Age'].fillna(0, inplace=True)
current_game_player_data['Experience'].fillna(0, inplace=True)

# Convert to integer if that is more appropriate
current_game_player_data['Age'] = current_game_player_data['Age'].astype(int)
current_game_player_data['Experience'] = current_game_player_data['Experience'].astype(int)


# Convert DataFrames/Series to tensors
past_injuries_tensor = torch.tensor(past_injuries_data.values, dtype=torch.float32)
past_games_tensor = torch.tensor(past_games_data.values, dtype=torch.float32)
current_game_player_tensor = torch.tensor(current_game_player_data.values, dtype=torch.float32)
labels_tensor = torch.tensor(labels.values, dtype=torch.long)  # Assuming labels are categorical for CrossEntropyLoss

# Split the data into training and testing sets
train_idx, test_idx = train_test_split(
    range(len(labels_tensor)),
    test_size=0.2,
    stratify=labels_tensor
)

# Creating training and testing datasets using indices
train_dataset = TensorDataset(
    past_injuries_tensor[train_idx],
    past_games_tensor[train_idx],
    current_game_player_tensor[train_idx],
    labels_tensor[train_idx]
)

test_dataset = TensorDataset(
    past_injuries_tensor[test_idx],
    past_games_tensor[test_idx],
    current_game_player_tensor[test_idx],
    labels_tensor[test_idx]
)

def manual_oversample(features_tensors, labels_tensor):
    # Convert tensors to numpy arrays for easier manipulation
    features_np = [tensor.numpy() for tensor in features_tensors]
    labels_np = labels_tensor.numpy()
    
    # Find the unique classes and their counts
    classes, counts = np.unique(labels_np, return_counts=True)
    max_count = np.max(counts)
    
    # Determine the indices to replicate for each class to balance the dataset
    resampled_features = [list() for _ in features_tensors]  # Use lists to collect arrays
    resampled_labels = []

    for cls in classes:
        cls_indices = np.where(labels_np == cls)[0]
        oversample_indices = np.random.choice(cls_indices, size=max_count, replace=True)
        resampled_labels.extend(labels_np[oversample_indices])
        for i, feature_set in enumerate(features_np):
            resampled_features[i].extend(feature_set[oversample_indices])

    # Convert list of arrays into correctly shaped numpy arrays and then to tensors
    resampled_features_tensors = [torch.tensor(np.array(feature), dtype=torch.float32) for feature in resampled_features]
    resampled_labels_tensor = torch.tensor(resampled_labels, dtype=torch.long)
    
    return resampled_features_tensors, resampled_labels_tensor

# Apply manual oversampling to the training data
resampled_features_tensors, resampled_labels_tensor = manual_oversample(
    [past_injuries_tensor[train_idx], past_games_tensor[train_idx], current_game_player_tensor[train_idx]],
    labels_tensor[train_idx]
)

# Create the new balanced training dataset
balanced_train_dataset = TensorDataset(
    *resampled_features_tensors,  # Unpack the list of tensors
    resampled_labels_tensor
)


: 

: 

In [None]:
current_game_player_tensor.shape

torch.Size([453, 9])

In [None]:
# Trying and implementing SMOTE
import torch

# Count class occurrences
unique, counts = labels_tensor.unique(return_counts=True)
class_distribution = dict(zip(unique.tolist(), counts.tolist()))

# Identify the minority class
minority_class = min(class_distribution, key=class_distribution.get)

minority_indices = (labels_tensor == minority_class).nonzero(as_tuple=True)[0]

# Tensors of the minority class
minority_injuries = past_injuries_tensor[minority_indices]
minority_games = past_games_tensor[minority_indices]
minority_players = current_game_player_tensor[minority_indices]

from sklearn.neighbors import NearestNeighbors
import numpy as np
def smote(data, n_samples, k=5):
    n_minority_samples, n_features = data.shape
    synthetic_samples = np.zeros((n_samples, n_features))

    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(data)
    
    for i in range(n_samples):
        idx = np.random.randint(0, n_minority_samples)
        sample = data[idx]
       
        print("Sample shape:", sample.shape)  
        # Ensure that the sample is properly shaped
        sample = sample.reshape(1, -1)
        _, nn_indices = neigh.kneighbors(sample)
        print('here')
        nn_idx = np.random.choice(nn_indices[0])
        print(nn_idx)
        # Create the synthetic sample
        diff = data[nn_idx] - data[idx]
        synthetic_samples[i] = data[idx] + np.random.rand() * diff
    
    return synthetic_samples

# Number of samples to generate
n_samples = 100  # adjust based on your requirement

# Convert tensors to numpy arrays for SMOTE
minority_injuries_np = minority_injuries.numpy()
minority_games_np = minority_games.numpy()
minority_players_np = minority_players.numpy()


synthetic_injuries = smote(minority_injuries_np, n_samples)
synthetic_games = smote(minority_games_np, n_samples)
synthetic_players = smote(minority_players_np, n_samples)

synthetic_injuries_tensor = torch.tensor(synthetic_injuries, dtype=torch.float32)
synthetic_games_tensor = torch.tensor(synthetic_games, dtype=torch.float32)
synthetic_players_tensor = torch.tensor(synthetic_players, dtype=torch.float32)
synthetic_labels_tensor = torch.tensor([minority_class] * n_samples, dtype=torch.long)

# Combine the original and synthetic data
combined_injuries_tensor = torch.cat((past_injuries_tensor, synthetic_injuries_tensor), 0)
combined_games_tensor = torch.cat((past_games_tensor, synthetic_games_tensor), 0)
combined_players_tensor = torch.cat((current_game_player_tensor, synthetic_players_tensor), 0)
combined_labels_tensor = torch.cat((labels_tensor, synthetic_labels_tensor), 0)

# Shuffle the dataset
shuffled_indices = torch.randperm(combined_labels_tensor.size(0))

# Re-assign the shuffled data
shuffled_injuries = combined_injuries_tensor[shuffled_indices]
shuffled_games = combined_games_tensor[shuffled_indices]
shuffled_players = combined_players_tensor[shuffled_indices]
shuffled_labels = combined_labels_tensor[shuffled_indices]

# Split the shuffled data into training and testing sets
train_idx, test_idx = train_test_split(
    range(len(shuffled_labels)),
    test_size=0.2,
    stratify=shuffled_labels.numpy()  # Convert tensor to numpy for stratify compatibility
)

# Convert indices to torch tensors
train_idx = torch.tensor(train_idx, dtype=torch.long)
test_idx = torch.tensor(test_idx, dtype=torch.long)

# Create the training and testing datasets using the new indices
train_dataset_smote = TensorDataset(
    shuffled_injuries[train_idx],
    shuffled_games[train_idx],
    shuffled_players[train_idx],
    shuffled_labels[train_idx]
)

test_dataset_smote = TensorDataset(
    shuffled_injuries[test_idx],
    shuffled_games[test_idx],
    shuffled_players[test_idx],
    shuffled_labels[test_idx]
)


Sample shape: (6,)
here
73
Sample shape: (6,)
here
7
Sample shape: (6,)
here
28
Sample shape: (6,)
here
104
Sample shape: (6,)
here
51
Sample shape: (6,)
here
43
Sample shape: (6,)
here
50
Sample shape: (6,)
here
73
Sample shape: (6,)
here
72
Sample shape: (6,)
here
22
Sample shape: (6,)
here
28
Sample shape: (6,)
here
49
Sample shape: (6,)
here
76
Sample shape: (6,)
here
49
Sample shape: (6,)
here
82
Sample shape: (6,)
here
60
Sample shape: (6,)
here
51
Sample shape: (6,)
here
95
Sample shape: (6,)
here
19
Sample shape: (6,)
here
2
Sample shape: (6,)
here
38
Sample shape: (6,)
here
46
Sample shape: (6,)
here
77
Sample shape: (6,)
here
72
Sample shape: (6,)
here
91
Sample shape: (6,)
here
85
Sample shape: (6,)
here
101
Sample shape: (6,)
here
100
Sample shape: (6,)
here
51
Sample shape: (6,)
here
2
Sample shape: (6,)
here
5
Sample shape: (6,)
here
23
Sample shape: (6,)
here
104
Sample shape: (6,)
here
32
Sample shape: (6,)
here
36
Sample shape: (6,)
here
89
Sample shape: (6,)
here
83
S

In [None]:
!pip install matplotlib



In [None]:
# Defining models and helper functions
# Define DataLoader
batch_size = 64
train_loader = DataLoader(train_dataset_smote, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset_smote, batch_size=batch_size, shuffle=True)

class TransformerModel_PI(nn.Module):
    def __init__(self):
        super(TransformerModel_PI, self).__init__()
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=6, nhead=3,dropout = 0.1)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)
        self.init_weights()

    def forward(self, src):
        src = self.encoder_layer(src)
        if torch.isnan(src).any():
            print("NaNs detected after encoder layer")
        encoded = self.transformer_encoder(src)
        if torch.isnan(encoded).any():
            print("NaNs detected after transformer encoder")
        output = encoded.mean(dim=1, keepdim=True)
        if torch.isnan(output).any():
            print("NaNs detected after mean pooling")
        return output
        #encoded = self.transformer_encoder(src)
        #return encoded.mean(dim=1, keepdim=True)  # average pooling

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

class TransformerModel_PG(nn.Module):
    def __init__(self):
        super(TransformerModel_PG, self).__init__()
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=5, nhead=5,dropout = 0.1)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)
        self.init_weights()

    def forward(self, src):
        src = self.encoder_layer(src)
        if torch.isnan(src).any():
            print("NaNs detected after encoder layer")
        encoded = self.transformer_encoder(src)
        if torch.isnan(encoded).any():
            print("NaNs detected after transformer encoder")
        output = encoded.mean(dim=1, keepdim=True)
        if torch.isnan(output).any():
            print("NaNs detected after mean pooling")
        return output
        #encoded = self.transformer_encoder(src)
        #return encoded.mean(dim=1, keepdim=True)  # average pooling

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

class MultilayerPerceptron(nn.Module):
    def __init__(self, input_dim):
        super(MultilayerPerceptron, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 2)  # Binary classification
        self.init_weights()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

def ensure_2d(tensor):
    """ Ensure tensor is 2D, adding a batch dimension if necessary. """
    if tensor.dim() == 1:
        tensor = tensor.unsqueeze(0)  # Add a batch dimension if missing
    return tensor

In [None]:
!pip uninstall torch
!pip install torch


Found existing installation: torch 2.2.2
Uninstalling torch-2.2.2:
  Would remove:
    /Users/michaeldelarosa/opt/anaconda3/bin/convert-caffe2-to-onnx
    /Users/michaeldelarosa/opt/anaconda3/bin/convert-onnx-to-caffe2
    /Users/michaeldelarosa/opt/anaconda3/bin/torchrun
    /Users/michaeldelarosa/opt/anaconda3/lib/python3.9/site-packages/functorch/*
    /Users/michaeldelarosa/opt/anaconda3/lib/python3.9/site-packages/torch-2.2.2.dist-info/*
    /Users/michaeldelarosa/opt/anaconda3/lib/python3.9/site-packages/torch/*
    /Users/michaeldelarosa/opt/anaconda3/lib/python3.9/site-packages/torchgen/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m


In [None]:
# Training and testing loop
import matplotlib.pyplot as plt

train_losses = []
test_losses = []
test_accuracies = []

# Important for unbalanced data
classes = np.unique(labels)
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Initialize the loss function with these weights
#criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
criterion = nn.CrossEntropyLoss()
past_injuries_transformer = TransformerModel_PI()
past_games_transformer = TransformerModel_PG()
mlp = MultilayerPerceptron(input_dim= 11)

parameters = list(past_injuries_transformer.parameters()) + list(past_games_transformer.parameters()) + list(mlp.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.001)

num_epochs = 20  # Define number of epochs
for epoch in range(num_epochs):
    past_injuries_transformer.train()
    past_games_transformer.train()
    mlp.train()
    total_loss = 0
    num_batches = 0

    
    for past_injuries, past_games, current_game_player, label in train_loader:
        past_injuries = ensure_2d(past_injuries)
        past_games = ensure_2d(past_games)
        current_game_player = ensure_2d(current_game_player)
        optimizer.zero_grad()
        past_injuries_encoded = past_injuries_transformer(past_injuries)
        past_games_encoded = past_games_transformer(past_games)

        combined_features = torch.cat((past_injuries_encoded, past_games_encoded, current_game_player), dim=1)
        outputs = mlp(combined_features)
        loss = criterion(outputs, label)
        if torch.isnan(loss):
            print("NaN detected in loss, stopping training")
            break
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(parameters, max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
    train_losses.append(total_loss / num_batches)

    if torch.isnan(loss):
        print(f"Stopped at Epoch {epoch+1} due to NaN in loss")
        break
    # Testing phase
    past_injuries_transformer.eval()
    past_games_transformer.eval()
    mlp.eval()
    test_loss = 0
    correct = 0
    total = 0
    threshold = 0.8  # Define your custom threshold
    with torch.no_grad():
        probabilities = []
        predicted_labels = []
        for past_injuries, past_games, current_game_player, label in test_loader:
            past_injuries = ensure_2d(past_injuries)
            past_games = ensure_2d(past_games)
            current_game_player = ensure_2d(current_game_player)

            past_injuries_encoded = past_injuries_transformer(past_injuries)
            past_games_encoded = past_games_transformer(past_games)

            combined_features = torch.cat((past_injuries_encoded, past_games_encoded, current_game_player), dim=1)
            outputs = mlp(combined_features)
            probas = F.softmax(outputs, dim=1)  # Softmax across the class dimension
            probabilities.extend([tuple(proba.tolist()) for proba in probas])

            # Apply threshold to determine predicted class (for the positive class, usually index 1)
            predictions = (probas[:, 1] > threshold).long()
            predicted_labels.extend(predictions.tolist())

            total += label.size(0)
            correct += (predictions == label).sum().item()
            loss = criterion(outputs, label)
            test_loss += loss.item()


    test_losses.append(test_loss / len(test_loader))
    accuracy = 100 * correct / total
    test_accuracies.append(accuracy)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Test Accuracy: {accuracy:.2f}%')

: 

In [None]:
plt.figure(figsize=(12, 6))

# Plot training and test loss
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(test_losses, label='Test Loss')
plt.title('Training and Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Plot test accuracy
plt.subplot(1, 2, 2)
plt.plot(test_accuracies, label='Test Accuracy')
plt.title('Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.legend()

plt.show()

: 

In [None]:
print(predicted)
print(label)

NameError: name 'predicted' is not defined