In [8]:
#16 cells

#basics for python
import numpy as np
import pandas as pd
from datetime import date, timedelta

#machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

#model evaluation
from sklearn.metrics import confusion_matrix

#html processing
import requests as rq
from bs4 import BeautifulSoup as bs
import re #regular expressions
import calendar as cal
from datetime import date, datetime

In [9]:
# initializing date list
start_date = date(2024, 5, 14)
yesterday = date.today() - timedelta(days=1)
delta = (date.today() - start_date) #using "today" to make the dates work out right
season_end = '20240919'

if int(yesterday.strftime('%Y%m%d')) < int(season_end):
    date_generated = pd.date_range(start_date, periods=delta.days, freq = '1D')
    string_dates = date_generated.strftime('%Y%m%d')

#regular season ends september 29
print('string dates =', string_dates)

string dates = Index(['20240514', '20240515', '20240516', '20240517', '20240518', '20240519',
       '20240520', '20240521', '20240522', '20240523', '20240524', '20240525',
       '20240526', '20240527', '20240528', '20240529', '20240530', '20240531',
       '20240601', '20240602', '20240603'],
      dtype='object')


In [10]:
#Set up the DF for the scraped data
game_cols = ['Year'
             , 'Month'
             , 'Day of Month'
             , 'Day of Week'
             #game time could be useful, but I'll only have it if I have been tracking since the beginning
             , 'Home Team'
             , 'Away Team'
             , 'Home team home wins'
             , 'Home team away wins'
             , 'Home team home losses'
             , 'Home team away losses'
             , 'Away team home wins'
             , 'Away team away wins'
             , 'Away team home losses'
             , 'Away team away losses'  
             , 'Home team overall wins'
             , 'Home team overall losses'
             , 'Away team overall wins'
             , 'Away team overall losses'
             #other stats
             #haha team ov/hi/aw points/rebounds/baskets/steals
             , 'Home_Win_Flag' #This is a binary. I don't need an "away win" flag.
            ]

game_data = pd.DataFrame(columns = game_cols)

In [11]:
print(game_data.shape)

(0, 19)


In [12]:
#initialize everything
records_dict = {}
outer_break = False
index = 0 #where to start adding to the DF

#initialize the records
for sub in ['overall_records', 'home_records', 'away_records']:
    records_dict[sub] = {}

In [13]:
#month conversion
    #counts starting at Janary = 1
month_name_to_num = {name: num for num, name in enumerate(cal.month_name) if num}
print(month_name_to_num)

#day_of_week_conversion
    #counts starting at Sunday = 1
day_name_to_num = {name: ((num+1) % 7)+1 for num, name in enumerate(cal.day_name)}
print(day_name_to_num)

{'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6, 'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}
{'Monday': 2, 'Tuesday': 3, 'Wednesday': 4, 'Thursday': 5, 'Friday': 6, 'Saturday': 7, 'Sunday': 1}


In [14]:
#SCRAPE THE DATA
#also, could include record against opposing team
    #wins vs opponent
    #losses vs opponent
    #points
    #rebounds
    #steals
    #baskets
    #possession percentage
outer_break = False
for day in string_dates:#['20240525']
    
    if outer_break: #this is set to True if the game has not been played yet.
        break #don't want to keep trying to process if no more games
        
    test_url = "https://www.espn.com/wnba/schedule/_/date/" + day

    #set the header so that espn doesn't reject me directly
    header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"} #have to do this or else ESPN will block me for being a python script and not a user.
    request = rq.get(test_url, headers = header) #assign my request to a variable

    #get the html of the page
    html_text = bs(request.text, 'html') #See the HTML code of the whole page

    #get the individual responsive tables on the page. These are many. I only want the first one.
    responsive_table = html_text.find('div', class_ = "ResponsiveTable")
    table_date = responsive_table.find('div', class_ = "Table__Title").text
    day_of_week_name, nothing1, month_name, day_num, nothing2, year_num, nothing3 = re.split(r'[,| ]', table_date)
    day_of_week_num = day_name_to_num[day_of_week_name] #convert string name to number in week
    month_of_year_num = month_name_to_num[month_name] #convert string name to number in week
    games = responsive_table.find_all('tr', class_ = 'Table__TR Table__TR--sm Table__even')
    
    for game in games:
        #Get away team
        away_cell = game.find('td', class_ = 'events__col Table__TD')
        away_team = away_cell.find('span', {'class': 'Table__Team away'})
        away_team_tag = away_team.find('a', class_ = 'AnchorLink', tabindex = '0', href = re.compile('/wnba/team/_/name/'))
        away_team_tag_href = away_team_tag.get('href')
        split = re.split('/', away_team_tag_href)
        away_team = split[5].upper()

        #Get home team
        home_cell = game.find('td', class_ = 'colspan__col Table__TD')
        home_team = home_cell.find('span', {'class': 'Table__Team'})
        home_team_tag = home_team.find('a', class_ = 'AnchorLink', tabindex = '0', href = re.compile('/wnba/team/_/name/'))
        home_team_tag_href = home_team_tag.get('href')
        split = re.split('/', home_team_tag_href)
        home_team = split[5].upper()

#this part fails if no games happen on a day
        #Parse the box score
        game_results = game.find_all('td', class_ = 'teams__col Table__TD')
        try:
            score_line = game_results[0].find('a', class_ = 'AnchorLink').text
        except IndexError: #there no games to create scores
            outer_break = True
            break
        if len(re.split(r'[,| ]', score_line)) == 5:
            win_team, win_score, nothing1, lose_team, lose_score = re.split(r'[,| ]', score_line)
        
        #if the team doesn't exist in the record dicts then create it
        for sub in ['overall_records', 'home_records', 'away_records']:
            if home_team not in records_dict[sub].keys():
                records_dict[sub][home_team] = [0,0]
            if away_team not in records_dict[sub].keys():
                records_dict[sub][away_team] = [0,0]

        #i think that knowing home vs away winner will be important to the analysis
        home_win_flag = 0 #default
        #if the home team wins then change the flag to 1
        if win_team == home_team:
            home_win_flag = 1
            records_dict['overall_records'][home_team][0] += 1 #home team registers a win
            records_dict['home_records'][home_team][0] += 1 #home team registers a home win
            records_dict['overall_records'][away_team][1] += 1 #away team registers a loss
            records_dict['away_records'][away_team][1] += 1 #away team registers an away loss
        elif win_team == away_team:
            records_dict['overall_records'][home_team][1] += 1 #home team registers a loss
            records_dict['home_records'][home_team][1] += 1 #home team registers a home loss
            records_dict['overall_records'][away_team][0] += 1 #away team registers a win
            records_dict['away_records'][away_team][0] += 1 #away team registers an away win
        else:
            continue

#######################                #append to the DF of the games

        num_rows = game_data.shape[0]
        game_data.loc[(num_rows + 1)] = [year_num
            , month_of_year_num
            , day_num                             
            , day_of_week_num
            , home_team
            , away_team
            , records_dict['home_records'][home_team][0] #home team home wins
            , records_dict['away_records'][home_team][0] #Home team away wins
            , records_dict['home_records'][home_team][1] #Home team home losses
            , records_dict['away_records'][home_team][1] #Home team away losses
            , records_dict['home_records'][away_team][0] #Away team home wins
            , records_dict['away_records'][away_team][0] #Away team away wins
            , records_dict['home_records'][away_team][1] #Away team home losses
            , records_dict['away_records'][away_team][1] #Away team away losses
            , records_dict['overall_records'][home_team][0] #Home team overall wins
            , records_dict['overall_records'][home_team][1] #Home team overall losses
            , records_dict['overall_records'][away_team][0] #Away team overall wins
            , records_dict['overall_records'][away_team][1] #Away team overall losses
            , home_win_flag
        ]
        if game_data.shape[0] % 20 == 0:
            print('game data shape =', game_data.shape)
        outer_break = False
print('game data shape =', game_data.shape)
print('game data df complete')

game data shape = (20, 19)
game data shape = (40, 19)
game data shape = (49, 19)
game data df complete


In [15]:
#gotta ohe the home and away team names
game_data_ohe = pd.get_dummies(data = game_data, columns = ['Home Team', 'Away Team'])
print(game_data_ohe.shape)
print(game_data_ohe.tail(10))

(49, 41)
    Year  Month Day of Month  Day of Week  Home team home wins  \
40  2024      5           30            5                    1   
41  2024      5           31            6                    2   
42  2024      5           31            6                    5   
43  2024      5           31            6                    4   
44  2024      5           31            6                    2   
45  2024      6            1            7                    1   
46  2024      6            2            1                    2   
47  2024      6            2            1                    3   
48  2024      6            2            1                    5   
49  2024      6            2            1                    3   

    Home team away wins  Home team home losses  Home team away losses  \
40                    2                      3                      1   
41                    2                      1                      1   
42                    2                      

In [16]:
#Here is where I need to start feeding in predictions.
#only after I have expanded to the full OHE

def record_for_pred(away_team, home_team, date_string = date.today()):
    string_date = date_string.strftime('%Y%m%d')
    pred_dict = {
        'Home Team_' + home_team: 1 #flag
        , 'Away Team_' + away_team: 1 #flag
        , 'Year': int(string_date[:4]) #first four digits
        , 'Month': int(string_date[4:6]) #next two digits
        , 'Day of Month': int(string_date[6:8])#next two digits
        , 'Day of Week': day_name_to_num[date_string.strftime('%A')]
        , 'Home team home wins': records_dict['home_records'][home_team][0]
        , 'Home team away wins': records_dict['away_records'][home_team][0]
        , 'Home team home losses': records_dict['home_records'][home_team][1]
        , 'Home team away losses': records_dict['away_records'][home_team][1]
        , 'Away team home wins': records_dict['home_records'][away_team][0]
        , 'Away team away wins': records_dict['away_records'][away_team][0]
        , 'Away team home losses': records_dict['home_records'][away_team][1]
        , 'Away team away losses': records_dict['away_records'][away_team][1]
        , 'Home team overall wins': records_dict['overall_records'][home_team][0]
        , 'Home team overall losses': records_dict['overall_records'][home_team][1]
        , 'Away team overall wins': records_dict['overall_records'][away_team][0]
        , 'Away team overall losses': records_dict['overall_records'][away_team][1]
    }
    return pred_dict

In [17]:
#Set up the input games for prediction
df_for_preds = pd.DataFrame(columns = game_data_ohe.columns)
games_list = [
    ['WSH', 'CONN']
    ,['NY', 'CHI']
    ,['PHX', 'SEA']
]

#generate the recordsot insert for predictions
for i, game in enumerate(games_list):

    #parse the incoming list
    away_team = game[0]
    home_team = game[1]

    #try to use the function
    try:
        new_record = record_for_pred(away_team, home_team)
    #if it fails then an OHE-generated column is missing
    #so create column
    except:
        for thing in [away_team, home_team]:
            for rec_type in ['overall_records', 'home_records', 'away_records']:
                if thing not in records_dict[rec_type].keys():
                    print('not in records =', thing)
                    records_dict[rec_type][thing] = [0, 0]
        new_record = record_for_pred(away_team, home_team)
    df_for_preds = pd.concat([df_for_preds, pd.DataFrame(new_record, index = [i])])
       
df_for_preds = df_for_preds.astype(float).fillna(0).astype(int).drop('Home_Win_Flag', axis = 1)
print(df_for_preds)

   Year  Month  Day of Month  Day of Week  Home team home wins  \
0  2024      6             4            3                    5   
1  2024      6             4            3                    1   
2  2024      6             4            3                    2   

   Home team away wins  Home team home losses  Home team away losses  \
0                    3                      0                      0   
1                    2                      3                      2   
2                    4                      1                      1   

   Away team home wins  Away team away wins  ...  Away Team_CONN  \
0                    0                    0  ...               0   
1                    5                    2  ...               0   
2                    3                    1  ...               0   

   Away Team_DAL  Away Team_IND  Away Team_LA  Away Team_LV  Away Team_MIN  \
0              0              0             0             0              0   
1              0 

In [18]:
a = game_data_ohe.copy().columns
b = df_for_preds.copy().columns
print(len(a))
print(len(b))
df_for_preds[a.difference(b)] = 0 #add the missing columns
df_for_preds = df_for_preds.copy()
game_data_ohe[b.difference(a)] = 0 #add the missing columns
game_data_ohe = game_data_ohe.copy()
print(game_data_ohe.shape) #will eventually split off the home win flag column
print(df_for_preds.shape) #have to remove the home win flag column to do predictions

41
40
(49, 41)
(3, 41)


In [19]:
#set up the training data, the training labels, and the data to use for predictions
x_train = game_data_ohe.drop('Home_Win_Flag', axis = 1)
print(x_train.shape)
y_train = game_data_ohe['Home_Win_Flag']
new_df_for_preds = df_for_preds.drop('Home_Win_Flag', axis = 1)
print(new_df_for_preds.shape)

#x_test = game_data_ohe[-60:-30].drop('Home_Win_Flag', axis = 1)
#y_test = game_data_ohe[-60:-30]['Home_Win_Flag']

(49, 40)
(3, 40)


In [20]:
#random forest classifier
mlb_rf1 = RandomForestClassifier(n_estimators = 200, n_jobs = -2, random_state = 42) #no hyperparameter testing or tuning
mlb_rf1.fit(x_train, y_train)

In [21]:
#check that all the columns are in the same order
print((x_train.columns == new_df_for_preds.columns).sum())
new_df_for_preds = new_df_for_preds[x_train.columns]
print((x_train.columns == new_df_for_preds.columns).sum())

40
40


In [22]:
y_pred = mlb_rf1.predict(new_df_for_preds)
#conf_mat = confusion_matrix(y_test, y_pred)
#print(conf_mat)
#print(mlb_rf1.score(x_test, y_test))
print(y_pred)
#75 to 80 percent accuracy on default Random Forest

[1 0 1]


In [23]:
for i, game in enumerate(games_list):
    #can just do value
    print(game[y_pred[i]]) #use i in y_pred to find Home_Win_Flag prediction.

CONN
NY
SEA
