In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from datetime import timedelta, date
from time import sleep
import traceback

In [2]:
from dataclasses import dataclass
@dataclass
class Team:
    name: str
    abbrev: str
    wins: int = 0
    losses: int = 0
    elo: float = 1500

In [11]:
from datetime import datetime, timedelta
from math import log2, ceil
'''
    Get game results and elo from schedule
'''




# res = requests.get(SCOREBOARD_URL)
team_dict = {}
def collect_games(url):
    res = requests.get(url)
    espn_games_dict = res.json()

    def K_VALUE(team : Team):
        total_games = team.wins + team.losses
        
        if total_games < 10:
            return 60
        if 10 < total_games < 20:
            return 40
        return 20

    print(len(espn_games_dict['events']))
    for event in espn_games_dict['events']:
        try:
            away_team, home_team = re.split(r' (?:@|VS) ', event['shortName'])
            away_name, home_name = event['name'].split(" at ")
            
            if away_team not in team_dict: team_dict[away_team] = Team(away_name, away_team)
            if home_team not in team_dict: team_dict[home_team] = Team(home_name, home_team)
            
            # TODO create some kind of travel pentalty here
            # print(event['competitions'][0]['competitors'])
            score_home = int(event['competitions'][0]['competitors'][0]['winner'])
            score_away = int(not(score_home))
            
            q_home = 10**(team_dict[home_team].elo/400)
            q_away = 10**(team_dict[away_team].elo/400)
            
            e_home = q_home/(q_home+q_away)
            e_away = q_away/(q_home+q_away)
            
            team_dict[home_team].elo += K_VALUE(team_dict[home_team]) * (score_home - e_home)
            team_dict[away_team].elo += K_VALUE(team_dict[away_team]) * (score_away - e_away)
            
            if score_home == 1: # home team won
                team_dict[home_team].wins += 1
                team_dict[away_team].losses += 1
            else:
                team_dict[home_team].losses += 1
                team_dict[away_team].wins += 1
        except Exception as err:
            with open('err.log', 'a+') as log:
                print('error on url: ', url, file=log)
                print(str(err), file=log)
    

day = date(2023, 11, 6)
while day <= date(2024, 3, 17):
    collect_games(f'https://site.api.espn.com/apis/site/v2/sports/basketball/mens-college-basketball/scoreboard?groups=50&limit=1000&dates={day.strftime('%Y%m%d')}')
    day += timedelta(1)
    
# print(team_dict)
team_df = pd.DataFrame(team_dict.values())
team_df.elo = team_df.elo.round(1)
team_df.sort_values(by=['elo'], ascending=False, ignore_index=True).to_csv('ncaa_teams_elo.csv')


# SCHEDULE_URL = "https://www.espn.com/mens-college-basketball/schedule/_/date/20231106"


184
29
30
48
81
58
35
39
81
37
40
84
53
63
62
53
70
15
64
71
53
22
40
78
28
34
95
40
10
60
69
10
10
118
39
15
32
27
14
13
95
36
36
51
43
81
59
17
4
0
0
3
22
58
101
27
5
36
52
56
10
146
18
13
33
49
57
17
133
16
30
31
44
59
16
145
18
16
33
47
60
9
152
16
17
30
44
65
15
149
17
17
32
52
63
3
155
11
16
25
39
67
15
144
22
20
26
45
64
16
137
27
16
31
58
51
26
136
19
22
43
35
54
26
102
27
14
23
43
52
36
23
5


In [10]:
df = pd.read_csv('ncaa_teams_elo.csv')

     Unnamed: 0                                   name abbrev  wins  losses  \
0             0                        Houston Cougars    HOU    30       4   
1             1                          UConn Huskies   CONN    31       3   
2             2                    Purdue Boilermakers    PUR    29       4   
3             3                    James Madison Dukes    JMU    31       3   
4             4                    Iowa State Cyclones    ISU    27       7   
..          ...                                    ...    ...   ...     ...   
698         698                     Stonehill Skyhawks    STO     4      27   
699         699                           Siena Saints    SIE     4      28   
700         700                    Coppin State Eagles   COPP     2      27   
701         701                   Detroit Mercy Titans   DETM     1      31   
702         702  Mississippi Valley State Delta Devils   MVSU     1      30   

        elo  
0    1756.4  
1    1756.4  
2    1730

In [15]:
import random
from collections import defaultdict

@dataclass
class Match:
    left: Team
    right: Team
    winner: Team = None
    
    def get_winner(self):
        if not self.left and self.right:
            self.winner = self.right
        elif self.left and not self.right:
            self.winner = self.left
        else:
            assert self.left and self.right
            expected_score = (1+10**((self.right.elo - self.left.elo)/400))**-1 # expected chance that left will win
            if random.random() <= expected_score:
                self.winner = self.left
            else:
                self.winner = self.right
                
        return self.winner
    
    def __repr__(self):
        return f"{self.left.abbrev} vs {self.right.abbrev}"
    
class Tournament:
    def __init__(self, team_list, options = defaultdict(lambda: False)):
        assert len(team_list) > 1
        self._matches = []
        self._size = len(team_list)
        round = 1
        
        while len(team_list) > 1:
            if options['verbose']:
                print(f" -------------------------- ROUND {round} -------------------------- ")
            
            next_round = []
            for pair in zip(team_list[::2], team_list[1::2]):
                match = Match(*pair)
                next_round.append(match.get_winner())
                self._matches.append(match)
                if options['verbose']: print(f"{match} ==> {match.winner}")
            team_list = next_round
            round += 1
        self._winner = team_list[0]
        if options['verbose']:
            print(f"------------------------- WINNER: {self._winner} -------------------------")
        
        
    # def print_rounds(self):
    #     round = 1
    #     q = self._matches[:]
    #     while len(q) > 1:
            
    #         curr_size = len(q)
    #         for _ in range(curr_size//2):
    #             match = q.pop(0)
    #             print(f"{match.left} vs {match.right} ==> {match.winner}")
    #         round += 1
            
def monte_carlo_simulator(n, conferences):
    # assert len(conferences) > 1
    n_rounds = int(log2(len(conferences[0])))
    print(len(conferences[0]))
    print(n_rounds)
    freq_list = { team.abbrev : [0 for _ in range(n_rounds+1)] for teams in conferences for team in teams }
    
    for _ in range(n):
        tourney = Tournament(conferences[0])
        
        q = tourney._matches[:]
        
        round = 0
        for round in range(n_rounds+1):
            curr_size = ceil(len(q)/2)
            for _ in range(curr_size):
                match = q.pop(0)
                freq_list[match.left.abbrev][round] += 1
                freq_list[match.right.abbrev][round] += 1
        freq_list[tourney._winner.abbrev][-1] += 1 
    return {k: list(cnt/n*100 for cnt in v) for k,v in freq_list.items()}
    
    
tournament_entries = [
    'PUR', 'MTST', 'USU', 'TCU',
    'GONZ', 'MCN', 'UK', 'SAM', 
    'SC', 'ORE', 'CREI', 'AKR', 
    'TEX', 'CSU', 'TENN', 'SPU', # NDSU -> SO DAKOTA ST
    
    'HOU', 'LONG', 'NEB', 'TA&M', # MORG -> WAG/HOWARD
    'WIS', 'JMU', 'DUKE', 'UVM', # FCU 
    'TTU', 'NCSU', 'UK', 'OAK', # INST -> DRAKE
    'FLA', 'BOIS', 'MARQ', 'WKU', # CHSO -> LONGWOOD
    
    'CONN', 'STET', 'FAU', 'NU',
    'SDSU', 'UAB',  'AUB', 'YALE', # COR -> PRINCETON
    'BYU', 'DUQ', 'ILL', 'MORE',
    'WSU', 'DRKE', 'ISU', 'SDST', 
    
    'UNC', 'HOW', 'MSST', 'MSU', # UCI -> UC DAVIS
    'SMC', 'GCU', 'ALA', 'COFC',
    'CLEM', 'UNM', 'BAY', 'COLG', # CMU -> AKRON
    'DAY', 'NEV', 'ARIZ', 'LBSU' # LIB -> WKU
    
]    
    
    
freq = monte_carlo_simulator(100_000, [list(team_dict[team] for team in tournament_entries)])
n_rounds = len(list(freq.values())[0])
print(n_rounds)
df = team_df[team_df['abbrev'].isin(freq.keys())].sort_values(by=["abbrev"])
for i in range(1, n_rounds+1):
    s = f'round {i}'
    df[f'round {i}'] = 0.0

for k,v in freq.items():
    for r in range(n_rounds):
        df.loc[df['abbrev'] == k, f'round {r+1}'] = round(v[r], 1)

df = df.rename(columns={'round 3': 'sweet 16', 'round 4': 'elite eight', 'round 5': 'final 4', 'round 6': 'championship', 'round 7': 'win'})
df.sort_values(by=[f'win'], ascending=False).to_csv('march_madness_2024_odds.csv', index=False)
# final_four = Tournament()
# final_four.print_rounds()

64
6
7


In [12]:
df = pd.read_csv('march_madness_2024_odds.csv')
print(df.sort_values(by=[f'sweet 16'], ascending=False))

                          name abbrev  wins  losses     elo  round 1  round 2  \
0          Purdue Boilermakers    PUR    28       3  1711.5    100.0     78.8   
2                UConn Huskies   CONN    29       3  1715.5    100.0     79.2   
1              Houston Cougars    HOU    29       3  1728.5    100.0     78.7   
3     North Carolina Tar Heels    UNC    26       6  1663.2    100.0     75.4   
4             Arizona Wildcats   ARIZ    25       7  1651.7    100.0     72.4   
..                         ...    ...   ...     ...     ...      ...      ...   
50             Colgate Raiders   COLG     0       2  1470.7    100.0     30.0   
60  Mississippi State Bulldogs   MSST     3       7  1450.2    100.0     38.4   
56     Indiana State Sycamores   INST     0       3  1442.6    100.0     27.9   
53             Stetson Hatters   STET     0       1  1482.2    100.0     20.8   
49         UC Irvine Anteaters    UCI     1       1  1499.7    100.0     21.3   

    sweet 16  elite eight  

In [None]:

# # computer 
# bpi_tables = pd.read_html("https://www.espn.com/mens-college-basketball/bpi")
# # print(bpi_tables[0][''])
# bpi_tables[1]['Team'] = bpi_tables[0]['Team']
# bpi_tables[1]['CONF'] = bpi_tables[0]['CONF']

# bpi_tables[1].to_csv('bpi.csv')
# # human rankings
# # sagarin_table = pd.read_html("http://www.usatoday.com/sports/ncaab/sagarin/")
# # ncaa_table = pd.read_html("")


In [None]:
# # from bs4 import BeautifulSoup

# pom_table = pd.read_html("pomeroy.html")
# print(pom_table.head(20))

In [None]:
# lrmc_table = pd.read_html("lrmc.html")
# print(lrmc_table)

In [None]:
# moore_table = pd.read_csv("moore.csv")
# print(moore_table)