In [None]:
import requests
import pandas as pd
import time
from lxml import html

In [23]:

def get_details(link, timeout=30):
    """Scrape the full team names and venue from one matchup page.

    Args:
        link: Site-relative path of the matchup page; it is appended to
            ``https://www.sportsbookreview.com``.
        timeout: Seconds to wait on the HTTP request before giving up.
            Without a timeout ``requests.get`` can block indefinitely, which
            would stall a scrape that issues one request per game.

    Returns:
        A dict with the keys ``away_full_name``, ``home_full_name`` and
        ``venue``. A field whose element is not found on the page is an empty
        string; if the request or the parsing raises, every field is ``"N/A"``.
    """
    full_url = f'https://www.sportsbookreview.com{link}'
    try:
        response = requests.get(full_url, timeout=timeout)
        response.raise_for_status()
        tree = html.fromstring(response.content)

        # get full names: the first matching <p> is read as the away team,
        # the second as the home team
        names_xpath = '(//div[@class="d-flex justify-content-center pt-2"]//p)'
        away_full_name = ' '.join(tree.xpath(f'{names_xpath}[1]/text()'))
        home_full_name = ' '.join(tree.xpath(f'{names_xpath}[2]/text()'))

        arena = tree.xpath('//div[@class="fs-8 text-center py-2 border-top GameMatchup_GameRow___aNwo"]/text()')
        venue = ''.join(arena)

        return {'away_full_name': away_full_name, 'home_full_name': home_full_name, 'venue': venue}

    except Exception as e:
        # Best effort: a single broken matchup page must not abort the scrape.
        print(f"Error scraping game details for {link}: {e}")
        return {'away_full_name': "N/A", 'home_full_name': "N/A", 'venue': "N/A"}

def _value_or_na(values, index):
    """Return ``values[index]``, or ``"N/A"`` when that position does not exist."""
    return values[index] if len(values) > index else "N/A"


def get_odds(date, timeout=30):
    """Scrape the money-line odds table for every listed league on one date.

    Args:
        date: Date string placed in the ``?date=`` query parameter of the
            odds comparison URL.
        timeout: Seconds to wait on the HTTP request for the odds page before
            giving up.

    Returns:
        A ``pandas.DataFrame`` with one row per game found in the NBA, NHL,
        MLB and NFL sections of the page. Any individual value that is missing
        from a row is reported as ``"N/A"``. If the page request or its
        parsing raises, the error is printed and an empty DataFrame is
        returned.
    """
    full_url = f'https://www.sportsbookreview.com/betting-odds/compare/money-line/full-game/?date={date}'
    team_link_class = 'd-flex align-items-center overflow-hidden fs-9 GameRows_gradientContainer__ZajIf'
    no_details = {'away_full_name': "N/A", 'home_full_name': "N/A", 'venue': "N/A"}

    try:
        response = requests.get(full_url, timeout=timeout)
        response.raise_for_status()
        tree = html.fromstring(response.content)

        games = []
        for league in ['nba', 'nhl', 'mlb', 'nfl']:
            if not tree.xpath(f'//section[@id="section-{league}"]'):
                continue

            # NHL and MLB rows keep the team name in <b>; the other leagues use <span>
            name_tag = 'b' if league in ('nhl', 'mlb') else 'span'

            for row in tree.xpath(f'//div[@id="tbody-{league}"]/div'):
                # Skip rows without child elements. len() spells out what the
                # deprecated truth-test of an lxml element actually checked.
                if len(row) == 0:
                    continue

                # One query returns both names: away first, home second
                teams = row.xpath(f'.//a[@class="{team_link_class}"]/{name_tag}/text()')
                scores = row.xpath('.//div[@class="d-flex flex-column flex-wrap justify-content-around align-items-center fs-9 fw-bold mb-n1 GameRows_scores__YkN24 "]/div/text()')
                openers = row.xpath('.//span[@class="fs-9 undefined"]/text()')
                mls = row.xpath('.//div[@class="d-flex align-items-center text-center OddsCells_oddsNumber__u3rsp OddsCells_compact__cawia"]/span/span[2]/text()')
                link = row.xpath('.//a[@class="fs-9 py-2 pe-1 text-primary"]/@href')
                details = get_details(link[0]) if link else no_details

                # make a dictionary for each item
                game = {
                    'date': date,
                    'league': league,
                    'away_team': _value_or_na(teams, 0),
                    'away_full_name': details['away_full_name'],
                    'home_team': _value_or_na(teams, 1),
                    'home_full_name': details['home_full_name'],
                    'venue': details['venue'],
                    'away_score': _value_or_na(scores, 0),
                    'home_score': _value_or_na(scores, 1),
                    'away_opener': _value_or_na(openers, 0),
                    'home_opener': _value_or_na(openers, 1),
                }
                # Money lines come as away/home pairs, one pair per odds column.
                # Bounds-checked lookups keep a row with fewer than six pairs
                # from raising IndexError and discarding the whole date.
                for book in range(6):
                    game[f'away_ml{book + 1}'] = _value_or_na(mls, 2 * book)
                    game[f'home_ml{book + 1}'] = _value_or_na(mls, 2 * book + 1)

                games.append(game)

        return pd.DataFrame(games)

    except Exception as e:
        print(f"Error scraping games for {date}: {e}")
        return pd.DataFrame()

In [None]:
# Generate the date range from 2021-01-01 to today minus one day
start_date = pd.to_datetime('2021-01-01')
end_date = pd.Timestamp.now() - pd.Timedelta(days=1)  # Yesterday

# Create a list of dates to iterate over
date_range = pd.date_range(start=start_date, end=end_date)

# List to store all the DataFrames
all_games = []

# Loop through the date range and call the get_odds function for each date
for date in date_range:
    date_str = date.strftime('%Y-%m-%d')  # Convert date to string in the format needed for the URL
    print(f"Scraping odds for {date_str}...")  # Optional: Print progress
    odds_data = get_odds(date_str)
    if not odds_data.empty:
        all_games.append(odds_data)


# Concatenate all DataFrames in the list.
# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame when no date produced any rows.
final_df = pd.concat(all_games, ignore_index=True) if all_games else pd.DataFrame()

In [28]:
# Persist the combined odds table as CSV, leaving out the positional index column
final_df.to_csv(path_or_buf='odds2021to2024.csv', index=False)