In [324]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.remote.remote_connection import RemoteConnection
from selenium.common.exceptions import TimeoutException

import random
import time

In [327]:
def american_odds_to_probability(american_odds):
    if american_odds == '(EVEN)':
        return 0.5
    else:
        odds = float(american_odds.replace('(', '').replace(')', ''))
        if odds > 0:
            return 100 / (odds + 100)
        return abs(odds) / (abs(odds) + 100)
    
def get_player_and_odds_data(url):

    # # Fetching the webpage content with custom headers
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    # }
    # response = requests.get(url, headers=headers)

    # # Check for HTTP status code issues
    # if response.status_code == 403:
    #     return "403 Forbidden", []
    # elif response.status_code != 200:
    #     return f"Error {response.status_code}", []

    # # Parsing the content with BeautifulSoup
    # soup = BeautifulSoup(response.content, 'html.parser')


    # Fetching the webpage content
    options = Options()
    options.headless = True
    service = Service('/Users/williamfoote/Downloads/chromedriver-mac-arm64/chromedriver')
    browser = webdriver.Chrome(service=service, options=options)
    browser.get(url)
    wait = WebDriverWait(browser, 10)  # Wait for up to 10 seconds
    # Wait until the elements have loaded that we need
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'odds-offer')))

    # Create loop to scroll to the last element on the page, 
    # Break the loop if the no new elements are found
    while True:
        try:
            # Find the last 'odds-offer' element
            elements = browser.find_elements(By.CLASS_NAME, "odds-offer")
            last_element = elements[-1] if elements else None

            if not last_element:
                break  # Break if no elements are found

            # Scroll to the last element
            browser.execute_script("arguments[0].scrollIntoView(true);", last_element)

            # Wait for new elements to load after scrolling
            WebDriverWait(browser, 15).until(
                lambda driver: len(driver.find_elements(By.CLASS_NAME, "odds-offer")) > len(elements)
            )
        except TimeoutException:
            # Break the loop if no new elements are found after the wait
            break
    # Create BeautifulSoup object after each scroll
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    
    browser.quit()
    # Extracting player data and odds

    player_data = []
    # Odds offer is parent class
    odds_offers = soup.find_all("div", class_="odds-offer")
    date = datetime.strptime(url.split('date=')[-1], '%Y-%m-%d').date()
    for offer in odds_offers:
        # Finding player information
        player_info = offer.find("div", class_="odds-player__info odds-player__info--large")
        if player_info is None:
            continue
        player_link = player_info.find("a", class_="link odds-player__heading odds-player__heading--large")['href']

        # Extract player position and matchup
        player_position = player_info.find("p", class_="typography odds-player__subheading").text
        team, position = player_position.split(' - ')
        if position not in ['SP', 'RP', 'RP,SP', 'SP,RP', 'DH,SP']:
            continue
        player_matchup = player_info.find("div", class_="odds-player__matchup-tag").text
        # Extracting the player's full name from the link
        player_name = player_link.split('/')[-2].replace('-', ' ').title()
        odds_container = offer.find(class_='odds-offer__item odds-offer__item--best-odds')
        lines = odds_container.find_all(class_='typography odds-cell__line')
        costs = odds_container.find_all(class_='typography odds-cell__cost')

        over_line, under_line, over_cost_USA, under_cost_USA = None, None, None, None

        for line, cost in zip(lines, costs):
            if "O " in line.text:
                over_line = float(line.text.replace("O ", ""))
                over_cost_USA = cost.text
            elif "U " in line.text:
                under_line = float(line.text.replace("U ", ""))
                under_cost_USA = cost.text

        # Convert American odds to probability if they are available
        over_cost = american_odds_to_probability(over_cost_USA) if over_cost_USA else None
        under_cost = american_odds_to_probability(under_cost_USA) if under_cost_USA else None

        # Check if over_line and under_line are not None before proceeding
        if over_line is not None and under_line is not None:
        # Adding the extracted data to the player data list
            player_data.append({
                "name": player_name,
                "team": team,
                "position": position,
                "matchup": player_matchup,
                "over_line": over_line,
                "under_line": under_line,
                "over_cost": over_cost,
                "under_cost": under_cost,
                "over_cost_USA": over_cost_USA,
                "under_cost_USA": under_cost_USA,
                "date": date
            })
    return player_data

In [348]:
import pandas as pd
from datetime import datetime, timedelta

start_date = datetime(2023, 3, 30)  # Start of the season
end_date = datetime(2023, 11, 1)   # End of the season
delta = timedelta(days=1)

all_data = []

while start_date <= end_date:
    date_str = start_date.strftime('%Y-%m-%d')
    url = f"https://www.bettingpros.com/mlb/odds/player-props/strikeouts/?date={date_str}"
    try:
        daily_data = get_player_and_odds_data(url)
        if daily_data:  # Check if data is not empty
            all_data.extend(daily_data)
            print(f"Successfully scraped data for {date_str}")
    except Exception as e:
        print(f"No data on {date_str}")
    finally:
        start_date += delta
        sleep_time = random.uniform(5, 10)  # Random sleep time between 5 and 10 seconds
        time.sleep(sleep_time)

season_df = pd.DataFrame(all_data)


Successfully scraped data for 2023-03-30
Successfully scraped data for 2023-03-31
Successfully scraped data for 2023-04-01
Successfully scraped data for 2023-04-02
Successfully scraped data for 2023-04-03
Successfully scraped data for 2023-04-04
Successfully scraped data for 2023-04-05
Successfully scraped data for 2023-04-06
Successfully scraped data for 2023-04-07
Successfully scraped data for 2023-04-08
Successfully scraped data for 2023-04-09
Successfully scraped data for 2023-04-10
Successfully scraped data for 2023-04-11
Successfully scraped data for 2023-04-12
Successfully scraped data for 2023-04-13
Successfully scraped data for 2023-04-14
Successfully scraped data for 2023-04-15
Successfully scraped data for 2023-04-16
Successfully scraped data for 2023-04-17
Successfully scraped data for 2023-04-18
Successfully scraped data for 2023-04-19
Successfully scraped data for 2023-04-20
Successfully scraped data for 2023-04-21
Successfully scraped data for 2023-04-22
Successfully scr

In [332]:
may = season_df
may['position'].unique()

array(['SP', 'SP,RP', 'RP,SP', 'RP', 'DH,SP'], dtype=object)

In [354]:
export2.to_csv('/Users/williamfoote/Documents/GitHub/bovada_scraping/data/2023/all_2023.csv')
# may_21.to_csv('/Users/williamfoote/Documents/GitHub/bovada_scraping/data/2021/may.csv')

# Debugging Code

# Create a test URL or insert the URL that is messing up
Identify the error and adjust code above to fix the issue

In [351]:
export2 = pd.concat([season_df, df], ignore_index=True, axis=0)

In [349]:

test_url = 'https://www.bettingpros.com/mlb/odds/player-props/strikeouts/?date=2022-07-06'
data = get_player_and_odds_data(test_url)
df = pd.DataFrame(data)
df

Unnamed: 0,name,team,position,matchup,over_line,under_line,over_cost,under_cost,over_cost_USA,under_cost_USA,date
0,Shane Bieber,CLE,SP,CLE at DET,6.5,6.5,0.583333,0.454545,(-140),(+120),2022-07-06
1,Michael Pineda,FA,SP,CLE at DET,2.5,2.5,0.565217,0.460829,(-130),(+117),2022-07-06
2,Corbin Burnes,BAL,SP,CHC at MIL,8.5,8.5,0.5,0.547511,(EVEN),(-121),2022-07-06
3,Adrian Sampson,FA,SP,CHC at MIL,3.5,4.5,0.625468,0.609375,(-167),(-156),2022-07-06
4,Lance Lynn,STL,SP,MIN at CWS,5.5,5.5,0.534884,0.52381,(-115),(-110),2022-07-06
5,Joe Ryan,MIN,SP,MIN at CWS,4.5,5.5,0.603175,0.607843,(-152),(-155),2022-07-06
6,Jose Berrios,TOR,SP,TOR at OAK,4.5,4.5,0.576271,0.47619,(-136),(+110),2022-07-06
7,James Kaprielian,FA,SP,TOR at OAK,3.5,3.5,0.603175,0.438596,(-152),(+128),2022-07-06
8,Shohei Ohtani,LAD,"DH,SP",LAA at MIA,7.5,7.5,0.454545,0.603175,(+120),(-152),2022-07-06
9,Trevor Rogers,MIA,SP,LAA at MIA,4.5,5.5,0.618321,0.557522,(-162),(-126),2022-07-06


## If needed, edit the code below to get the css items that are missing or incorrectly accessed

In [300]:
# Fetching the webpage content
options = Options()
options.headless = True
service = Service('/Users/williamfoote/Downloads/chromedriver-mac-arm64/chromedriver')
browser = webdriver.Chrome(service=service, options=options)
browser.get(test_url)
wait = WebDriverWait(browser, 10)  # Wait for up to 10 seconds


# Wait until the elements have loaded that we need
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'odds-offer')))
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
browser.quit()
# Extracting player data and odds



player_data = []
# Odds offer is parent class
odds_offers = soup.find_all("div", class_="grouped-items-with-sticky-footer__content")

In [308]:
odds_offers = soup.find_all("div", class_="grouped-items-with-sticky-footer__content")
len(odds_offers)

21

In [304]:
len(soup.find_all(By.CLASS_NAME, "grouped-items-with-sticky-footer__content"))


0

In [257]:
odds_container = odds_offers[3].find(class_='odds-offer__item odds-offer__item--best-odds')

In [247]:
odds_container.find_all(class_='odds-cell odds-cell--best odds-cell--event-completed')[0].text

'+1.0(+1100)'

In [265]:
player_info = offer.find("div", class_="odds-player__info odds-player__info--large")
player_position = player_info.find("p", class_="typography odds-player__subheading").text

# if player_info is None:
#     continue
# player_link = player_info.find("a", class_="link odds-player__heading odds-player__heading--large")['href']

# # Extracting the player's full name from the link
# player_name = player_link.split('/')[-2].replace('-', ' ').title()

# # Extract player position and matchup
# player_position = player_info.find("p", class_="typography odds-player__subheading").text
# if player_position not in ['SP', 'RP', 'RP,SP', 'SP,RP', 'DH,SP']:
#     continue
# player_matchup = player_info.find("div", class_="odds-player__matchup-tag").text
player_position

'PHI - SP'

In [249]:
player_info

<div class="odds-player__info odds-player__info--large" data-v-e8516d98=""><div class="odds-player__matchup-tag" data-v-e8516d98="">MIA at NYM</div><a class="link odds-player__heading odds-player__heading--large" data-v-e8516d98="" href="/mlb/odds/player-props/taijuan-walker/" style="--ef09f0fa: #16191D; --fe392e3c: block; --bf535660: 1.8rem; --2ab080f2: 700;" target="_self">T. Walker</a><p class="typography odds-player__subheading" data-v-e8516d98="" data-v-f8c6c06d="" style="--64dcb17e: #525A67; --0b8a2e86: 1.2rem; --584c03bd: 600; --6553e651: 170%; --faa089fc: left; --97ef6e2e: none; --da5e331a: nowrap;">PHI - SP</p></div>

In [227]:
# Finding player information
player_info = odds_offers[4].find("div", class_="odds-player__info odds-player__info--large")
player_link = player_info.find("a", class_="link odds-player__heading odds-player__heading--large")['href']

# Extracting the player's full name from the link
player_name = player_link.split('/')[-2].replace('-', ' ').title()


AttributeError: 'NoneType' object has no attribute 'find'