# **World Baseball Classic Predictions**

## *Import Libraries*

In [1]:
#import libraries
import os 
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import calendar

## Create a dir

In [2]:
# Create a directory to store the data
if not os.path.exists('WBC_2023'):
    os.makedirs('WBC_2023')


## I am defining the years in a var

In [3]:
# Define the years of interest
years = [2006, 2009, 2013, 2017, 2023]

## create a for loop to scrape teh data

In [19]:


# Loop through each year and scrape the data
for year in years:
    print(f"Scraping data for {year}...")
    # Make a request to the Wikipedia page
    url = f"https://en.wikipedia.org/wiki/{year}_World_Baseball_Classic"
    response = requests.get(url)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the table containing the matchups
    table = soup.find("table", class_="wikitable")

    # Loop through each row in the table and extract the final score
    matchups = []
    for tr in table.find_all("tr")[1:]:
        # Extract the data from each cell in the row
        tds = tr.find_all("td")
        if len(tds) > 3:
            matchup = {
                "date": tds[0].text.strip(),
                "venue": tds[1].text.strip(),
                "matchup": tds[2].text.strip(),
                "score": tds[3].text.strip(),
            }
            matchups.append(matchup)
   

Scraping data for 2006...
Scraping data for 2009...
Scraping data for 2013...
Scraping data for 2017...
Scraping data for 2023...


In [7]:
    url = f"https://en.wikipedia.org/wiki/2023_World_Baseball_Classic"
    response = requests.get(url)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the table containing the matchups
    table = soup.find("table", class_="wikitable")

    # Loop through each row in the table and extract the final score
    matchups = []
    for tr in table.find_all("tr")[1:]:
        # Extract the data from each cell in the row
        tds = tr.find_all("td")
        if len(tds) > 3:
            matchup = {
                "date": tds[0].text.strip(),
                "venue": tds[1].text.strip(),
                "matchup": tds[2].text.strip(),
                "score": tds[3].text.strip(),
            }
            matchups.append(matchup)
   

## save to CSV files

In [8]:
 # Save the matchups data to a CSV file
matchups_df = pd.DataFrame(matchups)
matchups_df.to_csv(f'WBC_data/2023_matchups.csv', index=False)

## find pool composition

In [10]:
  # Find the table containing the pool composition
table = soup.find("table", class_="wikitable")

## extract pool composition

In [13]:
 # Extract the pool composition
pools = {}
for tr in table.find_all("tr")[1:]:
        # Extract the data from each cell in the row
        tds = tr.find_all("td")
        pool = tds[0].text.strip()
        teams = [team.strip() for team in tds[1].text.strip().split("\n")]
        pools[pool] = teams

## save the pool composition to the csv

In [16]:
 # Save the pool composition data to a CSV file
pools_df = pd.DataFrame(pools.items(), columns=["pool", "teams"])
pools_df.to_csv(f'WBC_2023/2023_pools.csv', index=False)

    

# Player information

In [18]:
# Define the URL to scrape the Teams Roster for the (2006)
url06 = "https://www.baseball-reference.com/bullpen/2006_World_Baseball_Classic_(Rosters)"

# Make a GET resquest
response = requests.get(url06) #note you do not have to use the url06 variable you can put the link directly
soup = BeautifulSoup(response.content, 'html.parser')

soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>2006 World Baseball Classic (Rosters) - BR Bullpen</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":true,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"ZAgzEsGCIVpgBB6a2ET2RwAAAAs","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"2006_World_Baseball_Classic_(Rosters)","wgTitle":"2006 World Baseball Classic (Rosters)","wgCurRevisionId":1269881,"wgRevisionId":1269881,"wgArticleId":43872,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["World Baseball Classic","2006","Rosters"],"wgPageContentLanguage":"en","wgPageContentModel":"wiki

In [10]:
# find the section for Australia
australia_section = soup.find('span', {'id': 'Australia'}).parent

# find the table for Australia
australia_table = australia_section.find('table')

# extract the contents of the table
for row in australia_table.find_all('tr'):
    # do something with the row data here
    print(row.text)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [11]:
# find all the tables containing player information
table_list = soup.find_all('table', id_='Australia')

# create an empty dictionary to store the teams' data
teams_data = {}

In [12]:

# loop through each table
for table in table_list:
    # get the team name from the table header
    team_name = table.find_previous('h3').text.strip()
    
    # extract the players' information from the table and store it in a pandas DataFrame
    table_data = pd.read_html(str(table))[0]
    
    # remove any rows containing "nan" values
    table_data.dropna(inplace=True)
    
    # add the team data to the dictionary
    teams_data[team_name] = table_data
    
    

In [14]:
teams_data

{}

In [16]:

# create a dictionary of pool names and their corresponding folders
pool_folders = {
    'Pool_A': 'Desktop/Divergence/DATA ANALYSIS/Notebooks/WBC_2006/Pool_A',
    'Pool_B': 'Desktop/Divergence/DATA ANALYSIS/Notebooks/WBC_2006/Pool_B',
    'Pool_C': 'Desktop/Divergence/DATA ANALYSIS/Notebooks/WBC_2006/Pool_C',
    'Pool_D': 'Desktop/Divergence/DATA ANALYSIS/Notebooks/WBC_2006/Pool_D'
}

# loop through each team and write the data to CSV in the appropriate folder
for team_name, team_data in teams_data.items():
    # get the pool name from the team data
    pool_name = team_data.iloc[0]['Pool']
    
    # get the folder path for the pool name
    pool_folder = pool_folders[pool_name]
    
    # create the team folder if it doesn't already exist
    team_folder = os.path.join(pool_folder, team_name)
    os.makedirs(team_folder, exist_ok=True)
    
    # write the data to CSV
    csv_path = os.path.join(team_folder, f"{team_name}.csv")
    team_data.to_csv(csv_path, index=False)


## try 2


In [17]:
import requests
from bs4 import BeautifulSoup
import csv

url = "https://www.baseball-reference.com/bpv/index.php/2006_World_Baseball_Classic_(Rosters)"
response = requests.get(url)

soup = BeautifulSoup(response.content, "html.parser")

# Find the table containing Australia's players
australia_table = soup.find_all("table", class_="sortable")[0]

# Extract the player data
rows = australia_table.find_all("tr")
player_data = []
for row in rows[1:]:
    cells = row.find_all("td")
    player = [cell.text.strip() for cell in cells]
    player_data.append(player)

# Write the player data to a CSV file
with open("australia_players.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Position", "Uniform", "Name", "Bats", "Throws", "Height", "Weight", "Birthdate", "Birthplace", "2006 Team"])
    writer.writerows(player_data)


IndexError: list index out of range