In [1]:
#import libraries
import os 
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import calendar

In [None]:
# Create the directory that stores the data.
# exist_ok=True makes the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs('WBC_data', exist_ok=True)


In [2]:
# Tournament years of interest, listed in chronological order
years = [
    2006,
    2009,
    2013,
    2017,
    2023,
]

In [19]:


# Loop through each year and scrape the first "wikitable" on that year's
# Wikipedia page.
#
# `matchups` is rebuilt on every iteration, so on its own it only ever holds
# the last year's rows. Every year's rows are therefore also kept in
# `matchups_by_year` (year -> list of row dicts). `matchups`, `soup`, `table`
# and `year` are still left bound to the final iteration's values.
matchups_by_year = {}
for year in years:
    print(f"Scraping data for {year}...")

    # Request the Wikipedia page. The timeout keeps a stalled connection from
    # hanging the notebook, and raise_for_status() fails loudly on a 4xx/5xx
    # response instead of parsing an error page.
    url = f"https://en.wikipedia.org/wiki/{year}_World_Baseball_Classic"
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # find() returns only the first matching table, or None if there is none.
    table = soup.find("table", class_="wikitable")

    matchups = []
    if table is None:
        # Without this guard table.find_all would raise AttributeError.
        print(f"No wikitable found for {year}; skipping.")
    else:
        # Skip the first row, then keep rows that have at least four <td> cells.
        for tr in table.find_all("tr")[1:]:
            tds = tr.find_all("td")
            if len(tds) > 3:
                # NOTE(review): assumes the first four cells are date, venue,
                # matchup and score, in that order -- confirm against the page.
                matchups.append({
                    "date": tds[0].text.strip(),
                    "venue": tds[1].text.strip(),
                    "matchup": tds[2].text.strip(),
                    "score": tds[3].text.strip(),
                })

    matchups_by_year[year] = matchups
   

Scraping data for 2006...
Scraping data for 2009...
Scraping data for 2013...
Scraping data for 2017...
Scraping data for 2023...


In [None]:
 # Save the matchups data to a CSV file
# The statements sit at top level: indented, this cell raised
# "IndentationError: unexpected indent" when run on its own.
# `matchups` and `year` are whatever the scraping loop left bound, i.e. the
# values from its final iteration.
matchups_df = pd.DataFrame(matchups)
matchups_df.to_csv(f'WBC_data/{year}_matchups.csv', index=False)

In [None]:
  # Find the table containing the pool composition
# Dedented to top level so the cell runs on its own instead of raising
# "IndentationError: unexpected indent".
# NOTE(review): this is the same lookup the scraping loop performs on the same
# `soup`, so it returns that same first "wikitable" -- confirm that table
# really is the pool composition.
table = soup.find("table", class_="wikitable")

In [None]:
 # Extract the pool composition
# Dedented to top level so the cell runs on its own instead of raising
# "IndentationError: unexpected indent".
pools = {}
for tr in table.find_all("tr")[1:]:
    # Extract the data from each cell in the row
    tds = tr.find_all("td")
    # Rows with fewer than two <td> cells have no tds[1] to read; skip them
    # rather than fail with IndexError.
    if len(tds) < 2:
        continue
    pool = tds[0].text.strip()
    # One team per line of the second cell's text
    teams = [team.strip() for team in tds[1].text.strip().split("\n")]
    pools[pool] = teams

In [None]:
 # Save the pool composition data to a CSV file
# Dedented to top level so the cell runs on its own instead of raising
# "IndentationError: unexpected indent".
# Each row is one (pool, teams) pair from `pools`; `year` is whatever the
# scraping loop left bound, i.e. its final iteration's value.
pools_df = pd.DataFrame(pools.items(), columns=["pool", "teams"])
pools_df.to_csv(f'WBC_data/{year}_pools.csv', index=False)

    

    # Make predictions for the pool 
    # winners and match outcomes 
    # using machine learning models


# 2006 


In [195]:

# Load the 2006 CSV tables from WBC_data/, one DataFrame per file.
# Related files are read in a single grouped statement; names and paths pair
# up by position.

# Pool composition
pool_comp_06 = pd.read_csv('WBC_data/2006_pools_composition.csv')

# Results for pools 1 and 2
pool_1r_06, pool_2r_06 = map(pd.read_csv, (
    'WBC_data/2006_pool_1_results.csv',
    'WBC_data/2006_pool_2_results.csv',
))

# Results for pools A-D
pool_ar_06, pool_br_06, pool_cr_06, pool_dr_06 = map(pd.read_csv, (
    'WBC_data/2006_pool_a_results.csv',
    'WBC_data/2006_pool_b_results.csv',
    'WBC_data/2006_pool_c_results.csv',
    'WBC_data/2006_pool_d_results.csv',
))

# Summaries for pools A-D
pool_as_06, pool_bs_06, pool_cs_06, pool_ds_06 = map(pd.read_csv, (
    'WBC_data/2006_pool_a_summary.csv',
    'WBC_data/2006_pool_b_summary.csv',
    'WBC_data/2006_pool_c_summary.csv',
    'WBC_data/2006_pool_d_summary.csv',
))

# Information tables
(
    champ_round_06,
    wbc_classic_team_06,
    final_standings_06,
    batting_leaders_06,
    pitching_leaders_06,
    wbc_champions_06,
) = map(pd.read_csv, (
    'WBC_data/2006_champ_round.csv',
    'WBC_data/2006_classic_team.csv',
    'WBC_data/2006_final_standing.csv',
    'WBC_data/2006_leader_batting.csv',
    'WBC_data/2006_leader_pitching.csv',
    'WBC_data/2006_wbc_champions.csv',
))

# Venues (venues_a / venues_b come from the pool_1 / pool_2 venue files)
venues_a_06, venues_b_06 = map(pd.read_csv, (
    'WBC_data/2006_venues_pool_1.csv',
    'WBC_data/2006_venues_pool_2.csv',
))




# 2009


In [196]:

# Load the 2009 CSV tables from WBC_data/, one DataFrame per file.
# Related files are read in a single grouped statement; names and paths pair
# up by position.

# Pool composition
pool_comp_09 = pd.read_csv('WBC_data/2009_pool_composition.csv')

# Results for pools 1 and 2
pool_1r_09, pool_2r_09 = map(pd.read_csv, (
    'WBC_data/2009_pool_1_results.csv',
    'WBC_data/2009_pool_2_results.csv',
))

# Results for pools A-D
pool_ar_09, pool_br_09, pool_cr_09, pool_dr_09 = map(pd.read_csv, (
    'WBC_data/2009_pool_a_results.csv',
    'WBC_data/2009_pool_b_results.csv',
    'WBC_data/2009_pool_c_results.csv',
    'WBC_data/2009_pool_d_results.csv',
))

# Semifinal and championship files
semifinals_09, champ_round_09 = map(pd.read_csv, (
    'WBC_data/2009_semifinal.csv',
    'WBC_data/2009_final_championship.csv',
))

# Information tables
(
    wbc_classic_team_09,
    final_standings_09,
    batting_leaders_09,
    pitching_leaders_09,
    wbc_champions_09,
) = map(pd.read_csv, (
    'WBC_data/2009_classic_team.csv',
    'WBC_data/2009_final_standing.csv',
    'WBC_data/2009_batting_leaders.csv',
    'WBC_data/2009_pitching_leaders.csv',
    'WBC_data/2009_champ_info.csv',
))

# Venues (venues_a / venues_b come from the pool_1 / pool_2 venue files)
venues_a_09, venues_b_09 = map(pd.read_csv, (
    'WBC_data/2009_venues_pool_1.csv',
    'WBC_data/2009_venues_pool_2.csv',
))



# 2013


In [197]:

# Load the 2013 CSV tables from WBC_data/, one DataFrame per file.
# Related files are read in a single grouped statement; names and paths pair
# up by position.

# Pool composition
pool_comp_13 = pd.read_csv('WBC_data/2013_pool_composition.csv')

# Results for pools 1 and 2
pool_1r_13, pool_2r_13 = map(pd.read_csv, (
    'WBC_data/2013_pool_1_results.csv',
    'WBC_data/2013_pool_2_results.csv',
))

# Results for pools A-D
pool_ar_13, pool_br_13, pool_cr_13, pool_dr_13 = map(pd.read_csv, (
    'WBC_data/2013_pool_a_results.csv',
    'WBC_data/2013_pool_b_results.csv',
    'WBC_data/2013_pool_c_results.csv',
    'WBC_data/2013_pool_d_results.csv',
))

# Summaries for pools A-D
pool_as_13, pool_bs_13, pool_cs_13, pool_ds_13 = map(pd.read_csv, (
    'WBC_data/2013_pool_a_summary.csv',
    'WBC_data/2013_pool_b_summary.csv',
    'WBC_data/2013_pool_c_summary.csv',
    'WBC_data/2013_pool_d_summary.csv',
))

# Semifinals and final files
semifinals_13, champ_round_13 = map(pd.read_csv, (
    'WBC_data/2013_semifinals.csv',
    'WBC_data/2013_final.csv',
))

# Information tables
(
    wbc_classic_team_13,
    final_standings_13,
    batting_leaders_13,
    pitching_leaders_13,
    wbc_champions_13,
    qual_info_13,
) = map(pd.read_csv, (
    'WBC_data/2013_classic_team.csv',
    'WBC_data/2013_final_standing.csv',
    'WBC_data/2013_batting_leaders.csv',
    'WBC_data/2013_pitching_leaders.csv',
    'WBC_data/2013_champ_info.csv',
    'WBC_data/2013_qualifications_info.csv',
))

# Venues
venues_a_13, venues_b_13 = map(pd.read_csv, (
    'WBC_data/2013_venues_1.csv',
    'WBC_data/2013_venues_2.csv',
))



# 2017


In [198]:
# Load the 2017 CSV tables from WBC_data/, one DataFrame per file.
# Related files are read in a single grouped statement; names and paths pair
# up by position.

# Qualifications info
quals_info_17 = pd.read_csv('WBC_data/2017_qualifications_info.csv')

# Pool composition
pool_comp_17 = pd.read_csv('WBC_data/2017_pool_composition.csv')

# Results for pools A-F
(
    pool_ar_17,
    pool_br_17,
    pool_cr_17,
    pool_dr_17,
    pool_er_17,
    pool_fr_17,
) = map(pd.read_csv, (
    'WBC_data/2017_pool_a_results.csv',
    'WBC_data/2017_pool_b_results.csv',
    'WBC_data/2017_pool_c_results.csv',
    'WBC_data/2017_pool_d_results.csv',
    'WBC_data/2017_pool_e_results.csv',
    'WBC_data/2017_pool_f_results.csv',
))

# Summaries for pools A-F (the pool E file follows a different naming pattern)
(
    pool_as_17,
    pool_bs_17,
    pool_cs_17,
    pool_ds_17,
    pool_es_17,
    pool_fs_17,
) = map(pd.read_csv, (
    'WBC_data/2017_pool_a_summary.csv',
    'WBC_data/2017_pool_b_summary.csv',
    'WBC_data/2017_pool_c_summary.csv',
    'WBC_data/2017_pool_d_summary.csv',
    'WBC_data/2017_2rnd_pool_E_summary.csv',
    'WBC_data/2017_pool_f_summary.csv',
))

# Semifinals and championship files
semifinals_17, champ_round_17 = map(pd.read_csv, (
    'WBC_data/2017_semifinals.csv',
    'WBC_data/2017_final_championship.csv',
))

# Information tables
(
    wbc_classic_team_17,
    final_standings_17,
    batting_leaders_17,
    pitching_leaders_17,
    wbc_champions_17,
) = map(pd.read_csv, (
    'WBC_data/2017_classic_team.csv',
    'WBC_data/2017_final_standings.csv',
    'WBC_data/2017_batting_leaders.csv',
    'WBC_data/2017_pitching_leaders.csv',
    'WBC_data/2017_champ_info.csv',
))

# Venues
venues_a_17, venues_b_17 = map(pd.read_csv, (
    'WBC_data/2017_venues_1.csv',
    'WBC_data/2017_venues_2.csv',
))

# Broadcast info (TV and radio)
broadcast_tv_17, broadcast_radio_17 = map(pd.read_csv, (
    'WBC_data/2017_broadcast_tv.csv',
    'WBC_data/2017_broadcast_radio.csv',
))


# 2023


In [199]:
# Load the 2023 CSV tables from WBC_data/, one DataFrame per file.
# Related files are read in a single grouped statement; names and paths pair
# up by position.

# Qualifications info
quals_info_23 = pd.read_csv('WBC_data/2023_qualified_info.csv')

# Pool composition
pool_comp_23 = pd.read_csv('WBC_data/2023_pool_composition.csv')

# Results for pools A-D
pool_ar_23, pool_br_23, pool_cr_23, pool_dr_23 = map(pd.read_csv, (
    'WBC_data/2023_pool_a_results.csv',
    'WBC_data/2023_pool_b_results.csv',
    'WBC_data/2023_pool_c_results.csv',
    'WBC_data/2023_pool_d_results.csv',
))

# Summaries for pools A-D
pool_as_23, pool_bs_23, pool_cs_23, pool_ds_23 = map(pd.read_csv, (
    'WBC_data/2023_pool_a_summary.csv',
    'WBC_data/2023_pool_b_summary.csv',
    'WBC_data/2023_pool_c_summary.csv',
    'WBC_data/2023_pool_d_summary.csv',
))

# Quarterfinals, semifinals and championship files
quarterfinals_23, semifinals_23, champ_round_23 = map(pd.read_csv, (
    'WBC_data/2023_quarterfinals.csv',
    'WBC_data/2023_semifinals.csv',
    'WBC_data/2023_finals_championship.csv',
))

# Information tables: none are loaded for 2023

# Venues
venues_a_23 = pd.read_csv('WBC_data/2023_venues_1.csv')

# Team base camp info
base_camp_a_23, base_camp_b_23 = map(pd.read_csv, (
    'WBC_data/2023_team_base_camp.csv',
    'WBC_data/2023_team_base_camp_2.csv',
))

In [209]:
# Display the 2006 Pool D results frame (read from 2006_pool_d_results.csv)
pool_dr_06

Unnamed: 0,Date,Local Time,Road Team,Score,Home Team,Inn.,Venue,Game Time,Attendance,Boxscore
0,"Mar 7, 2006",13:00,Dominican Republic,11–5,Venezuela,,Cracker Jack Stadium,3:16,10645,Boxscore
1,"Mar 7, 2006",20:00,Australia,0–10,Italy,7.0,Cracker Jack Stadium,2:16,8099,Boxscore
2,"Mar 8, 2006",19:00,Italy,0–6,Venezuela,,Cracker Jack Stadium,2:48,10101,Boxscore
3,"Mar 9, 2006",13:00,Italy,3–8,Dominican Republic,,Cracker Jack Stadium,2:39,9949,Boxscore
4,"Mar 9, 2006",20:00,Venezuela,2–0,Australia,,Cracker Jack Stadium,2:45,10111,Boxscore
5,"Mar 10, 2006",19:00,Australia,4–6,Dominican Republic,,Cracker Jack Stadium,2:52,11083,Boxscore


In [210]:
# Display the 2009 Pool D results frame (read from 2009_pool_d_results.csv)
pool_dr_09

Unnamed: 0,Date,Local Time,Road Team,Score,Home Team,Inn.,Venue,Game Time,Attendance,Boxscore
0,"Mar 7, 2009",12:00,Netherlands,3–2,Dominican Republic,,Hiram Bithorn Stadium,3:01,9335,Boxscore
1,"Mar 7, 2009",18:00,Panama,0–7,Puerto Rico,,Hiram Bithorn Stadium,2:57,17348,Boxscore
2,"Mar 8, 2009",16:30,Panama,0–9,Dominican Republic,,Hiram Bithorn Stadium,2:46,9221,Boxscore
3,"Mar 9, 2009",18:30,Netherlands,1–3,Puerto Rico,,Hiram Bithorn Stadium,3:11,19479,Boxscore
4,"Mar 10, 2009",18:30,Dominican Republic,1–2,Netherlands,11.0,Hiram Bithorn Stadium,3:38,11814,Boxscore
5,"Mar 11, 2009",17:30,Netherlands,0–5,Puerto Rico,,Hiram Bithorn Stadium,2:55,19501,Boxscore


In [211]:
# Display the 2013 Pool D results frame (read from 2013_pool_d_results.csv)
pool_dr_13

Unnamed: 0,Date,Local Time,Road Team,Score,Home Team,Inn.,Venue,Game Time,Attendance,Boxscore
0,"Mar 7, 2013",13:00,Italy,6–5,Mexico,,Salt River Fields at Talking Stick,3:41,4478,Boxscore
1,"Mar 8, 2013",12:00,Canada,4–14,Italy,8.0,Chase Field[note 1],3:27,5140,Boxscore
2,"Mar 8, 2013",19:00,Mexico,5–2,United States,,Chase Field,3:29,44256,Boxscore
3,"Mar 9, 2013",12:30,Canada,10–3,Mexico,,Chase Field,3:44,19581,Boxscore
4,"Mar 9, 2013",19:00,United States,6–2,Italy,,Chase Field,3:21,19303,Boxscore
5,"Mar 10, 2013",13:00,United States,9–4,Canada,,Chase Field,3:18,22425,Boxscore


In [212]:
# Display the 2017 Pool D results frame (read from 2017_pool_d_results.csv)
pool_dr_17

Unnamed: 0,Date,Local Time,Road Team,Score,Home Team,Inn.,Venue,Game Time,Attendance,Boxscore
0,"Mar 9, 2017",20:00,Mexico,9–10,Italy,,Estadio Charros de Jalisco,3:39,14296,Boxscore
1,"Mar 10, 2017",20:00,Venezuela,0–11,Puerto Rico,7.0,Estadio Charros de Jalisco,2:43,14806,Boxscore
2,"Mar 11, 2017",14:00,Venezuela,11–10,Italy,10.0,Estadio Charros de Jalisco,4:43,12187,Boxscore
3,"Mar 11, 2017",20:30,Puerto Rico,9–4,Mexico,,Estadio Charros de Jalisco,3:40,15647,Boxscore
4,"Mar 12, 2017",13:30,Italy,3–9,Puerto Rico,,Estadio Charros de Jalisco,2:42,11924,Boxscore
5,"Mar 12, 2017",20:00,Mexico,11–9,Venezuela,,Estadio Charros de Jalisco,4:44,15489,Boxscore


In [213]:
# Display the 2023 Pool D results frame (read from 2023_pool_d_results.csv)
pool_dr_23

Unnamed: 0,Date,Local Time,Road Team,Score,Home Team,Inn.,Venue,Game Time,Attendance,Boxscore
0,"Mar 11, 2023",12:00,Nicaragua,–,Puerto Rico,,LoanDepot Park,,,Boxscore
1,"Mar 11, 2023",19:00,Dominican Republic,–,Venezuela,,LoanDepot Park,,,Boxscore
2,"Mar 12, 2023",12:00,Nicaragua,–,Israel,,LoanDepot Park,,,Boxscore
3,"Mar 12, 2023",19:00,Venezuela,–,Puerto Rico,,LoanDepot Park,,,Boxscore
4,"Mar 13, 2023",12:00,Dominican Republic,–,Nicaragua,,LoanDepot Park,,,Boxscore
5,"Mar 13, 2023",19:00,Israel,–,Puerto Rico,,LoanDepot Park,,,Boxscore
6,"Mar 14, 2023",12:00,Nicaragua,–,Venezuela,,LoanDepot Park,,,Boxscore
7,"Mar 14, 2023",19:00,Israel,–,Dominican Republic,,LoanDepot Park,,,Boxscore
8,"Mar 15, 2023",12:00,Venezuela,–,Israel,,LoanDepot Park,,,Boxscore
9,"Mar 15, 2023",19:00,Puerto Rico,–,Dominican Republic,,LoanDepot Park,,,Boxscore
