# Scraping Baseball Match Data

## Importing libraries

In [1]:
import csv
import json
import re
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pandas.errors import EmptyDataError
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
# Raise pandas' column display cap to 100 so wide frames (the match table built
# below has 27 columns) are rendered without column truncation.
pd.options.display.max_columns = 100

## Opening the webpage

In [28]:
# Start Chrome through a Service object: Selenium 4 deprecated (and later removed)
# passing the chromedriver path as the first positional argument of webdriver.Chrome.
# The path is relative, so chromedriver.exe is looked up from the working directory.
driver = webdriver.Chrome(service=Service(executable_path=r"chromedriver.exe"))
# Every later cell drives this single shared browser session.
driver.get('https://www.flashscore.com/baseball/')

  """Entry point for launching an IPython kernel.


In [29]:
# Click the element carrying class 'lmc__itemMore' (presumably the "show more"
# toggle of the left-hand menu -- confirm against the live page).
more_toggle = driver.find_element(by='class name', value='lmc__itemMore')
more_toggle.click()

## Collecting the links of baseball leagues

In [30]:
def league_links():
    """Return the archive URL of every league listed in the page's menu.

    Uses the module-level ``driver``. Every element with class
    ``lmc__elementName`` is clicked (one second apart), then the ``href`` of
    every ``lmc__templateHref`` element is turned into ``<href>/archive``.

    Returns:
        list[str]: archive URLs in page order, without duplicates. Anchors
        that expose no ``href`` are skipped instead of raising ``TypeError``.
    """
    for country in driver.find_elements(by='class name', value='lmc__elementName'):
        country.click()
        # Fixed pause after each click, presumably so the expanded league list
        # can render before the next click -- it is a delay, not a guarantee.
        time.sleep(1)

    archive_urls = []
    seen = set()
    for anchor in driver.find_elements(by='class name', value='lmc__templateHref'):
        href = anchor.get_attribute('href')
        if not href:
            continue
        # Normalise the trailing slash so the suffix is always a separate path
        # segment ('.../abl/' and '.../abl' both become '.../abl/archive').
        url = href.rstrip('/') + '/archive'
        if url not in seen:
            seen.add(url)
            archive_urls.append(url)
    return archive_urls
    
# Run the menu crawl once and keep the archive URLs for the later cells.
leagues = league_links()
# Bare trailing expression: the notebook displays the first five URLs as a sanity check.
leagues[:5]

['https://www.flashscore.com/baseball/australia/abl/archive',
 'https://www.flashscore.com/baseball/austria/bundesliga/archive',
 'https://www.flashscore.com/baseball/belgium/division-1/archive',
 'https://www.flashscore.com/baseball/colombia/lpb/archive',
 'https://www.flashscore.com/baseball/cuba/serie-nacional/archive']

In [31]:
# Persist the collected league archive URLs as a JSON array in leagues.json
with open('leagues.json', mode='w') as leagues_file:
    leagues_file.write(json.dumps(leagues))

## Collecting the links of the last three seasons of each league

In [32]:
def seasons_links(leagues, n_seasons=3):
    """Collect the URLs of the first ``n_seasons`` seasons listed on each archive page.

    Uses the module-level ``driver`` to load each archive URL, then reads the
    ``href`` of every element with class ``archive__text--clickable``. Only
    hrefs containing ``'baseball'`` are treated as season links.

    Unlike a plain "look at the first three links" approach, links that fail
    the ``'baseball'`` filter no longer use up one of the ``n_seasons`` slots,
    so a league yields up to ``n_seasons`` season URLs whenever that many exist.

    Args:
        leagues: iterable of league archive URLs.
        n_seasons: maximum number of season links to keep per league
            (default 3).

    Returns:
        list[str]: absolute season URLs, grouped by league in input order.
        Leagues with no matching links contribute nothing.
    """
    prefix = 'https://www.flashscore.com'
    seasons = []
    for league in leagues:
        driver.get(league)
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed and emits a warning.
        html = BeautifulSoup(driver.page_source, 'html.parser')
        kept = 0
        for link in html.find_all(class_="archive__text--clickable"):
            if kept >= n_seasons:
                break
            # .get() tolerates tags without an href (indexing would raise KeyError).
            href = link.get('href')
            if href and 'baseball' in href:
                seasons.append(prefix + href)
                kept += 1
    return seasons

# Visit every league archive page and keep the resulting season URLs for the later cells.
seasons = seasons_links(leagues)
# Bare trailing expression: the notebook displays the first five URLs as a sanity check.
seasons[:5]

['https://www.flashscore.com/baseball/australia/abl/',
 'https://www.flashscore.com/baseball/australia/abl-2020-2021/',
 'https://www.flashscore.com/baseball/austria/bundesliga/',
 'https://www.flashscore.com/baseball/austria/bundesliga-2022/',
 'https://www.flashscore.com/baseball/belgium/division-1/']

In [33]:
# Persist the collected season URLs as a JSON array in seasons.json
with open('seasons.json', mode='w') as seasons_file:
    seasons_file.write(json.dumps(seasons))

## Collecting match links

In [36]:
def match_links(seasons, max_clicks=100):
    """Build the match-summary URL of every match listed on each season page.

    Uses the module-level ``driver``. For each season URL the page is loaded
    and the element with class ``event__more--static`` is clicked repeatedly
    until it can no longer be found (or ``max_clicks`` is reached). The ``id``
    of every ``event__match`` element is then stripped of its ``g_6_`` marker
    and turned into a match-summary URL.

    Args:
        seasons: iterable of season page URLs.
        max_clicks: upper bound on "show more" clicks per season page
            (default 100), so a page that never stops offering the button
            cannot loop forever.

    Returns:
        list[str]: match-summary URLs in page order, season by season.
        Elements that expose no ``id`` are skipped.
    """
    # Named `links`, not `match_links`, so the local does not shadow this function.
    links = []
    for season_url in seasons:
        driver.get(season_url)
        for _ in range(max_clicks):
            try:
                # Fixed pauses before the lookup and before the click, presumably
                # to let newly loaded rows and the button settle.
                time.sleep(2)
                show_more = driver.find_element(by='class name', value='event__more--static')
                time.sleep(2)
                show_more.click()
            except NoSuchElementException:
                # No button left: the page is showing everything it has.
                break

        for match in driver.find_elements(by='class name', value='event__match'):
            element_id = match.get_attribute('id')
            if not element_id:
                continue
            match_id = element_id.replace('g_6_', '')
            links.append('https://www.flashscore.com/match/' + match_id + '/#/match-summary/match-summary')
    return links

# Crawl every season page and keep the match-summary URLs for the scraping step.
matches = match_links(seasons)
# Bare trailing expression: the notebook displays the first five URLs as a sanity check.
matches[:5]

['https://www.flashscore.com/match/Ycjgf10K/#/match-summary/match-summary',
 'https://www.flashscore.com/match/dK5rrRWT/#/match-summary/match-summary',
 'https://www.flashscore.com/match/2c6vq7HN/#/match-summary/match-summary',
 'https://www.flashscore.com/match/IeaPWN9A/#/match-summary/match-summary',
 'https://www.flashscore.com/match/INHPdrWG/#/match-summary/match-summary']

In [38]:
# Persist the collected match-summary URLs as a JSON array in matches.json
with open('matches.json', mode='w') as matches_file:
    matches_file.write(json.dumps(matches))

## Scraping match information

In [43]:
def scrape_match_info(urls):
    """Scrape the scoreboard of every match page into one DataFrame.

    Uses the module-level ``driver``. Each URL is loaded, followed by a
    one-second pause, and the league header, both participant names, the start
    time and the per-team text of every score cell (final score, innings 1-9,
    extra innings) are read by CSS class.

    If any expected element is missing -- ``find_element`` raising
    ``NoSuchElementException`` or a ``find_elements`` result holding fewer
    than two entries -- every field except ``URL`` is left as an empty string
    for that match, and scraping continues with the next URL.

    Args:
        urls: iterable of match-summary URLs.

    Returns:
        pandas.DataFrame: one row per URL, all values as text, with columns
        ``URL, League, Team1, Team2, Date`` followed by ``<part> Team1`` /
        ``<part> Team2`` for each score part. The columns are present even
        when ``urls`` is empty.
    """
    # (column label, CSS class) of each scoreboard cell pair, in output column order.
    score_parts = [('Match Score', 'smh__part--current')]
    score_parts += [('Inning' + str(n) + ' score', 'smh__part--' + str(n)) for n in range(1, 10)]
    score_parts.append(('Extra Inning score', 'smh__part--x'))

    columns = ['URL', 'League', 'Team1', 'Team2', 'Date']
    for label, _ in score_parts:
        columns += [label + ' Team1', label + ' Team2']

    rows = []
    for url in urls:
        driver.get(url)
        # Fixed pause after navigation, presumably to let the page render its
        # scoreboard -- it is a delay, not a guarantee.
        time.sleep(1)

        # Start from an all-blank row so a failed scrape still yields a full record.
        row = dict.fromkeys(columns, "")
        row['URL'] = url
        try:
            # Collect into a scratch dict first: a failure part-way through must
            # not leave a half-filled row behind.
            scraped = {
                'League': driver.find_element(by='class name', value='tournamentHeader__country').text,
            }
            teams = driver.find_elements(by='class name', value='smh__participantName')
            scraped['Team1'] = teams[0].text
            scraped['Team2'] = teams[1].text
            scraped['Date'] = driver.find_element(by='class name', value='duelParticipant__startTime').text

            for label, css_class in score_parts:
                # One lookup per score part; index 0 is read as Team1 and index 1 as Team2.
                cells = driver.find_elements(by='class name', value=css_class)
                scraped[label + ' Team1'] = cells[0].text
                scraped[label + ' Team2'] = cells[1].text
        except (IndexError, NoSuchElementException):
            # find_element signals absence with NoSuchElementException, list
            # indexing with IndexError; either way keep the blank row.
            pass
        else:
            row.update(scraped)

        rows.append(row)

    return pd.DataFrame(rows, columns=columns)
    
# Scrape every collected match page; this visits each URL in turn with a one-second pause per page.
baseball_matches = scrape_match_info(matches)
# Bare trailing expression: the notebook displays the first rows as a sanity check.
baseball_matches.head()  

Unnamed: 0,URL,League,Team1,Team2,Date,Match Score Team1,Match Score Team2,Inning1 score Team1,Inning1 score Team2,Inning2 score Team1,Inning2 score Team2,Inning3 score Team1,Inning3 score Team2,Inning4 score Team1,Inning4 score Team2,Inning5 score Team1,Inning5 score Team2,Inning6 score Team1,Inning6 score Team2,Inning7 score Team1,Inning7 score Team2,Inning8 score Team1,Inning8 score Team2,Inning9 score Team1,Inning9 score Team2,Extra Inning score Team1,Extra Inning score Team2
0,https://www.flashscore.com/match/Ycjgf10K/#/ma...,AUSTRALIA: ABL - PLAY OFFS - FINAL,Perth,Adelaide,05.02.2023 11:30,2,5,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,2,0,X,,
1,https://www.flashscore.com/match/dK5rrRWT/#/ma...,AUSTRALIA: ABL - PLAY OFFS - FINAL,Perth,Adelaide,04.02.2023 12:30,2,9,0,3,0,0,1,0,0,4,1,0,0,2,0,0,0,0,0,0,,
2,https://www.flashscore.com/match/2c6vq7HN/#/ma...,AUSTRALIA: ABL - PLAY OFFS - FINAL,Adelaide,Perth,03.02.2023 15:00,5,9,0,2,0,0,0,0,0,4,2,1,1,1,2,0,0,1,0,X,,
3,https://www.flashscore.com/match/IeaPWN9A/#/ma...,AUSTRALIA: ABL - PLAY OFFS - SEMI-FINALS,Auckland,Adelaide,29.01.2023 11:30,2,4,0,0,0,0,0,2,0,0,0,2,0,0,0,0,1,0,1,X,,
4,https://www.flashscore.com/match/INHPdrWG/#/ma...,AUSTRALIA: ABL - PLAY OFFS - SEMI-FINALS,Perth,Brisbane,29.01.2023 08:00,10,8,0,3,7,0,0,0,0,0,0,0,2,3,0,1,1,0,0,1,,


In [44]:
# Write the scraped match table to disk as CSV. to_csv keeps its default
# index=True here, so the row index is written as the first, unnamed column.
output_csv = 'Baseball_matches' + '.csv'
baseball_matches.to_csv(output_csv)