# Final Scraping Code

In [1]:
import time
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
import pandas as pd

def go_to_commentary(start_url,num,pg_num,output_dir='/Users/murali/code/gyandata'):
    '''
    function to automate mouse operations that lead from start_url to the commentary for all matches on that webpage

    For every other menu icon of the results table (indices 1, 3, ... below num) a fresh
    Chrome session opens start_url, hovers the icon and follows 'Match scorecard'. When the
    match status reads "Result" it also follows 'Commentary' and passes the resulting URL,
    the venue and the date to scrape_data(). The rows returned for all matches are
    accumulated and written to <output_dir>/table<pg_num>.csv.

    start_url  : URL of the results listing page
    num        : exclusive upper bound for the icon indices to visit
    pg_num     : page number, only used to name the csv file
    output_dir : directory that receives the csv file

    Returns None. Nothing is written when num <= 1 (no icon index to visit).
    '''
    import os  # function-scope import: only needed to build the csv path

    df = pd.DataFrame(columns=['Date', 'Venue', 'Team_1', 'Team_2', 'Innings_no', 'Overs', 'Ball_Event', 'Comment'])

    # javascript that detaches the element passed as arguments[0] from the page
    remove_node_js = """
        var element = arguments[0];
        element.parentNode.removeChild(element);
        """

    # Iterating over alternate rows present on that webpage (matches are repeated)
    for i in range(1,num,2):
        match_info = None   # (commentary url, venue, date) once a finished match is found
        driver = webdriver.Chrome(ChromeDriverManager().install())
        try:
            act = ActionChains(driver)
            driver.get(start_url)

            # Removing ads from the RHS of the webpage using javascript code
            ad = driver.find_element_by_id('ciHomeContentrhs')
            driver.execute_script(remove_node_js, ad)

            block = driver.find_element_by_xpath("//*[@id='ciHomeContentlhs']/div[3]/table[3]")
            elements = block.find_elements_by_tag_name('img') # finds all images in the specified table
            # NOTE: first image is the black up arrow, use from the second element to get all the hidden menu image icons

            if i >= len(elements):
                # The table holds fewer icons than num asked for (presumably a short page -
                # confirm). Stop here so the rows gathered so far still get saved below.
                print('Only', len(elements), 'icons found, stopping at index', i)
                break

            act.move_to_element(elements[i]).perform()
            scorecard = driver.find_element_by_link_text('Match scorecard')
            act.click(scorecard).perform()         # clicks match scorecard link
            elt = driver.find_element_by_class_name("cscore_link--button")
            status_text = elt.find_element_by_class_name("cscore_time").text
            print(status_text)

            # Leaving out abandoned and no-result matches
            if status_text=="Result":
                stadium = driver.find_element_by_xpath \
                    ("//*[@id='main-container']/div/div[3]/div[1]/div[6]/article/div/div/div[1]/h4/a/span")
                stadium = stadium.text             # getting venue details
                elt = driver.find_element_by_class_name("cscore_link--button")
                date = elt.find_element_by_class_name("cscore_info-overview")
                date_text = date.text.split(',')[2]  # getting date
                commentary = driver.find_element_by_link_text('Commentary')
                act.click(commentary).perform()        # clicks commentary link
                match_info = (driver.current_url, stadium, date_text)
        finally:
            # Always release the browser: skipped matches and errors must not leave Chrome running
            driver.quit()

        if match_info is not None:
            # scrape_data() opens its own browser on the url, so ours is already closed here
            df_match = scrape_data(*match_info) # function call to scrape_data() for every match
            df = pd.concat([df, df_match])
            print("Rows in dataframe = ",df.shape[0])

    if num > 1:    # the loop ran, i.e. all reachable matches on this page have been covered
        print('Saving to csv file')
        file_path = os.path.join(output_dir, 'table'+str(pg_num)+'.csv')
        df.to_csv(file_path)


def scrape_data(url,stadium,date):
    '''
    function to visit the url passed and scrape the commentary for every ball for both the innings and return a df

    url     : commentary page of one match
    stadium : venue text, copied into the 'Venue' column of every row
    date    : date text, copied into the 'Date' column of every row

    Returns a DataFrame with the columns Date, Venue, Team_1, Team_2, Innings_no, Overs,
    Ball_Event and Comment. Rows of innings 2 come first, followed by innings 1. The frame
    is empty when fewer than two team names could be read or when either team is not a
    key of team_abbrev.
    '''
    columns = ['Date', 'Venue', 'Team_1', 'Team_2', 'Innings_no', 'Overs', 'Ball_Event', 'Comment']
    team_abbrev = {'South Africa': 'SA', 'Australia': 'AUS', 'Pakistan': 'PAK', 'West Indies': 'WI', 'Sri Lanka': 'SL',
                    'England':'ENG','Bangladesh':'BDESH','New Zealand':'NZ','Afghanistan':'AFG','India':'INDIA'}
    rows = []   # one list per ball; turned into a DataFrame once at the end

    driver2 = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver2.get(url)

        # Obtaining team names
        name_texts = [team.text for team in driver2.find_elements_by_class_name('cscore_name--long')]
        team_names = [name for name in name_texts if name]
        time.sleep(2)

        #leaving out matches of teams other than those in team_abbrev dictionary
        # (and pages on which two team names could not be read at all)
        if len(team_names) >= 2 and team_names[0] in team_abbrev and team_names[1] in team_abbrev:
            # Code to extract ball-by-ball status and commentary (first for innings 2)
            rows.extend(_scrape_innings_rows(driver2, date, stadium, team_names, 2))

            new_url = driver2.current_url + '?innings=1'
            driver2.get(new_url)

            # Code to extract ball-by-ball status and commentary (now for innings 1)
            rows.extend(_scrape_innings_rows(driver2, date, stadium, team_names, 1))
    finally:
        # Close the browser even when an element lookup fails half way through
        driver2.quit()

    # Building the frame in one go avoids the per-row DataFrame.append (quadratic, and
    # no longer available in pandas 2.x)
    df = pd.DataFrame(rows, columns=columns)
    print(df.head())
    return df


def _scrape_innings_rows(driver, date, stadium, team_names, innings):
    '''
    helper for scrape_data(): scroll the page currently loaded in driver and return one
    [date, venue, team 1, team 2, innings, over, ball event, comment] list per 'item-wrapper' element
    '''
    driver.maximize_window()

    # scrolling till we reach the end of the page
    page = driver.find_element_by_tag_name('html')
    for _ in range(15):
        page.send_keys(Keys.END)
        time.sleep(2)

    wrappers = driver.find_elements_by_class_name('item-wrapper')
    print(len(wrappers))

    innings_rows = []
    for wrapper in wrappers:
        over_value = wrapper.find_element_by_class_name('time-stamp').text
        ball_event = wrapper.find_element_by_class_name('over-score').text
        comment_made = wrapper.find_element_by_class_name('description').text
        innings_rows.append([date,stadium,team_names[0],team_names[1],innings,over_value,ball_event,comment_made])
    return innings_rows



In [3]:
# Exclusive upper bound handed to go_to_commentary() as its icon-index limit
num_matches_per_page = 50

# The results url is the same for every page apart from the page number in the middle
url_head = 'http://stats.espncricinfo.com/ci/engine/stats/index.html?class=2;orderby=start;page='
url_tail = ';template=results;type=team;view=results'

# Scrape results pages 170 and 171, one csv file per page
for page_no in (170, 171):
    start_url = ''.join([url_head, str(page_no), url_tail])
    go_to_commentary(start_url, num_matches_per_page, page_no)
    print("csv file created for page ",page_no)



Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
Result

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
307
294
          Date                                              Venue Team_1  \
0   Mar 5 2019  Vidarbha Cricket Association Stadium, Jamtha, ...  India   
1   Mar 5 2019  Vidarbha Cricket Association Stadium, Jamtha, ...  India   
2   Mar 5 2019  Vidarbha Cricket Association Stadium, Jamtha, ...  India   
3   Mar 5 2019  Vidarbha Cricket Association Stadium, Jamtha, ...  India   
4   Mar 5 2019  Vidarbha Cricket Association Stadium, Jamtha, ...  India   

      Team_2 Innings_no Overs Ball_Event  \
0  Australia          2  49.3          W   
1  Australia          2  49.2          2   
2  Australia          2  49.1          W   
3  Australia          2  48.6          4   
4  Australia          2  48.5         

4          1  Stoinis to Kuldeep Yadav, 1 run, clobbered, on...  
Rows in dataframe =  3335

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
Result

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
199
244
           Date                             Venue     Team_1        Team_2  \
0   Mar 13 2019  St George's Park, Port Elizabeth  Sri Lanka  South Africa   
1   Mar 13 2019  St George's Park, Port Elizabeth  Sri Lanka  South Africa   
2   Mar 13 2019  St George's Park, Port Elizabeth  Sri Lanka  South Africa   
3   Mar 13 2019  St George's Park, Port Elizabeth  Sri Lanka  South Africa   
4   Mar 13 2019  St George's Park, Port Elizabeth  Sri Lanka  South Africa   

  Innings_no Overs Ball_Event  \
0          2  32.5        1lb   
1          2  32.4          1   
2          2  32.3          0   
3          2  32.2    

4  Richardson to Imad Wasim, 1 wide, too short, t...  
Rows in dataframe =  7246

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
Result

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
Empty DataFrame
Columns: [Date, Venue, Team_1, Team_2, Innings_no, Overs, Ball_Event, Comment]
Index: []
Rows in dataframe =  7246

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
Result

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
Empty DataFrame
Columns: [Date, Venue, Team_1, Team_2, Innings_no, Overs, Ball_Event, Comment]
Index: []
Rows in dataframe =  7246

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/

4  Roach to Mahmudullah, no run, fullish, angling...  
Rows in dataframe =  1212

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
Result

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
278
309
           Date                   Venue    Team_1   Team_2 Innings_no Overs  \
0   May 14 2019  County Ground, Bristol  Pakistan  England          2  44.5   
1   May 14 2019  County Ground, Bristol  Pakistan  England          2  44.4   
2   May 14 2019  County Ground, Bristol  Pakistan  England          2  44.3   
3   May 14 2019  County Ground, Bristol  Pakistan  England          2  44.2   
4   May 14 2019  County Ground, Bristol  Pakistan  England          2  44.1   

  Ball_Event                                            Comment  
0          1  Asif Ali to Morgan, 1 run, pushed into the cov...  
1          1  Asif Ali to A

4  13.1         1w  Wahab Riaz to Hetmyer, 1 wide, shapes to pull ...  
Rows in dataframe =  4058

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
Result

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
100
186
          Date                    Venue     Team_1       Team_2 Innings_no  \
0   Jun 1 2019  Sophia Gardens, Cardiff  Sri Lanka  New Zealand          2   
1   Jun 1 2019  Sophia Gardens, Cardiff  Sri Lanka  New Zealand          2   
2   Jun 1 2019  Sophia Gardens, Cardiff  Sri Lanka  New Zealand          2   
3   Jun 1 2019  Sophia Gardens, Cardiff  Sri Lanka  New Zealand          2   
4   Jun 1 2019  Sophia Gardens, Cardiff  Sri Lanka  New Zealand          2   

  Overs Ball_Event                                            Comment  
0  16.1          1  BMAJ Mendis to Munro, 1 run, gentle nudge behi...  
1  1

4  46.5          0  Mohammad Saifuddin to Ferguson, no run, length...  
Rows in dataframe =  7663

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
Result

Checking for mac64 chromedriver:75.0.3770.90 in cache
Driver found in /Users/murali/.wdm/chromedriver/75.0.3770.90/mac64/chromedriver
307
315
          Date                     Venue     Team_1       Team_2 Innings_no  \
0   Jun 6 2019  Trent Bridge, Nottingham  Australia  West Indies          2   
1   Jun 6 2019  Trent Bridge, Nottingham  Australia  West Indies          2   
2   Jun 6 2019  Trent Bridge, Nottingham  Australia  West Indies          2   
3   Jun 6 2019  Trent Bridge, Nottingham  Australia  West Indies          2   
4   Jun 6 2019  Trent Bridge, Nottingham  Australia  West Indies          2   

  Overs Ball_Event                                            Comment  
0  49.6          4  Coulter-Nile to Nurse, FOUR runs, another firm... 