In [1]:
import json
import pickle
import re
from random import uniform
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
# Root of the site being scraped; relative links found on its pages are joined onto this
WRESTLING_URL = "https://www.trackwrestling.com"

In [3]:
# Scraped data goes in its own folder; the {} placeholder is filled in with
# a running file number via OUTPUT_FILE.format(n)
OUTPUT_FILE = "./ScrapedMatchData/match_info_{}.json"

In [4]:
### Notes:
# The events page caps at 250 events, which is roughly the last week of results.
# We can't keep paging further back the way the Indeed page-number code does.
# Maybe we need to send specific search params by date to reach older events?

In [5]:
### Remote browsing through selenium
### MAKE SURE TO USE SLEEP SO YOU DON'T GET BLOCKED!!!

# Start a Selenium-controlled Firefox session; every later cell drives this `driver`
driver = webdriver.Firefox()
# NOTE(review): per the Selenium docs an implicit wait is the maximum time element
# lookups keep retrying before failing, not a fixed pause between actions, so it
# does not replace the explicit sleep() calls used below -- confirm 1.5 s is enough
driver.implicitly_wait(1.5)
# Load the landing page; the following cells navigate onward from here
driver.get(WRESTLING_URL)

In [6]:
### Following cells navigate from trackwrestling landing page to New York State events pages

In [7]:
# Step 1: call the page's own displayMenu() JavaScript for the 'Browse' sub-menu,
# then pause before issuing the next call
browse_menu_js = "displayMenu('subMenu-browse', 'left', 1, 'Browse')"
driver.execute_script(browse_menu_js)
sleep(5)

In [8]:
# Step 2: call displayMenu() for the 'Seasons' sub-menu (menu level 2), then pause
seasons_menu_js = "displayMenu('subMenu-seasons', 'left', 2, 'Seasons')"
driver.execute_script(seasons_menu_js)
sleep(3)

In [9]:
# Step 3: call displayMenu() for the '2019-20 High School Boys' season
# (the season id 1428400132 is hard-coded in the sub-menu name), then pause
season_menu_js = "displayMenu('subMenu-seasonOrganizeBy_1428400132', 'left', 3, '2019-20 High School Boys')"
driver.execute_script(season_menu_js)
sleep(3)

In [10]:
# Step 4: ask the page's gotoBuildURLPage() helper to load the season page
# for seasonId=1428400132 / gbId=38 (empty uname and pword), then pause
season_page_js = "gotoBuildURLPage('seasons/LoadBalance.jsp?seasonId=1428400132&gbId=38&uname=&pword=')"
driver.execute_script(season_page_js)
sleep(3)

In [11]:
### Save cookies
# Use a context manager so the pickle file is flushed and closed as soon as it is written
with open("cookies.pkl","wb") as cookie_file:
    pickle.dump(driver.get_cookies(), cookie_file)

In [12]:
### Add cookies for page access
# SECURITY NOTE: pickle.load can execute arbitrary code from the file it reads.
# Only load a cookies.pkl that this notebook wrote itself, never one from an untrusted source.
with open("cookies.pkl", "rb") as cookie_file:
    cookies = pickle.load(cookie_file)

# Hand each saved cookie back to the live browser session
for cookie in cookies:
    driver.add_cookie(cookie)

In [13]:
# Compile regex for elements we want to find

# Grid data is embedded in the page source as a quoted argument of an
# initDataGrid(...) call; capture from just after the opening quote through
# the last ']]' on that line.
frameDataGrid_regex = re.compile(r'initDataGrid\(50, true, \"(.*\]\])')
initDataGrid_regex = re.compile(r'initDataGrid\(1000, false, \"(.*\]\])')

# Event title is the quoted argument of drawPageHeader("...")
eventName_regex = re.compile(r'drawPageHeader\(\"(.*)\"\)')

# Query-string ids. The value stops at the first '&' (or '#'), so parameters
# that follow the id are never swallowed into it, and an id that is the last
# parameter in the URL still matches.
eventId_regex = re.compile(r'eventId=([^&#]*)')
#teamId_regex = re.compile(r'teamId=(.*)')
dualId_regex = re.compile(r'dualId=([^&#]*)')

In [14]:
### Scrapes info of each match for every event/dual in the last 250 entries to trackwrestling
### This runs first for initial data, then we use daily scrape notebook to update

# Look into adding more wait time when opening dual window
# Play with length of sleeps, try to minimize
# Current runtime: ~40 minutes

# JavaScript that returns the full HTML of whichever document the driver is focused on
PAGE_SOURCE_JS = "return document.documentElement.outerHTML"


def _polite_pause():
    """Sleep for a random 0.5-0.7 s so requests are not fired back-to-back."""
    sleep(uniform(0.5,0.7))


def _parse_grid(raw_grid):
    """Decode a grid string captured by one of the initDataGrid regexes.

    Tabs, newlines, carriage returns and backslashes are stripped first
    because they make json.loads unhappy.
    """
    return json.loads(re.sub(r'[\t\n\r\\]','',raw_grid))


def _show_all_entries():
    """Type 250 into the page-size box (found by its default value of 50) and submit."""
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
    # Selenium releases no longer provide
    page_size_box = driver.find_element(By.XPATH, "//input[@value='50']")
    page_size_box.send_keys(Keys.CONTROL,'a')
    page_size_box.send_keys('250')
    page_size_box.send_keys(Keys.ENTER)


def _tagged_matches(match_html, eventName, eventId, dualId, eventDate):
    """Return the match rows embedded in match_html, or None if the page has no grid.

    Every row gets eventName, eventId, dualId and eventDate appended, in that order.
    """
    raw_grids = initDataGrid_regex.findall(match_html)
    if not raw_grids:
        return None
    rows = _parse_grid(raw_grids[0])
    for row in rows:
        row.extend([eventName, eventId, dualId, eventDate])
    return rows


def _write_json(payload, path):
    """Serialize payload to path as JSON, closing the file when done."""
    with open(path,'w') as out_file:
        json.dump(payload, out_file)


match_Id = 0 # running count of JSON files written; also used as the file number
error_list = [] # list of dicts
# One entry per processed event (1 = no match grid found, 0 = data scraped):
#   - team-picker events append a list with one flag per team page that loaded
#   - events opened in a new window append a single flag
# Pages recorded in error_list contribute no flag.
empty_record_list = []

events_url = driver.current_url
default_window = driver.window_handles[0]

driver.switch_to.frame('PageFrame')

# Collect all event names and dates for timestamping
frame_html = driver.execute_script(PAGE_SOURCE_JS)
frame_data = frameDataGrid_regex.findall(frame_html)
if not frame_data:
    raise RuntimeError('Events grid not found in PageFrame; is the driver on the events page?')
frame_data_list = _parse_grid(frame_data[0])

eventNames = [data[5].strip() for data in frame_data_list] # events have names, duals are blank
eventDates = [data[3] for data in frame_data_list] #string in YYYYMMDD format

# Show all 250 entries for easier scraping
_show_all_entries()

# Check up to 250 events on the page, but never index past the dates we parsed
for i in range(min(250, len(eventDates))):

    # openEvent either opens the match results in new window or asks user to pick a specific team
    eventDate = eventDates[i]
    driver.execute_script("openEvent({})".format(i))
    _polite_pause()

    if len(driver.window_handles) == 1: # user prompted; need to loop through various teams

        frame_html = driver.execute_script(PAGE_SOURCE_JS)
        frame_soup = BeautifulSoup(frame_html,'html.parser')

        all_links = [str(link.get('href')) for link in frame_soup.find_all('a')]
        # the links we want will always be the longest in the page;
        # default=0 keeps a page with no anchors from raising
        max_link_length = max(map(len,all_links), default=0)
        team_links = [link for link in all_links if len(link)==max_link_length]
        _polite_pause()

        empties = [] # 1 if link is empty, 0 if it has data

        for link in team_links:

            driver.get(WRESTLING_URL+'/seasons/'+link)
            _polite_pause()

            match_html = driver.execute_script(PAGE_SOURCE_JS)
            eventName = eventName_regex.findall(match_html)

            if not eventName: # error page, save error info and back out

                bad_url = driver.current_url
                error_list.append({'URL':bad_url,'Event Number':i,'HTML':match_html}) # can record more info as desired

                driver.execute_script("window.history.go(-1)")
                _polite_pause()
                default_window = driver.window_handles[0]
                _polite_pause()
                driver.switch_to.frame('PageFrame')
                continue

            # real page, all good
            eventName = eventName[0].strip()
            #teamId = teamId_regex.findall(link)[0]
            eventId = eventId_regex.findall(link)[0]
            dualId = None

            match_data_list = _tagged_matches(match_html, eventName, eventId, dualId, eventDate)

            if match_data_list is None: # empty record, save to check later
                empties.append(1)
            else: # data exists, save it
                empties.append(0)
                match_Id += 1
                _write_json(match_data_list, OUTPUT_FILE.format(match_Id))

        # add empties to empty record list
        empty_record_list.append(empties)

        # return to events page ready to open next event
        driver.get(events_url)
        _polite_pause()
        default_window = driver.window_handles[0]
        _polite_pause()
        driver.switch_to.frame('PageFrame')
        _polite_pause()

        # Show all 250 entries for easier scraping
        _show_all_entries()

    else: # match info opened in new window

        # NOTE(review): the original followed this with `driver.switch_to.default_content`
        # (no parentheses), which never called anything. It is dropped here so the
        # frame state is exactly what it was before; confirm whether an explicit
        # frame reset is actually wanted.
        driver.switch_to.window(default_window)

        match_window = driver.window_handles[1] # remember new window
        _polite_pause()

        driver.switch_to.window(match_window) # switch to new window
        _polite_pause()

        match_html = driver.execute_script(PAGE_SOURCE_JS)
        link = driver.current_url

        eventName = eventName_regex.findall(match_html)

        if not eventName: # not all good, record error info

            error_list.append({'URL':link,'Event Number':i,'HTML':match_html}) # can record more info as desired

        else: # real page, all good

            eventName = eventName[0].strip()
            #teamId = teamId_regex.findall(link)[0]
            dualId = dualId_regex.findall(link)[0]
            eventId = None

            match_data_list = _tagged_matches(match_html, eventName, eventId, dualId, eventDate)

            if match_data_list is None: # empty record, save to check later
                empty_record_list.append(1)
            else: # data exists, save it
                empty_record_list.append(0)
                match_Id += 1
                _write_json(match_data_list, OUTPUT_FILE.format(match_Id))

        # closes new window and returns to events page to open next event
        driver.close()
        driver.switch_to.window(default_window)
        _polite_pause()
        

In [15]:
# Scraping is finished: end the Selenium session opened at the top of the notebook
driver.quit()

In [16]:
# Record how many individual JSON files were written (the final match_Id) for later use
num_files_path = './ScrapedMatchData/NumFiles.txt'
file_count_text = str(match_Id)

with open(num_files_path,'w') as count_file:
    count_file.write(file_count_text)
    

In [17]:
# Save names and dates of scraped events for later use
# (context managers make sure each file is flushed and closed once written)

with open('./ScrapedMatchData/ScrapedEventNames.json','w') as names_file:
    json.dump(eventNames, names_file)

with open('./ScrapedMatchData/ScrapedEventDates.json','w') as dates_file:
    json.dump(eventDates, dates_file)


In [18]:
# Save empty record info so we know to check them for updates later
# (context manager makes sure the file is flushed and closed once written)

with open('./ScrapedMatchData/EmptyRecords.json','w') as empties_file:
    json.dump(empty_record_list, empties_file)


In [19]:
# Save error info to investigate later
# Each entry carries the failing URL, the event number and the full page HTML,
# so this file can get large.
# (context manager makes sure the file is flushed and closed once written)

with open('./ScrapedMatchData/ScrapingErrors.json','w') as errors_file:
    json.dump(error_list, errors_file)