In [1]:
import json
import pickle
import re
from random import uniform
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
# Base URL of the TrackWrestling site; the browser is pointed here first and
# relative team-page links are appended to it later in the notebook
WRESTLING_URL = 'https://www.trackwrestling.com'

In [3]:
# Scraped data is placed in its own folder.
# The {} placeholder is filled in with str.format using the running file number (match_Id),
# giving one numbered JSON file per saved set of match results.
OUTPUT_FILE = './ScrapedMatchData/match_info_{}.json'

In [4]:
### Notes:
# This is the daily scrape notebook
# The goal here is to check TrackWrestling for new data entries,
# scrape them if they exist,
# compare them to the already scraped data (deduping),
# and finally add the new entries to the scraped data after they pass deduping checks

In [5]:
# Load the bookkeeping written at the end of the previous scrape


def _read_json(path):
    """Return the decoded contents of the JSON file at `path`."""
    with open(path,'r') as handle:
        return json.load(handle)


# Names of already scraped events
oldEvents = _read_json('./ScrapedMatchData/ScrapedEventNames.json')

# Dates of already scraped events
oldDates = _read_json('./ScrapedMatchData/ScrapedEventDates.json')

# Errors from already scraped events
oldErrors = _read_json('./ScrapedMatchData/ScrapingErrors.json')

# Empty records from already scraped events
oldEmpties = _read_json('./ScrapedMatchData/EmptyRecords.json')

# Number of match data files saved so far (stored as plain text)
with open('./ScrapedMatchData/NumFiles.txt','r') as handle:
    oldFileNum = int(handle.read())

In [6]:
### Remote browsing through selenium
### MAKE SURE TO USE SLEEP SO YOU DON'T GET BLOCKED!!!

# Start a Firefox session driven by selenium
driver = webdriver.Firefox()
# implicitly_wait sets how long (in seconds) element lookups keep retrying before
# giving up; it is NOT a fixed pause between requests -- sleep() is used for that
driver.implicitly_wait(1.5)
# Open the TrackWrestling landing page
driver.get(WRESTLING_URL)

In [7]:
### Following cells navigate from trackwrestling landing page to New York State events pages

In [8]:
# Call the page's own displayMenu() JavaScript to open the 'Browse' menu,
# then pause before the next navigation step
driver.execute_script("displayMenu('subMenu-browse', 'left', 1, 'Browse')")
sleep(5)

In [9]:
# Open the 'Seasons' sub-menu (second menu level), then pause
driver.execute_script("displayMenu('subMenu-seasons', 'left', 2, 'Seasons')")
sleep(3)

In [10]:
# Open the third-level menu for the '2019-20 High School Boys' season, then pause.
# The season id (1428400132) is hard-coded, so this cell must be edited for other seasons.
driver.execute_script("displayMenu('subMenu-seasonOrganizeBy_1428400132', 'left', 3, '2019-20 High School Boys')")
sleep(3)

In [11]:
# Jump to the season's events page through the site's gotoBuildURLPage() JavaScript, then pause.
# NOTE(review): gbId=38 presumably selects New York State (per the navigation note above) -- confirm
driver.execute_script("gotoBuildURLPage('seasons/LoadBalance.jsp?seasonId=1428400132&gbId=38&uname=&pword=')")
sleep(3)

In [12]:
### Save cookies
# A context manager guarantees the pickle file is flushed and closed,
# instead of leaving the handle open until garbage collection
with open("cookies.pkl","wb") as cookie_file:
    pickle.dump(driver.get_cookies(), cookie_file)

In [13]:
### Add cookies for page access
# NOTE: pickle.load can execute arbitrary code stored in the file, so only load
# a cookies.pkl that this notebook wrote itself.
# The context manager closes the file handle as soon as the cookies are read.
with open("cookies.pkl", "rb") as cookie_file:
    cookies = pickle.load(cookie_file)

# Re-attach every saved cookie to the live browser session
for cookie in cookies:
    driver.add_cookie(cookie)

In [14]:
# Compile regex for elements we want to find

# JSON-like payload passed to initDataGrid(50, true, "...") on the events listing page;
# the greedy match runs to the last "]]" on the line
frameDataGrid_regex = re.compile(r'initDataGrid\(50, true, \"(.*\]\])')
# Same idea for the match-results grid, initDataGrid(1000, false, "...")
initDataGrid_regex = re.compile(r'initDataGrid\(1000, false, \"(.*\]\])')
# Event title passed to drawPageHeader("...")
eventName_regex = re.compile(r'drawPageHeader\(\"(.*)\"\)')
# Value of the eventId query parameter.
# [^&]* stops at the next '&', so the capture is the id alone even when several
# parameters follow, and it still matches when eventId is the last parameter
# (the previous greedy '(.*)&' captured everything up to the LAST '&').
eventId_regex = re.compile(r'eventId=([^&]*)')
#teamId_regex = re.compile(r'teamId=(.*)')
# Value of the dualId query parameter (same reasoning as eventId_regex)
dualId_regex = re.compile(r'dualId=([^&]*)')

In [15]:
### Scrapes events page data for comparison against the last scrape's data

# Remember where the events listing lives and which window it is in
events_url = driver.current_url
default_window = driver.window_handles[0]

# The listing itself is rendered inside the 'PageFrame' frame
driver.switch_to.frame('PageFrame')

# Grab the frame's HTML and pull out the first initDataGrid payload
frame_html = driver.execute_script("return document.documentElement.outerHTML")
frame_data = frameDataGrid_regex.findall(frame_html)[0]

# Strip tabs, newlines, carriage returns and backslashes so json.loads accepts the payload
frame_data = re.sub(r'[\t\n\r\\]','',frame_data)
frame_data_list = json.loads(frame_data)

# Column 5 is the event name (events have names, duals are blank);
# column 3 is the date as a YYYYMMDD string
currentEvents = []
currentDates = []
for row in frame_data_list:
    currentEvents.append(row[5].strip())
    currentDates.append(row[3])

# Unique-value views of the same data
# Note: event names may only be unique within a season
currentEventsSet = set(currentEvents)
currentDatesSet = set(currentDates)

# Reloading the listing URL drops us back out of PageFrame
driver.get(events_url)
sleep(uniform(0.5,0.7))

In [16]:
# first: determine names of new events (and all the events that need to be scraped again)
# second: looks like openEvent(i) arg is 3 + (index of event name in currentEvents)
# third: use found openEvent(i) args to scrape only events we need to scrape
# fourth: save this new scraped data in the correct (or new) data file, based on openEvent(i) arg if need be

In [17]:
# New events
# Walk currentEvents once, in page order, keeping the first occurrence of every
# name that was not scraped before. Compared with list(set - set) followed by
# currentEvents.index(...), this yields the same events and the same
# (first-occurrence) indices, but in a reproducible order and in linear time.
# NOTE(review): duals have blank names, so at most one blank-named row is ever
# picked up here, and none once '' is in oldEvents -- confirm this is intended.
alreadyKnown = set(oldEvents)
newEvents = [] # new events to scrape
newEventIndex = [] # positions of new events in currentEvents
for position, name in enumerate(currentEvents):
    if name not in alreadyKnown:
        alreadyKnown.add(name) # later duplicates of this name are skipped
        newEvents.append(name)
        newEventIndex.append(position)

newEventDates = [currentDates[i] for i in newEventIndex] # dates of new events in currentEvents

# Add new event names and dates to lists of event names/dates
# Note: maybe track this differently than appending to a list... may not need to though
eventNames = oldEvents + newEvents
eventDates = oldDates + newEventDates

In [18]:
# Check error info

In [19]:
# Check empty record info; maybe map empty records to their events in a dict for easier tracking?

In [20]:
# Translate positions in currentEvents into the arguments used by the page's
# openEvent(i) JavaScript function (just new events for now).
# The argument is the list position plus 3, to match trackwrestling's JavaScript.
openEventIndex = list(map(lambda position: position + 3, newEventIndex))

In [21]:
### Scrapes info of each match that is new or needs to be updated

# Look into adding more wait time when opening dual window
# Play with length of sleeps, try to minimize


def _pause():
    """Sleep for a short random interval so requests are not fired back to back."""
    sleep(uniform(0.5,0.7))


def _show_all_entries():
    """Change the events grid's page-size box from its default of 50 to 250 rows."""
    # The box is located by its default value of '50'
    page_size_box = driver.find_element(By.XPATH, "//input[@value='50']")
    page_size_box.send_keys(Keys.CONTROL,'a')
    page_size_box.send_keys('250')
    page_size_box.send_keys(Keys.ENTER)


def _parse_match_grid(raw_grid, eventName, eventId, dualId, eventDate):
    """Decode one initDataGrid payload and tag every row with its event info.

    Returns the decoded list of rows. Each row gets eventName, eventId, dualId
    and eventDate appended, in that order.
    """
    # remove control chars and backslashes that make json.loads unhappy
    cleaned = re.sub(r'[\t\n\r\\]','',raw_grid)
    rows = json.loads(cleaned)

    # note order of appended items
    for each_match in rows:
        each_match.extend((eventName, eventId, dualId, eventDate))

    return rows


def _write_match_file(rows, file_number):
    """Write one set of match rows to OUTPUT_FILE and close the file handle."""
    with open(OUTPUT_FILE.format(file_number),'w') as outfile:
        json.dump(rows, outfile)


match_Id = oldFileNum # continue numbering after the last saved data file
error_list = [] # list of dicts
# one entry per opened event: a list of 0/1 flags for team-prompt events,
# or a single 0/1 for events that open in a new window (1 = empty record)
empty_record_list = []

events_url = driver.current_url
default_window = driver.window_handles[0]

driver.switch_to.frame('PageFrame')

# Show all 250 entries for easier scraping
_show_all_entries()

# openEventIndex and newEventDates are parallel lists, so pair them up directly
# instead of looking the date up by index on every iteration
for i, eventDate in zip(openEventIndex, newEventDates): # check data that needs to get checked (new,empty,error,etc?)

    # openEvent either opens the match results in new window or asks user to pick a specific team
    driver.execute_script("openEvent({})".format(i))
    _pause()

    if len(driver.window_handles) == 1: # user prompted; need to loop through various teams

        frame_html = driver.execute_script("return document.documentElement.outerHTML")
        frame_soup = BeautifulSoup(frame_html,'html.parser')

        all_links = [str(link.get('href')) for link in frame_soup.find_all('a')]
        max_link_length = max(map(len,all_links)) # the links we want will always be the longest in the page
        team_links = [link for link in all_links if len(link)==max_link_length]
        _pause()

        empties = [] # 1 if link is empty, 0 if it has data

        for link in team_links:

            driver.get(WRESTLING_URL+'/seasons/'+link)
            _pause()

            match_html = driver.execute_script("return document.documentElement.outerHTML")

            eventName = eventName_regex.findall(match_html)

            if not eventName: # error page, save error info and back out

                bad_url = driver.current_url
                error_list.append({'URL':bad_url,'Event Number':i,'HTML':match_html}) # can record more info as desired

                driver.execute_script("window.history.go(-1)")
                _pause()
                default_window = driver.window_handles[0]
                _pause()
                driver.switch_to.frame('PageFrame')
                continue

            # real page, all good
            eventName = eventName[0].strip()
            #teamId = teamId_regex.findall(link)[0]
            eventId = eventId_regex.findall(link)[0]
            dualId = None

            match_data = initDataGrid_regex.findall(match_html)

            if not match_data: # empty record, save to check later
                empties.append(1)
                continue

            # data exists, scrape it
            empties.append(0)
            match_data_list = _parse_match_grid(match_data[0], eventName, eventId, dualId, eventDate)

            match_Id += 1
            _write_match_file(match_data_list, match_Id)

        # add empties to empty record list
        empty_record_list.append(empties)

        # return to events page ready to open next event
        driver.get(events_url)
        _pause()
        default_window = driver.window_handles[0]
        _pause()
        driver.switch_to.frame('PageFrame')
        _pause()

        # Show all 250 entries for easier scraping
        _show_all_entries()

    else: # match info opened in new window

        driver.switch_to.window(default_window)
        # default_content is a method; without the call parentheses it did nothing
        driver.switch_to.default_content()

        match_window = driver.window_handles[1] # remember new window
        _pause()

        driver.switch_to.window(match_window) # switch to new window
        _pause()

        match_html = driver.execute_script("return document.documentElement.outerHTML")
        link = driver.current_url

        eventName = eventName_regex.findall(match_html)

        if not eventName: # not all good, record error info

            error_list.append({'URL':link,'Event Number':i,'HTML':match_html}) # can record more info as desired

        else: # real page, all good

            eventName = eventName[0].strip()
            #teamId = teamId_regex.findall(link)[0]
            dualId = dualId_regex.findall(link)[0]
            eventId = None

            match_data = initDataGrid_regex.findall(match_html)

            if not match_data: # empty record, save to check later

                empty_record_list.append(1)

            else: # data exists, scrape it

                empty_record_list.append(0)
                match_data_list = _parse_match_grid(match_data[0], eventName, eventId, dualId, eventDate)

                match_Id += 1
                _write_match_file(match_data_list, match_Id)

        # closes new window and returns to events page to open next event
        # NOTE(review): after this the driver is on the events window but has not
        # re-entered 'PageFrame' -- confirm openEvent() is still reachable from here
        driver.close()
        driver.switch_to.window(default_window)
        _pause()
        

In [22]:
# End the selenium session now that scraping is finished
driver.quit()

In [23]:
# Persist the running count of match data files so the next scrape can resume numbering

num_files_path = './ScrapedMatchData/NumFiles.txt'
with open(num_files_path,'w') as counter_file:
    counter_file.write('{}'.format(match_Id))
    

In [28]:
# Save number of NEW individual JSON data files for later use
# match_Id starts at oldFileNum and is incremented once for every data file
# written, so the difference is the number of files actually created in this run.
# (len(openEventIndex) only counted the events that were opened: it missed the
# extra files of multi-team events and counted empty or errored events.)

NumNewFiles = match_Id - oldFileNum

with open('./ScrapedMatchData/NumNewFiles.txt','w') as NumFile:
    NumFile.write(str(NumNewFiles))
    

In [25]:
# Save names and dates of scraped events for later use
# Context managers make sure each file is flushed and closed once written

with open('./ScrapedMatchData/ScrapedEventNames.json','w') as outfile:
    json.dump(eventNames, outfile)

with open('./ScrapedMatchData/ScrapedEventDates.json','w') as outfile:
    json.dump(eventDates, outfile)


In [26]:
# Save empty record info so we know to check them for updates later

### UNCOMMENT ONCE SUPPORTING EMPTY RECORD CHECKING

#json.dump(empty_record_list,
#            open('./ScrapedMatchData/EmptyRecords.json','w'))


In [27]:
# Save error info for later investigation

### UNCOMMENT ONCE SUPPORTING ERROR CHECKING

#json.dump(error_list,
#            open('./ScrapedMatchData/ScrapingErrors.json','w'))