In [1]:
import json
import requests
from bs4 import BeautifulSoup

import re

from selenium import webdriver

import pandas as pd

from random import uniform
from time import sleep

In [2]:
WRESTLING_URL = 'https://www.trackwrestling.com'

In [3]:
def cookie_parser(cookie_string):
    ''' 
    Takes in cookie_string from webpage network request headers and reformats it 
    in dictionary form for use as requests.get() cookies parameter
    '''
    
    # Create empty dictionary 
    cookie_dict = {}
    
    # Extracts key-value pairs
    cookie_keys_and_vals = cookie_string.split(';')
    
    # Separates each key and value from its pair and inserts them into cookie_dict dictionary
    for cookie in cookie_keys_and_vals:
        
        split_pair = cookie.split('=',maxsplit=1) # set maxsplit to avoid '=' in cookie value
        cookie_name = split_pair[0].strip() # strip() removes leading/ending whitespace
        
        # Look for cookie names with no values
        if len(split_pair)==2:
            cookie_value = split_pair[1].strip()
        else:
            cookie_value = ''
        
        # Adds cookie to dictionary
        cookie_dict[cookie_name] = cookie_value
        
    return(cookie_dict)

In [12]:
### Makes cookies to send in request for 2019-2020 NYS High School Boys Wrestling events table
### url: www.trackwrestling.com/seasons/Results.jsp?TIM=1580680176497&twSessionId=aipxyairxr

# raw_cookies are currently copy-pasted; look into scraping them in the future
raw_cookies = '__cfduid=d20db16b722895888e7667a900e31dfbe1579812112; _vwo_uuid_v2=DCC7266DF88C4EB2060E32684740C3214|a420f29d8e0c86e09c05da82b375d0a4; _hjid=9c0d8409-1d58-4e44-8210-90e1bc68af8e; _vwo_uuid=D444C2B39AAEFFC0BB511F3DD82A5D057; _vwo_ds=3%3Aa_0%2Ct_0%3A0%241579812112%3A42.36586452%3A%3A%3A4_0%2C3_0%3A0; _ga=GA1.2.1156555559.1579812115; __qca=P0-2030424604-1579812114847; intercom-id-p4zi0yg9=696e874c-0c6f-471e-89d1-59a32a057909; intercom-session-p4zi0yg9=; __gads=ID=c09932a97005280d:T=1579812116:S=ALNI_MYjvSlVrVU3OVdca9jRWExWIUXJYQ; _1ci_7ag23o86kjasbfd=5d12c791-41fb-11ea-ab95-0b032cabaca3; _MXBj_SURpRlk=33a46ce0-458c-3437-aae2-cf740befb847; _vis_opt_s=4%7C; _vis_opt_test_cookie=1; OX_plg=pm; CRISPKEY=63e5094317006c4bd9675e3; CRISPSUBNO=dbfd5e178002d01cf5b0ef00cdd0460b; _dd_pktn_i=C/1580688794/1580235844/hdvktpgfraalpdm7ym44e3cthbpkxq/2a3b53ff7104af07a5fc4d2fa5b3fc1979ac8135/cncabwc7/129.236.143.187; USER_SESSIONID=C745F9BB61093022BA41D3791EA82379-n5; _gid=GA1.2.726204881.1580911405; _dc_gtm_UA-38689907-1=1; _vwo_sn=1099291%3A10'
baked_cookies = cookie_parser(raw_cookies)

In [18]:
### Params for using requests.get()

# Params currently copy-pasted; look into scraping them in the future
# Known bug: TIM and/or twSessionId params vary, cannot use old values (quick copy-paste fix for now)
get_params = {
    'TIM': 1580911440115,
    'twSessionId': 'lmvgwklcwx',
    'loadBalanced': 'true',
    'gbId': 38,
    'seasonId': 1428400132
}

In [19]:
events_request = requests.get(url=WRESTLING_URL+'/seasons'+'/Results.jsp',
                              params=get_params,
                              cookies=baked_cookies)

In [20]:
# Use regex to extract contents of initDataGrid string from events page response
events_response = events_request.text
initDataGrid_regex = re.compile(r'initDataGrid\(50, true, \"(.*\]\])')
events_data = initDataGrid_regex.findall(events_response)[0]
events_data = re.sub(r'\\','',events_data)
events_data_list = json.loads(events_data)

# Unsure of what every item in each list is. First is eventId, third/fourth is date, 
# then event name, state, and lots of blank entries

In [21]:
len(events_data_list)
### Notes:
# Events page caps at 250 events, which is roughly the last 5 days. Can't keep scrolling like the indeed page number code
# Maybe need to send specific search params by date?
# unrelated: found creation of TIM parameter in code: TIM="+Math.floor(Math.random()*100000)

250

In [22]:
events_data_list

[['5186408132',
  '1',
  'I',
  '20200205',
  '20200205',
  'Multi-Meet at Edgemont 2/5/20',
  'NY',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '0',
  '0',
  '20200205',
  None,
  None,
  None,
  None,
  '',
  '',
  'http://www.trackwrestling.com/tw/uploads/SS-6173527009-New_Panther_Logo_NO_BORDER_(1).png',
  '#000080',
  '#ffffff',
  '',
  ''],
 ['5260473132',
  '0',
  '',
  '20200205',
  '20200205',
  '',
  'NY',
  '1',
  '778181135',
  'Rome Free Academy',
  'NY',
  '30',
  '778130135',
  'Oneida Sr HS',
  'NY',
  '28',
  '0',
  '0',
  '20200205',
  None,
  '',
  '',
  '',
  None,
  None,
  '',
  '#000080',
  '#FFFFFF',
  '#000080',
  '#ffffff'],
 ['5002005132',
  '1',
  'D',
  '20200205',
  '20200205',
  'Tri Meet vs Stepinac & Iona',
  'NY',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '0',
  '0',
  '20200205',
  None,
  None,
  None,
  None,
  '',
  '',
  '',
  '#000080',
  '#FFFFFF',
  '',
  ''],
 ['5271005132',
  '0',
  '',
  '20200204',
  '20200204',
  

In [23]:
### Remote browsing through selenium

driver = webdriver.Firefox()
driver.implicitly_wait(uniform(0.1,0.5)) # force random wait time between 100-500 ms
driver.get(WRESTLING_URL)

In [None]:
### The following cells navigate from trackwrestling landing page to New York State meets pages

In [24]:
driver.execute_script("displayMenu('subMenu-browse', 'left', 1, 'Browse')")

In [25]:
driver.execute_script("displayMenu('subMenu-seasons', 'left', 2, 'Seasons')")

In [27]:
driver.execute_script("displayMenu('subMenu-seasonOrganizeBy_1428400132', 'left', 3, '2019-20 High School Boys')")

In [28]:
driver.execute_script("gotoBuildURLPage('seasons/LoadBalance.jsp?seasonId=1428400132&gbId=38&uname=&pword=')")

In [30]:
driver.execute_script("openEvent(0)") # Doesn't work. Need to switch to in-page Results frame to send this command

JavascriptException: Message: ReferenceError: openEvent is not defined


In [None]:
'''
Using click-thru URL and parameters:

./seasons/EventMatches.jsp?TIM=1580219027557&twSessionId=hkftajyxbl&eventId=5179344132&teamId=777834135

Primary issue: getting 'could not determine which season' error pop-up, don't know how to pass along that info
- Sending Cookies?
'''

# FOR PSAL: Maspeth Mats


# Just first team for now
team_Ids = [777834135]

for Id in team_Ids:
    request_params = {
        'TIM': 1580219027557,
        'twSessionId': 'hkftajyxbl',
        'eventId': 5179344132,
        'teamId': Id}
        
    #job_descs = []
    #for i in range(NUM_PAGES):
        # Step 1, get the search page results
        #request_params.update({'start': i * 10})
    wrestling_response = requests.get(url=WRESTLING_URL + '/seasons' + '/EventMatches.jsp',
                                       params=request_params)
    
    if wrestling_response.status_code != 200:
            print('non-200 response for search page, skipping')
            continue
    
    wrestling_search_html = wrestling_response.text
    parsed_wrestling_searches = BeautifulSoup(wrestling_search_html, 'html.parser')
    
parsed_wrestling_searches

In [None]:
# All team_Id paramaters for in-page direct link URL (PSAL: Maspeth Mats)
#team_Ids = [1258447138, 194652138, 1258449138, 1258451138, 1258453138, 1258458138, 1258460138, 1258462138,
#            1258445138, 1258463138, 1258466138, 1258467138, 1258469138, 1258470138,1258473138,1258477138,
#            1258474138]

In [None]:
'''
Using in-page direct link URL and parameters:

./tw/seasons/LoadBalance.jsp?seasonId=1428400132&pageName=EventMatches.jsp&teamId=1258447138

Primary issue: this link results in redirect(?) to different URL and I don't know how to deal with that
'''
# FOR PSAL: Maspeth Mats

# Just first team for now
team_Ids = [1258447138]

for Id in team_Ids:
    request_params = {
        'seasonId': 1428400132,
        'pageName': 'EventMatches.jsp',
        'teamId': Id}
        
    #job_descs = []
    #for i in range(NUM_PAGES):
        # Step 1, get the search page results
        #request_params.update({'start': i * 10})
    wrestling_response = requests.get(url=WRESTLING_URL + '/tw' + '/seasons' + '/LoadBalance.jsp',
                                       params=request_params)
    
    if wrestling_response.status_code != 200:
            print('non-200 response for search page, skipping')
            continue
    
    wrestling_search_html = wrestling_response.text
    parsed_wrestling_searches = BeautifulSoup(wrestling_search_html, 'html.parser')
    
parsed_wrestling_searches