In [1]:
from bs4 import BeautifulSoup
import configparser
import requests
import pandas as pd
import pickle
from constants_funcs import get_session, int_or_float_or_str
import json
import numpy as np
import time
# Load project-wide constants; the "LLHEADER" entry is the base URL prefixed
# to every page request built further down in this notebook.
with open('constants.json', 'r') as d:
    constants = json.load(d)

In [2]:
import queue, time, urllib.request
from threading import Thread
def perform_web_requests(things, no_workers, ovr_results: list, error_results: list, func):
    """Run ``func`` over every item of ``things`` on a pool of worker threads.

    Args:
        things: iterable of work items; each one is passed to ``func``. The empty
            string is reserved as the shutdown sentinel and must not be a work item.
        no_workers: number of worker threads to start.
        ovr_results: caller-owned list; each successful result is appended to it
            as soon as it is produced (completion order, not input order).
        error_results: caller-owned list; each item whose call raised is appended.
        func: callable taking one work item and returning its result.

    Returns:
        ``(results, error_results)``: ``results`` is the concatenation of every
        worker's own result list (grouped by worker), ``error_results`` is the
        very list that was passed in.
    """
    from threading import Lock

    st = time.time()
    # Guards the shared success counter; ``+=`` on a class attribute is not atomic.
    progress_lock = Lock()

    class Worker(Thread):
        reqs_done = 0

        def __init__(self, request_queue):
            Thread.__init__(self)
            self.queue = request_queue
            self.results = []

        def run(self):
            while True:
                content = self.queue.get()
                try:
                    if content == "":
                        break
                    try:
                        resp = func(content)
                    except Exception:
                        # Best-effort: record the failing item and keep the worker alive.
                        # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
                        print(f'broke on {content}')
                        error_results.append(content)
                        continue
                    self.results.append(resp)
                    ovr_results.append(resp)
                    with progress_lock:
                        Worker.reqs_done += 1
                        done = Worker.reqs_done
                    # Report progress only after a success, so a failure while the
                    # counter sits at 0 (or any multiple of 1000) does not re-print it.
                    if done % 1000 == 0:
                        print(f'{done} complete')
                finally:
                    # Every get() is matched by a task_done(), sentinel included.
                    self.queue.task_done()

    # Create queue and add the work items
    q = queue.Queue()
    for thing in things:
        q.put(thing)

    # Workers keep working till they receive an empty string
    for _ in range(no_workers):
        q.put("")

    # Create workers and attach them to the queue
    workers = []
    for _ in range(no_workers):
        worker = Worker(q)
        worker.start()
        workers.append(worker)
    # Join workers to wait till they finished
    for worker in workers:
        worker.join()

    # Combine results from all workers
    r = []
    for worker in workers:
        r.extend(worker.results)
    print(f'time with {len(r)} requests, {no_workers} workers: {time.time()-st}, errors: {len(error_results)}')
    return r, error_results

In [3]:
# Shared HTTP session used for every page fetch in this notebook
# (created by the project helper get_session).
sess = get_session()
sess

using existing login session


<requests.sessions.Session at 0x7fc580529940>

In [4]:
# Load the cached JSON lookup tables this notebook works from.
# all_rundles_by_season is indexed below as [str(season)]['rundles'].
with open('rundles_by_season.json', 'r') as fp:
    all_rundles_by_season = json.load(fp)
# NOTE: this bare expression has no visible effect — only the last expression
# of a cell is displayed.
all_rundles_by_season
with open('run1/run1.1/player_profile_old_ids.json', 'r') as fp:
    pp = json.load(fp)
# player_profiles is looked up by player-id string in post_52.
with open('player_profiles.json', 'r') as fp:
    player_profiles = json.load(fp)
# One-off migration (kept for reference): copy 'name' into 'username' and re-save.
# for pl in player_profiles.keys():
#     if 'name' in player_profiles[pl].keys(): player_profiles[pl]['username'] = player_profiles[pl]['name']
# with open('player_profiles.json', 'w') as fp:
#     json.dump(player_profiles, fp)
# presumably the same profiles keyed by username — TODO confirm against the file.
with open('pp_by_username.json', 'r') as fp:
    by_username = json.load(fp)

In [5]:
def rundle_md_url(rundle: list, season: int, md: int):
    """Build the match-day results URL for one rundle.

    Three URL layouts are produced, selected by ``season``:
      * ``season <= 38``: one page per match day, independent of the rundle.
      * ``39..51``: one static page per rundle and match day; the ``results``
        suffix is dropped for season 51 from match day 7 onward.
      * ``> 51``: ``match.php`` query built from season, match day and rundle.

    Args:
        rundle: indexable as ``[0]``, ``[1]``, ``[2]``; ``[1] == 'League'`` means
            only ``[0]`` is used in the URL, ``[2] == 0`` means no ``_Div_`` part.
        season: season number.
        md: match-day number (zero-padded to two digits in the static layouts).
    """
    padded_day = str(md).zfill(2)
    root = constants["LLHEADER"]

    if season <= 38:
        return f'{root}/ll{season}/questions/md{padded_day}.shtml'

    if season <= 51:
        if rundle[1] == 'League':
            rundle_part = rundle[0]
        else:
            rundle_part = f'{rundle[0]}_{rundle[1]}'
        suffix = '' if (season == 51 and md >= 7) else 'results'
        page_name = rundle_part + suffix
        return f'{root}/ll{season}/questions/ll{season}md{padded_day}{page_name}.shtml'

    query_prefix = f'{root}/match.php?{season}&{md}&'
    if rundle[0] == 'Championship':
        return query_prefix + rundle[0]
    if rundle[2] == 0:
        return query_prefix + f'{rundle[0]}_{rundle[1]}'
    return query_prefix + f'{rundle[0]}_{rundle[1]}_Div_{rundle[2]}'
def format_match_result(p1_input, p2_input):
    """Turn two players' raw score entries into a pair of result codes.

    For each input, element ``[0]`` is taken as the score and element ``[2]``
    equal to ``'F'`` marks a forfeit (presumably strings shaped like ``'5(3)'``
    or ``'0(F)'`` — TODO confirm with the caller).

    Returns:
        ``(p1_result, p2_result)`` where each is ``'W'``, ``'L'``, ``'T'`` or ``'F'``.
        A forfeiting player gets ``'F'``; the opponent gets ``'W'`` unless they
        forfeited as well.
    """
    p1_score, p1_forfeit = p1_input[0], p1_input[2] == 'F'
    p2_score, p2_forfeit = p2_input[0], p2_input[2] == 'F'

    if p1_forfeit:
        return 'F', ('F' if p2_forfeit else 'W')
    if p2_forfeit:
        return 'W', 'F'
    if p1_score == p2_score:
        return 'T', 'T'
    if int(p1_score) > int(p2_score):
        return 'W', 'L'
    return 'L', 'W'
def build_base_player(id) -> dict:
    """Fetch the ``profiles.php`` page for player ``id`` and parse it.

    The request goes through the shared ``sess`` session; parsing is delegated
    to ``build_player_from_soup``. Any exception from either step propagates.
    """
    profile_url = f"{constants['LLHEADER']}/profiles.php?{id}"
    return build_player_from_soup(sess.get(profile_url))
def build_player_from_soup(data):
    """Parse a fetched profile page into a player-profile dict.

    Args:
        data: response object of a ``profiles.php?<id>`` request; only its
            ``url`` and ``text`` attributes are read.

    Returns:
        dict that always contains ``id``, ``deceased`` and ``current_status``.
        Pages flagged as inactive additionally get ``username``; all other pages
        get ``branch_id``, ``branch_name``, ``college``, ``gender``, ``location``,
        ``referrer``, ``name`` and ``league``. ``college``/``gender``/``location``
        are ``None`` and ``referrer`` is ``0`` when the page has no such label.
    """
    # Everything after the first '?' in the URL is the player identifier.
    player_id = int_or_float_or_str('?'.join(data.url.split('?')[1:]))
    soup = BeautifulSoup(data.text, 'html.parser')
    ret = {'id': player_id, 'deceased': False}

    page_text = soup.text
    inactive = 'This player is inactive.' in page_text
    no_account = 'This is not an active player account.' in page_text
    if inactive or no_account:
        ret['current_status'] = 'deactivated'
        # A "not an active player account" page carries no name element: fall back to the id.
        ret['username'] = ret['id'] if no_account else soup.find('div', attrs={'class': 'namediv'}).text.strip()
        return ret

    def labelled_row(label):
        """Return the element two levels above the ``label`` text node, or None if absent."""
        node = soup.find(text=label)
        return node.parent.parent if node else None

    box = soup.find('div', attrs={'class': 'topcont'})
    if '\t\tPassed away' in box.text:
        ret['current_status'] = 'deactivated'
        ret['deceased'] = True
    else:
        # The last CSS class of the header box is stored as the status.
        ret['current_status'] = box['class'][-1]

    branch_comp = soup.find_all('div', attrs={'class': 'demog_row'})[1]
    ret['branch_id'] = int(branch_comp.find('a')['href'].split('?')[-1])
    ret['branch_name'] = branch_comp.text.strip().split(': ')[1]

    college_row = labelled_row('College:')
    ret['college'] = college_row.contents[2].strip() if college_row is not None else None
    gender_row = labelled_row('Gender:')
    ret['gender'] = gender_row.text.split(':')[1].strip() if gender_row is not None else None
    location_row = labelled_row('Location:')
    ret['location'] = location_row.text.split(':')[1].strip() if location_row is not None else None
    referrer_row = labelled_row('Referrer:')
    ret['referrer'] = (int(referrer_row.find('a')['href'].split('?')[-1])
        if referrer_row is not None else 0)

    ret['name'] = soup.find('h1', attrs={'class': 'namecss'}).text
    ret['league'] = soup.find('div', attrs={'class': 'leaguelogodiv'}).text.strip()
    return ret
def build_inverse(profiles=None):
    """Build a reverse lookup from a profile's name/username to its profile key.

    The previous body could not run: it tested membership against the unbound
    ``.keys`` method (TypeError), indexed an empty dict, and returned nothing.

    NOTE(review): the intended shape of the mapping is inferred from the
    function name — confirm that ``label -> profile key`` is what callers want.

    Args:
        profiles: mapping of profile key -> profile dict. Defaults to the
            module-level ``player_profiles``.

    Returns:
        dict mapping each profile's ``'name'`` (preferred) or ``'username'`` to
        its key in ``profiles``. Profiles that have neither are skipped.
    """
    if profiles is None:
        profiles = player_profiles
    inverse = dict()
    for key, profile in profiles.items():
        label = profile['name'] if 'name' in profile else profile.get('username')
        if label is None:
            continue
        inverse[label] = key
    return inverse

In [8]:
# Smoke test: scrape a single profile (performs a live request) and display the parsed dict.
fr = build_base_player(2352)
fr

{'id': 2352,
 'deceased': False,
 'current_status': 'topcont',
 'branch_id': 2352,
 'branch_name': 'Friedman',
 'college': 'Univ. of Cincinnati, Miami Univ.',
 'gender': 'Male',
 'location': 'Cleveland, Ohio',
 'referrer': 21064,
 'name': 'Friedman',
 'league': 'Central'}

In [41]:
# helper functions: take in soup, return list of formatted matches.
def pre_38(soup, rundle, season, md):
    """Scrape every match of one rundle from a season <= 38 match-day page.

    Each listed match's own page is fetched (via the shared ``sess``) to read
    both players' per-question results.

    Args:
        soup: parsed match-day page; this rundle's section is located through
            its ``'Rundle ...'`` header text.
        rundle: rundle descriptor; ``[0]`` and ``[1]`` build the header text and
            the whole sequence is spread into every output row.
        season: season number.
        md: match-day number.

    Returns:
        ``(matches_history, match_participation, player_answer_history)``:
          * matches_history rows: ``[match_id, *rundle, season, md]`` for every
            listed match, including ones whose page failed to parse;
          * match_participation rows: ``[match_id, season, md, *rundle, player_id, result]``
            with result in ``'W'``/``'L'``/``'T'``/``'F'``;
          * player_answer_history rows:
            ``[match_id, *rundle, season, md, question_no, player_id, correct, defense]``.
        A match whose page cannot be parsed is reported on stdout and contributes
        no participation or answer rows.
    """
    # Up to season 33 (and for 'League' rundles) the header shows only the rundle letter.
    header_text = 'Rundle ' + (rundle[0] if rundle[1] == 'League' or season <= 33 else rundle[0] + ' ' + rundle[1])
    matches = soup.find(text=header_text).parent.next_element.next_element.find_all('div', {'class': 'sbGame'})
    match_participation = []
    matches_history = []
    player_answer_history = []
    for match in matches:
        match_id = match['onclick'].split('/')[-1].split('.')[0]
        matches_history.append([match_id, *rundle, season, md])
        current_match_url = match['onclick'].split("'")[1]
        # Stage this match's answer rows and commit them only after the whole
        # match parsed, so a failure on the second player cannot leave the first
        # player's rows behind without matching participation rows.
        staged_answers = []
        try:
            match_soup = BeautifulSoup(sess.get(constants['LLHEADER'] + current_match_url).text, 'html.parser')
            rows = match_soup.find('td').find('center').find('tr').find('table').find_all('tr')
            match_results = []
            for player in rows[1:3]:
                tds = player.find_all('td')
                player_id = tds[0].find('a')['href'].split('/')[-1].split('.')[0]
                # Cell 7 reads "<points>(<correct>)"; only the first character of each part is used.
                tot_pts, tot_correct = [int_or_float_or_str(part[0]) for part in tds[7].text.split('(')]
                fft = tot_correct == 'F'
                q_correct = [None]*6 if fft else [
                    td.find('div')['class'][0] == 'green' for td in tds[1:7]]
                # Season 36 pages may show 'w' in a defense cell; it is read as 1.
                def_given = [int('1' if td.text == 'w' and season == 36 else td.text) for td in tds[1:7]]
                if sum(def_given) == 0: # then you were forfeited against.
                    def_given = [None]*6
                match_results.append({'id': player_id, 'fft': fft, 'tot_pts': tot_pts})
                for qnum in range(6):
                    staged_answers.append([
                        match_id, *rundle, season, md, qnum+1,
                        player_id, q_correct[qnum], def_given[qnum]
                    ])
            p1, p2 = match_results  # raises (and is reported below) unless exactly two players were found
            if p1['fft']:
                p1_res = 'F'
                p2_res = 'F' if p2['fft'] else 'W'
            elif p2['fft']:
                p1_res, p2_res = 'W', 'F'
            elif p1['tot_pts'] == p2['tot_pts']:
                p1_res, p2_res = 'T', 'T'
            elif p1['tot_pts'] > p2['tot_pts']:
                p1_res, p2_res = 'W', 'L'
            else:
                p1_res, p2_res = 'L', 'W'
        except Exception as exc:
            # Best-effort scrape: report the match (with the reason) and move on.
            print(f'failed on LL{season}-{md} match {match_id}: {exc!r}')
            continue
        match_participation.append([
            match_id, season, md, *rundle,
            p1['id'], p1_res
        ])
        match_participation.append([
            match_id, season, md, *rundle,
            p2['id'], p2_res
        ])
        player_answer_history.extend(staged_answers)
    return matches_history, match_participation, player_answer_history
# for season in range(30, 39):
def main_pre_38(content):
    """Worker entry point for seasons <= 38.

    ``content`` is a dict with ``'rundle'``, ``'season'`` and ``'md'``. The
    match-day page is fetched through ``sess`` and handed to ``pre_38``; the
    three row lists it returns are packed into a dict keyed
    ``'matches_history'``, ``'match_participation'``, ``'player_answer_history'``.
    """
    rundle = content['rundle']
    season = content['season']
    md = content['md']
    page = sess.get(rundle_md_url(rundle, season, md))
    soup = BeautifulSoup(page.text, 'html.parser')
    history, participation, answers = pre_38(soup, rundle, season, md)
    return {
        'matches_history': history,
        'match_participation': participation,
        'player_answer_history': answers,
    }
# Smoke test on a single rundle / match day (performs live requests); the result is the cell output.
main_pre_38({'rundle': ['R', 'League', 0], 'season': 37, 'md': 1})

failed on LL37-1 match 38539601


{'matches_history': [['38539601', 'R', 'League', 0, 37, 1],
  ['39039901', 'R', 'League', 0, 37, 1],
  ['39139201', 'R', 'League', 0, 37, 1],
  ['38738801', 'R', 'League', 0, 37, 1],
  ['39839301', 'R', 'League', 0, 37, 1],
  ['38638401', 'R', 'League', 0, 37, 1],
  ['38939501', 'R', 'League', 0, 37, 1],
  ['39739401', 'R', 'League', 0, 37, 1]],
 'match_participation': [['39039901',
   37,
   1,
   'R',
   'League',
   0,
   'mendelsohnj',
   'L'],
  ['39039901', 37, 1, 'R', 'League', 0, 'lefskyd', 'W'],
  ['39139201', 37, 1, 'R', 'League', 0, 'murrells', 'L'],
  ['39139201', 37, 1, 'R', 'League', 0, 'pliskag', 'W'],
  ['38738801', 37, 1, 'R', 'League', 0, 'huntt', 'L'],
  ['38738801', 37, 1, 'R', 'League', 0, 'leer', 'W'],
  ['39839301', 37, 1, 'R', 'League', 0, 'burlinm2', 'W'],
  ['39839301', 37, 1, 'R', 'League', 0, 'pontiusk', 'L'],
  ['38638401', 37, 1, 'R', 'League', 0, 'glassmand', 'W'],
  ['38638401', 37, 1, 'R', 'League', 0, 'chengc', 'L'],
  ['38939501', 37, 1, 'R', 'League'

In [40]:
# Scrape every (match day, rundle) page of the listed seasons and save one JSON file per season.
for season in [37]:
    season_prep_data = []
    cur_rundles = all_rundles_by_season[str(season)]['rundles']
    ovr_results, err_results = [], []
    for md in range(1, 21):
        for rundle in cur_rundles:
            season_prep_data.append({'rundle': rundle, 'season': season, 'md': md})
    try:
        tot, err = perform_web_requests(season_prep_data, 8, ovr_results, err_results, main_pre_38)
        with open(f'./FULL_DATA/LL{season}/results.json', 'w') as fp:
            json.dump(tot, fp)
    except Exception as exc:
        # Report why the season failed instead of hiding it; Ctrl-C is no longer swallowed.
        print(f'season {season} failed: {exc!r}')

failed on LL37-1 match 38539601
failed on LL37-1 match 37935001
failed on LL37-6 match 37726706
failed on LL37-6 match 39939206
failed on LL37-6 match 39839706
failed on LL37-7 match 37734007
failed on LL37-6 match 37904506
failed on LL37-6 match 36909306
failed on LL37-6 match 38233606
failed on LL37-7 match 37408307
failed on LL37-7 match 39938707
time with 140 requests, 8 workers: 68.70735001564026, errors: 0


In [42]:
def from_39_to_51(soup, rundle, season, md):
    """Scrape every match of one rundle from a season 39-51 results page.

    Match links are read from the ``tblResults`` table; each match page is then
    fetched (via the shared ``sess``) to read both players' per-question results.

    Args:
        soup: parsed rundle/match-day results page.
        rundle: rundle descriptor, spread into every output row.
        season: season number.
        md: match-day number.

    Returns:
        ``(matches_history, match_participation, player_answer_history)``:
          * matches_history rows: ``[match_id, *rundle, season, md]`` for every
            listed match, including ones whose page failed to parse;
          * match_participation rows: ``[match_id, season, md, *rundle, player_id, result]``
            with result in ``'W'``/``'L'``/``'T'``/``'F'``;
          * player_answer_history rows:
            ``[match_id, *rundle, season, md, question_no, player_id, correct, defense]``.
        A match whose page cannot be parsed is reported on stdout and contributes
        no participation or answer rows.
    """
    result_rows = soup.find('table', {'class': 'tblResults'}).find_all('tr')
    # The match link sits in the last child of each results row.
    match_urls = [tr.contents[-1].find('a')['href'] for tr in result_rows]
    match_participation = []
    matches_history = []
    player_answer_history = []
    for url in match_urls:
        match_id = url.split('/')[-1].split('.')[0]
        matches_history.append([match_id, *rundle, season, md])
        # Stage this match's answer rows and commit them only after the whole
        # match parsed, so a failure on the second player cannot leave the first
        # player's rows behind without matching participation rows.
        staged_answers = []
        try:
            match_soup = BeautifulSoup(sess.get(constants['LLHEADER'] + url).text, 'html.parser')
            rows = match_soup.find('td').find('center').find('tr').find('table').find_all('tr')
            match_results = []
            for player in rows[1:3]:
                tds = player.find_all('td')
                player_id = tds[0].find('a')['href'].split('/')[-1].split('.')[0]
                # Cell 7 reads "<points>(<correct>)"; only the first character of each part is used.
                tot_pts, tot_correct = [int_or_float_or_str(part[0]) for part in tds[7].text.split('(')]
                fft = tot_correct == 'F'
                # Correctness is flagged either on the cell's own class or on its inner div's class.
                q_correct = [None]*6 if fft else [
                    'Green' in td['class'][0] or 'green' in td.find('div')['class'][0]
                    for td in tds[1:7]]
                # Non-numeric defense cells count as 0.
                def_given = [int(td.text if str(td.text).isnumeric() else 0) for td in tds[1:7]]
                if sum(def_given) == 0: # then you were forfeited against.
                    def_given = [None]*6
                match_results.append({'id': player_id, 'fft': fft, 'tot_pts': tot_pts})
                for qnum in range(6):
                    staged_answers.append([
                        match_id, *rundle, season, md, qnum+1,
                        player_id, q_correct[qnum], def_given[qnum]
                    ])
            p1, p2 = match_results  # raises (and is reported below) unless exactly two players were found
            if p1['fft']:
                p1_res = 'F'
                p2_res = 'F' if p2['fft'] else 'W'
            elif p2['fft']:
                p1_res, p2_res = 'W', 'F'
            elif p1['tot_pts'] == p2['tot_pts']:
                p1_res, p2_res = 'T', 'T'
            elif p1['tot_pts'] > p2['tot_pts']:
                p1_res, p2_res = 'W', 'L'
            else:
                p1_res, p2_res = 'L', 'W'
        except Exception as exc:
            # Best-effort scrape: report the match (with the reason) and move on.
            print(f'failed on LL{season}-{md} match {match_id}: {exc!r}')
            continue
        match_participation.append([
            match_id, season, md, *rundle,
            p1['id'], p1_res
        ])
        match_participation.append([
            match_id, season, md, *rundle,
            p2['id'], p2_res
        ])
        player_answer_history.extend(staged_answers)
    return matches_history, match_participation, player_answer_history

def main_39_51(content):
    """Worker entry point for seasons 39-51 (the pages ``from_39_to_51`` parses).

    ``content`` is a dict with ``'rundle'``, ``'season'`` and ``'md'``. The
    results page is fetched through ``sess`` and parsed by ``from_39_to_51``;
    its three row lists are returned in a dict keyed ``'matches_history'``,
    ``'match_participation'``, ``'player_answer_history'``.
    """
    rundle = content['rundle']
    season = content['season']
    md = content['md']
    page = sess.get(rundle_md_url(rundle, season, md))
    soup = BeautifulSoup(page.text, 'html.parser')
    history, participation, answers = from_39_to_51(soup, rundle, season, md)
    return {
        'matches_history': history,
        'match_participation': participation,
        'player_answer_history': answers,
    }
# Smoke test on a single rundle / match day (performs live requests); the result is the cell output.
main_39_51({'rundle': ['R', 'League', 0], 'season': 40, 'md': 1})

{'matches_history': [['44847201', 'R', 'League', 0, 40, 1],
  ['46045701', 'R', 'League', 0, 40, 1],
  ['46847801', 'R', 'League', 0, 40, 1],
  ['46945901', 'R', 'League', 0, 40, 1],
  ['46147501', 'R', 'League', 0, 40, 1],
  ['45646201', 'R', 'League', 0, 40, 1],
  ['47646601', 'R', 'League', 0, 40, 1],
  ['47146701', 'R', 'League', 0, 40, 1],
  ['47045401', 'R', 'League', 0, 40, 1]],
 'match_participation': [['44847201', 40, 1, 'R', 'League', 0, 'berlinb', 'L'],
  ['44847201', 40, 1, 'R', 'League', 0, 'sandozm', 'W'],
  ['46045701', 40, 1, 'R', 'League', 0, 'gwynne', 'L'],
  ['46045701', 40, 1, 'R', 'League', 0, 'dobsont', 'W'],
  ['46847801', 40, 1, 'R', 'League', 0, 'murphym2', 'W'],
  ['46847801', 40, 1, 'R', 'League', 0, 'weitzmanm', 'F'],
  ['46945901', 40, 1, 'R', 'League', 0, 'naliboffl', 'L'],
  ['46945901', 40, 1, 'R', 'League', 0, 'fogertya', 'W'],
  ['46147501', 40, 1, 'R', 'League', 0, 'jastrzebskis', 'W'],
  ['46147501', 40, 1, 'R', 'League', 0, 'swaminathann', 'L'],
  [

In [121]:
# Scrape seasons 49-51 with the 39-51 page parser and save one JSON file per season.
for season in range(49, 52):
    season_prep_data = []
    cur_rundles = all_rundles_by_season[str(season)]['rundles']
    ovr_results, err_results = [], []
    # Season 51 is only scraped up to match day 6 here; match days 7-25 go
    # through main_special_51 in a later cell.
    for md in range(1, 26 if season < 51 else 7):
        for rundle in cur_rundles:
            season_prep_data.append({'rundle': rundle, 'season': season, 'md': md})
    try:
        # perform_web_requests takes the error list as its 4th argument and
        # returns (results, errors); only the results are written to disk.
        tot, err = perform_web_requests(season_prep_data, 8, ovr_results, err_results, main_39_51)
        with open(f'./FULL_DATA/LL{season}/results.json', 'w') as fp:
            json.dump(tot, fp)
    except Exception as exc:
        # Report why the season failed instead of hiding it; Ctrl-C is no longer swallowed.
        print(f'season {season} failed: {exc!r}')

time with 525 requests, 8 workers: 603.8545551300049
time with 525 requests, 8 workers: 715.4193091392517
time with 168 requests, 8 workers: 203.6491141319275


In [169]:
def special_51(soup: BeautifulSoup, rundle, season, md):
    """Extract match, participation and answer rows from one match-day page.

    Unlike the per-match scrapers, everything is read from the single page in
    ``soup``: per-player question cells come from the ``sortable`` table and
    pairings from the ``tblResults2`` table. Used by ``main_special_51``.

    Returns:
        ``(matches_history, match_participation, player_answer_history)`` with
        the same row layouts the other season scrapers produce.
    """
    # NOTE(review): `matches` is the table element itself, so len() and the
    # loop below go over its direct children rather than an explicit
    # find_all('tr') list (post_52 uses find_all) — confirm this is intended.
    matches = soup.find('table', {'class': 'tblResults2'})
    # Two player rows are taken from the question table per entry in `matches`.
    to_take = len(matches)*2
    full_questions = soup.find('table', {'class': 'sortable'}).find('tbody').find_all('tr')[:to_take]
    match_participation = []
    matches_history = []
    player_answer_history = []
    # Per-player parsed row, keyed by the underscore-joined name shown in column 7.
    temp_players = dict()
    for player_row in full_questions:
        tds = player_row.find_all('td')
        q_tds = tds[:6]
        player_username = '_'.join(tds[7].text.strip().split())
        # Forfeits / correct answers are detected from the cell's inline style
        # ('Black' in the first cell = forfeit, 'Green' = correct).
        fft = 'Black' in q_tds[0]['style']
        q_correct = [None]*6 if fft else list(map(lambda q: 'Green' in q['style'], q_tds))
        # Defense value per question; None when the cell text is not numeric.
        def_given = list(map(lambda q:
            int(q.text) if q.text.isnumeric() else None, q_tds))
        tot_correct, tot_pts = 'F', 0
        if not fft:
            tot_correct = sum(q_correct)
            if def_given[0] is not None: tot_pts = sum(np.multiply(q_correct, def_given))
            # No defense values on the page: approximate points as 1.5 per correct answer.
            else: tot_pts = round(1.5*tot_correct)
        temp_players[player_username] = {'id': player_username,
            'fft': fft, 'q_correct': q_correct, 'def_given': def_given,
            'tot_correct': tot_correct, 'tot_pts': tot_pts}

    for match in matches:
        # Children 1 and 3 hold the two player names, child 2 the match link.
        p1_id, p2_id = '_'.join(
            match.contents[1].text.strip().split()), '_'.join(
            match.contents[3].text.strip().split())
        match_id = match.contents[2].find('a')['href'].split('/')[-1].split('.')[0]
        matches_history.append([match_id, *rundle, season, md])
        # Raises KeyError if a paired name has no row among the parsed question rows.
        p1, p2 = temp_players[p1_id], temp_players[p2_id]
        # Result codes: F = forfeit, W/L = win/loss on total points, T = tie.
        if p1['fft']:
            p1['result'] = 'F'
            p2['result'] = 'F' if p2['fft'] else 'W'
        elif p2['fft']: p1['result'], p2['result'] = 'W', 'F'
        elif p1['tot_pts'] == p2['tot_pts']: p1['result'], p2['result'] = 'T', 'T'
        elif p1['tot_pts'] > p2['tot_pts']: p1['result'], p2['result'] = 'W', 'L'
        else: p1['result'], p2['result'] = 'L', 'W'
        match_participation.append([
            match_id, season, md, *rundle,
            p1['id'], p1['result']
        ])
        match_participation.append([
            match_id, season, md, *rundle,
            p2['id'], p2['result']
        ])
        # One answer row per question and player, interleaved p1/p2.
        for qnum in range(6):
            player_answer_history.append([
                match_id, *rundle, season, md, qnum+1,
                p1['id'], p1['q_correct'][qnum], p1['def_given'][qnum]
            ])
            player_answer_history.append([
                match_id, *rundle, season, md, qnum+1,
                p2['id'], p2['q_correct'][qnum], p2['def_given'][qnum]
            ])
    return matches_history, match_participation, player_answer_history

def main_special_51(content):
    """Worker entry point for the pages ``special_51`` parses.

    ``content`` is a dict with ``'rundle'``, ``'season'`` and ``'md'``. The
    match-day page is fetched through ``sess`` and parsed by ``special_51``;
    its three row lists are returned in a dict keyed ``'matches_history'``,
    ``'match_participation'``, ``'player_answer_history'``.
    """
    rundle = content['rundle']
    season = content['season']
    md = content['md']
    page = sess.get(rundle_md_url(rundle, season, md))
    soup = BeautifulSoup(page.text, 'html.parser')
    history, participation, answers = special_51(soup, rundle, season, md)
    return {
        'matches_history': history,
        'match_participation': participation,
        'player_answer_history': answers,
    }
# main_special_51({'rundle': ['E', 'Northeast', 0], 'season': 51, 'md': 7})

In [167]:
# remainder of season 51 (match days 7-25, parsed by special_51)
# NOTE: this cell appends to the existing results file, so running it twice
# duplicates the appended entries.
season = 51  # set explicitly instead of relying on a loop variable left over from an earlier cell
with open('./FULL_DATA/LL51/results.json', 'r') as fp:
    cur_51 = json.load(fp)
print(len(cur_51))
season_prep_data = []
cur_rundles = all_rundles_by_season['51']['rundles']
ovr_results, err_results = [], []
for md in range(7, 26):
    for rundle in cur_rundles:
        season_prep_data.append({'rundle': rundle, 'season': season, 'md': md})
try:
    # perform_web_requests takes the error list as its 4th argument and returns
    # (results, errors); only the results are appended to the season file.
    tot, err = perform_web_requests(season_prep_data, 8, ovr_results, err_results, main_special_51)
    cur_51.extend(tot)
    with open('./FULL_DATA/LL51/results.json', 'w') as fp:
        json.dump(cur_51, fp)
except Exception as exc:
    # Report why the run failed instead of hiding it; Ctrl-C is no longer swallowed.
    print(f'season 51 failed: {exc!r}')

168
broke on {'rundle': ['C', 'Coastal', 0], 'season': 51, 'md': 17}
time with 531 requests, 8 workers: 32.81769013404846


In [16]:
# (Re)load the id-alias table and the cached player profiles.
# post_52 adds missing entries to player_profiles while it runs.
with open('alt_ids.json', 'r') as rp:
    change_id = json.load(rp)
with open('player_profiles.json', 'r') as fp:
    player_profiles = json.load(fp)
def post_52(soup: BeautifulSoup, rundle, season, md):
    """Extract match, participation and answer rows from a ``match.php`` match-day page.

    Per-player question cells come from the ``sortable`` table and pairings
    from the ``gamelinetbl`` table. Players are keyed by the id taken from
    their profile link; ids missing from the module-level ``player_profiles``
    are fetched with ``build_base_player`` and added to it.

    NOTE(review): ``player_profiles`` is mutated here while this function runs
    on several worker threads — confirm that is acceptable.

    Args:
        soup: parsed match-day page.
        rundle: ``[letter, name, division]``-style descriptor; spread into every row.
        season: season number.
        md: match-day number.

    Returns:
        ``(matches_history, match_participation, player_answer_history)``.
        Players with a question row but no pairing get a participation row with
        match id ``None`` and result ``'U'``, plus their answer rows.
    """
    global player_profiles
    gameline_table = soup.find('table', {'class': 'gamelinetbl'})
    matches = gameline_table.find_all('tr')
    is_jacobs_s = 'Jacobs S.' in gameline_table.text
    # Hand-maintained list of rundles/match days whose pairing table lacks one match.
    missing_one_match = (
        season == 75 and rundle[0] == 'E' and rundle[1] == 'Central' and rundle[2] == 2 and (md < 10 or md > 23)) or (
        season == 76 and rundle[0] == 'C' and rundle[1] == 'Citadel' and rundle[2] == 2) or (
        season == 75 and rundle[0] == 'D' and rundle[1] == 'Maelstrom' and rundle[2] == 1 and md in [8, 9, 10, 24, 25]
        )
    # Two question rows per match, minus one when a 'Jacobs S.' entry is on the page.
    to_take = (len(matches) + int(missing_one_match))*2 - int(is_jacobs_s)
    question_rows = soup.find('table', {'class': 'sortable'}).find('tbody').find_all('tr')
    # NOTE(review): as written this takes every row for ALL of season 73 and for
    # match day 23 of EVERY season; `not (season == 73 and md == 23)` may have
    # been intended — confirm before changing.
    if to_take > 0 and season != 73 and md != 23:
        full_questions = question_rows[:to_take]
    else:
        full_questions = question_rows
    match_participation = []
    matches_history = []
    player_answer_history = []
    temp_players = dict()
    for player_row in full_questions:
        tds = player_row.find_all('td')
        q_tds = tds[:6]
        player_temp_id = str(tds[7].find('a')['href'].split('?')[-1]) # just using the player ID, not username.
        # The first CSS class of a question cell encodes the outcome: an 'F'
        # marks a forfeit, otherwise its second character is 0/1 for wrong/right.
        fft = 'F' in q_tds[0]['class'][0]
        q_correct = [None]*6 if fft else list(map(
            lambda q: False if (
                q['class'][0][1] == 'F' and season == 91 and md == 18 and rundle[1] in ['Glacier', 'Ravenna']
                ) else bool(int(q['class'][0][1])), q_tds))
        # Defense value per question: numeric text as-is, '' or 'o' as 0, a
        # backtick as 2 in seasons 62-65, anything else None.
        def_given = list(map(lambda q:
            int(q.text) if q.text.isnumeric() else (
                0 if q.text.lower() in ['', 'o'] else (
                2 if q.text == '`' and season in [62, 63, 64, 65] else None)
            ), q_tds))
        tot_correct, tot_pts = 'F', 0
        if not fft:
            tot_correct = sum(q_correct)
            if def_given[0] is not None: tot_pts = sum(np.multiply(q_correct, def_given))
            # No defense values on the page: approximate points as 1.5 per correct answer.
            else: tot_pts = round(1.5*tot_correct)
        if player_temp_id not in player_profiles.keys():
            player_profiles[player_temp_id] = build_base_player(player_temp_id)
            build_inverse()
        this_pp = player_profiles[player_temp_id]
        player_temp_username = str(this_pp['id'])
        # Disambiguation of players whose usernames collide across rundles.
        if player_temp_username.lower() == 'o\'brienm': player_temp_username = '_'.join(player_temp_username.replace("'", '_').split()).lower()
        elif player_temp_username.lower() == 'o\'brienp' and rundle[1] == 'Sugarloaf': player_temp_username = 'o_brienp'
        elif player_temp_username.lower() == 'o\'neillj' and rundle[1] not in ['Central', 'Seneca']: player_temp_username = 'o_neillj'
        elif player_temp_username.lower() == 'obrienj' and rundle[1] == 'Xanadu': player_temp_username = '45499'
        elif player_temp_username.lower() == 'oconnors' and rundle[1] not in ['Juniper', 'Magnolia']: player_temp_username = '34162'
        else: player_temp_username = '_'.join(player_temp_username.replace("'", '').split()).lower()
        temp_players[player_temp_username] = {'key': player_temp_username,
            'fft': fft, 'q_correct': q_correct, 'def_given': def_given,
            'tot_correct': tot_correct, 'tot_pts': tot_pts}

    for match in matches:
        # Player keys are whatever follows the first '?' in each profile link.
        p1_key = '_'.join('?'.join(str(match.contents[1].find('a')['href']).split('?')[1:]).lower().split()).strip()
        p2_key = '_'.join('?'.join(str(match.contents[-2].find('a')['href']).split('?')[1:]).lower().split()).strip()
        if rundle[1] == 'Skyline':
            if p1_key == 'obrienm': p1_key = "o_brienm"
            elif p2_key == 'obrienm': p2_key = "o_brienm"
        if 'jacobs_s.' in [p1_key, p2_key]: # these matches just don't exist for whatever reason. Jacobs S. is an enigma
            continue
        elif season == 75 and 'oneillj' in [p1_key, p2_key]: # these also dont work
            continue
        p1_key, p2_key = p1_key.replace("'", ''), p2_key.replace("'", '')
        # The remaps below mirror the per-rundle disambiguation applied to
        # temp_players keys in the player-row loop above.
        if p1_key == 'oneillj' and rundle[1] not in ['Central', 'Seneca']: p1_key = 'o_neillj'
        elif p2_key == 'oneillj' and rundle[1] not in ['Central', 'Seneca']: p2_key = 'o_neillj'
        if p1_key == 'obrienp' and rundle[1] == 'Sugarloaf': p1_key = 'o_brienp'
        elif p2_key == 'obrienp' and rundle[1] == 'Sugarloaf': p2_key = 'o_brienp'
        if p1_key == 'oconnors' and rundle[1] not in ['Juniper', 'Magnolia']: p1_key = '34162'
        elif p2_key == 'oconnors' and rundle[1] not in ['Juniper', 'Magnolia']: p2_key = '34162'
        # Fixed: apostrophes were already stripped above, so the old comparison
        # against "o'brienj" could never match, and its second branch assigned
        # p1_key instead of p2_key.
        if p1_key == 'obrienj' and rundle[1] == 'Xanadu': p1_key = '45499'
        elif p2_key == 'obrienj' and rundle[1] == 'Xanadu': p2_key = '45499'
        match_id = match.contents[5].find('a')['href'].split('.php?id=')[1]
        matches_history.append([match_id, *rundle, season, md])
        p1_key, p2_key = p1_key.replace("'", ''), p2_key.replace("'", '')
        # Raises KeyError if a paired key has no parsed question row.
        p1, p2 = temp_players[p1_key], temp_players[p2_key]
        # there seem to have been actually 2-3 oconnors's.
        p1['id'], p2['id'] = p1_key.replace("'", ''), p2_key.replace("'", '')
        # Result codes: F = forfeit, W/L = win/loss on total points, T = tie.
        if p1['fft']:
            p1['result'] = 'F'
            p2['result'] = 'F' if p2['fft'] else 'W'
        elif p2['fft']: p1['result'], p2['result'] = 'W', 'F'
        elif p1['tot_pts'] == p2['tot_pts']: p1['result'], p2['result'] = 'T', 'T'
        elif p1['tot_pts'] > p2['tot_pts']: p1['result'], p2['result'] = 'W', 'L'
        else: p1['result'], p2['result'] = 'L', 'W'
        match_participation.append([
            match_id, season, md, *rundle,
            p1['id'], p1['result']
        ])
        match_participation.append([
            match_id, season, md, *rundle,
            p2['id'], p2['result']
        ])
        for qnum in range(6):
            player_answer_history.append([
                match_id, *rundle, season, md, qnum+1,
                p1['id'], p1['q_correct'][qnum], p1['def_given'][qnum]
            ])
            player_answer_history.append([
                match_id, *rundle, season, md, qnum+1,
                p2['id'], p2['q_correct'][qnum], p2['def_given'][qnum]
            ])
    # Players with a question row but no pairing: keep their answers under an
    # unknown ('U') result with no match id. Index 6 of a participation row is the player id.
    matches_not_found = set(temp_players.keys()).difference(set(map(lambda m: str(m[6]).lower(), match_participation)))
    for pl in matches_not_found:
        match_participation.append([
            None, season, md, *rundle,
            pl, 'U'
        ])
        for qnum in range(6):
            player_answer_history.append([
                None, *rundle, season, md, qnum+1,
                pl, temp_players[pl]['q_correct'][qnum], temp_players[pl]['def_given'][qnum]
            ])
    return matches_history, match_participation, player_answer_history

def main_post_52(content):
    """Download one rundle/match-day page and parse it with ``post_52``.

    Args:
        content: mapping with the keys ``'rundle'``, ``'season'`` and ``'md'``;
            the three values are forwarded unchanged to ``rundle_md_url`` and
            ``post_52``.

    Returns:
        dict with the keys ``'matches_history'``, ``'match_participation'`` and
        ``'player_answer_history'``, holding the three values returned by
        ``post_52`` in that order.
    """
    rundle = content['rundle']
    season = content['season']
    md = content['md']
    # Fetch the page through the shared session and hand the parsed tree to the parser.
    response = sess.get(rundle_md_url(rundle, season, md))
    page = BeautifulSoup(response.text, 'html.parser')
    matches_history, match_participation, player_answer_history = post_52(page, rundle, season, md)
    return {
        'matches_history': matches_history,
        'match_participation': match_participation,
        'player_answer_history': player_answer_history,
    }
# Smoke test on a single rundle/match day; this goes through `sess.get`, so it
# needs the session and helper functions from the earlier cells to be defined.
main_post_52({'rundle': ['D', 'Ravenna', 1], 'season': 91, 'md': 18})

{'matches_history': [['5302346', 'D', 'Ravenna', 1, 91, 18],
  ['5302347', 'D', 'Ravenna', 1, 91, 18],
  ['5302348', 'D', 'Ravenna', 1, 91, 18],
  ['5302349', 'D', 'Ravenna', 1, 91, 18],
  ['5302350', 'D', 'Ravenna', 1, 91, 18],
  ['5302351', 'D', 'Ravenna', 1, 91, 18],
  ['5302352', 'D', 'Ravenna', 1, 91, 18],
  ['5302353', 'D', 'Ravenna', 1, 91, 18],
  ['5302354', 'D', 'Ravenna', 1, 91, 18],
  ['5302355', 'D', 'Ravenna', 1, 91, 18],
  ['5302356', 'D', 'Ravenna', 1, 91, 18],
  ['5302357', 'D', 'Ravenna', 1, 91, 18],
  ['5302358', 'D', 'Ravenna', 1, 91, 18],
  ['5302359', 'D', 'Ravenna', 1, 91, 18],
  ['5302360', 'D', 'Ravenna', 1, 91, 18],
  ['5302361', 'D', 'Ravenna', 1, 91, 18],
  ['5302362', 'D', 'Ravenna', 1, 91, 18]],
 'match_participation': [['5302346', 91, 18, 'D', 'Ravenna', 1, '60038', 'T'],
  ['5302346', 91, 18, 'D', 'Ravenna', 1, '25795', 'T'],
  ['5302347', 91, 18, 'D', 'Ravenna', 1, '15084', 'L'],
  ['5302347', 91, 18, 'D', 'Ravenna', 1, '50864', 'W'],
  ['5302348', 91, 1

In [12]:
# Scrape every rundle and match day for seasons 91-93 and save per-season results.
# Workflow notes from the original runs:
#   * first run-through: do all R rundles only (commented-out filter below) to
#     find names to add to the id-change dict;
#   * then do all non-R rundles for match day 1 only (commented-out `md` range
#     below) to find other rundle errors.
with open('alt_ids.json', 'r') as rp:
    change_id = json.load(rp)
for season in range(91, 94):
    season_prep_data = []
    cur_rundles = all_rundles_by_season[str(season)]['rundles']
    # cur_rundles = list(filter(lambda rundle: rundle[0] == 'R',
    #     all_rundles_by_season[str(season)]['rundles']))
    ovr_results = []
    error_results = []
    # One work item per (match day, rundle) pair for this season.
    for md in range(1, 26):
    # for md in range(1, 2):
        for rundle in cur_rundles:
            season_prep_data.append({'rundle': rundle, 'season': season, 'md': md})
    try:
        tot, err = perform_web_requests(season_prep_data, 8, ovr_results, error_results, main_post_52)
        with open(f'./FULL_DATA/LL{season}/results.json', 'w') as fp:
            json.dump(tot, fp)
        with open(f'./FULL_DATA/LL{season}/rundle_errors.json', 'w') as fp:
            json.dump(err, fp, indent=4)
        # `player_profiles` and `by_username` come from outside this cell. Each
        # is written and immediately re-read, so the in-memory copy is whatever
        # a JSON round trip produces (presumably to normalise it -- confirm).
        with open('player_profiles.json', 'w') as fp:
            json.dump(player_profiles, fp)
        with open('player_profiles.json', 'r') as fp:
            player_profiles = json.load(fp)
        with open('pp_by_username.json', 'w') as fp:
            json.dump(by_username, fp)
        with open('pp_by_username.json', 'r') as fp:
            by_username = json.load(fp)
        print(f'successfully completed season {season}')
    except Exception as exc:
        # `Exception` rather than a bare `except:` so that interrupting the
        # kernel still stops the run, and the cause of the failure is reported
        # instead of being discarded.
        print(f'season {season} failed: {exc!r}')
    # Persist the id-change dict after every season, even a failed one.
    with open('alt_ids.json', 'w') as wp:
        json.dump(change_id, wp)
    with open('alt_ids.json', 'r') as rp:
        change_id = json.load(rp)

broke on {'rundle': ['D', 'Veldt', 2], 'season': 91, 'md': 1}
1000 complete
broke on {'rundle': ['C', 'Sierra', 2], 'season': 91, 'md': 2}
broke on {'rundle': ['D', 'Sierra', 1], 'season': 91, 'md': 2}
2000 complete
broke on {'rundle': ['C', 'Laguna', 2], 'season': 91, 'md': 3}
broke on {'rundle': ['C', 'Laguna', 1], 'season': 91, 'md': 3}
broke on {'rundle': ['E', 'Keystone', 2], 'season': 91, 'md': 3}
broke on {'rundle': ['B', 'Laguna', 0], 'season': 91, 'md': 3}
broke on {'rundle': ['A', 'Laguna', 0], 'season': 91, 'md': 3}
broke on {'rundle': ['B', 'Xanadu', 0], 'season': 91, 'md': 3}
broke on {'rundle': ['A', 'Xanadu', 0], 'season': 91, 'md': 3}
broke on {'rundle': ['E', 'Woodlands', 1], 'season': 91, 'md': 3}
3000 complete
broke on {'rundle': ['E', 'Tidewater', 1], 'season': 91, 'md': 4}
broke on {'rundle': ['D', 'Mesa', 1], 'season': 91, 'md': 5}
4000 complete
broke on {'rundle': ['C', 'Blue', 2], 'season': 91, 'md': 6}
5000 complete
6000 complete
broke on {'rundle': ['E', 'Pali

In [17]:
# Re-run the work items that failed in the season scrape, merge whatever now
# succeeds into the saved results, and overwrite the error file with the items
# that are still failing.
for season in range(91, 92):
    results_path = f'./FULL_DATA/LL{season}/results.json'
    errors_path = f'./FULL_DATA/LL{season}/rundle_errors.json'

    with open(results_path, 'r') as fp:
        seas_results = json.load(fp)
    with open(errors_path, 'r') as fp:
        seas_errs = json.load(fp)
    print(len(seas_results), len(seas_errs))

    # Fresh accumulator lists: only the return values are used here.
    new_tot, new_err = perform_web_requests(seas_errs, 8, [], [], main_post_52)
    seas_results.extend(new_tot)
    print(len(seas_results))

    with open(results_path, 'w') as fp:
        json.dump(seas_results, fp)
    with open(errors_path, 'w') as fp:
        json.dump(new_err, fp)

21598 2
time with 2 requests, 8 workers: 0.651007890701294, errors: 0
21600


In [21]:
# Ad-hoc inspection of the failed work items. `err` only exists once the
# season-scraping cell's `tot, err = perform_web_requests(...)` has run in the
# current kernel; the saved output of this cell is a NameError.
err

NameError: name 'err' is not defined

In [52]:
def _resolve_profile_id(player: dict, season: int, pp: dict) -> None:
    """Replace a name-style ``player['id']`` with the id found via its profile URL.

    ``pp`` is used as a cache keyed by the name-style id. On a hit the cached
    id is reused. On a miss the profile page is requested through ``sess`` and
    whatever follows the last ``?`` in the response's ``url`` becomes the new
    id, which is then stored in ``pp``.
    """
    name_id = player['id']
    if name_id in pp:
        # A cache hit used to leave the name-style id in place, so the same
        # player could be recorded under two different ids.
        player['id'] = pp[name_id]
        return
    player['old_id'] = name_id
    # Seasons up to 51 use /profiles/<name>.shtml; later ones /profiles.php?<name>.
    profile_path = f'/profiles/{name_id}.shtml' if season <= 51 else f'/profiles.php?{name_id}'
    final_url = sess.get(constants["LLHEADER"] + profile_path).url
    player['id'] = int_or_float_or_str(final_url.split('?')[-1])
    pp[name_id] = player['id']


def get_rundle_matchday(rundle: list, season: int, md: int, match_history: list = None,
    player_rundle_history: list = None, match_participation: list = None, pp: dict = None):
    """Scrape one rundle/match-day page and append its matches to the accumulators.

    Args:
        rundle: rundle descriptor; elements 0, 1 and 2 are copied into each
            ``player_rundle_history`` row and ``rundle[0] == 'R'`` triggers
            profile-id resolution on match day 1.
        season: season number; selects which page layout/parser is used.
        md: match-day number.
        match_history: accepted for backward compatibility; not used.
        player_rundle_history: list extended (on match day 1 only) with
            ``[player_id, season, md, rundle[0], rundle[1], rundle[2]]`` rows.
        match_participation: list extended with ``[match_id, player_id, result]``
            rows, two per match.
        pp: dict mapping old ids to resolved ids; doubles as a lookup cache.

    Returns:
        ``(player_rundle_history, match_participation, pp)``.

    NOTE(review): the per-row calls below pass a single ``match`` argument to
    ``pre_38`` / ``special_51`` / ``classic_39_51`` / ``post_52``, whereas
    ``pre_38`` is also called here with four arguments and the ``post_52`` used
    by ``main_post_52`` takes ``(soup, rundle, season, md)``. Confirm which
    versions of these helpers this function expects.
    """
    # Create fresh containers per call. The previous `[]` / `dict()` defaults
    # were evaluated once and silently shared between unrelated calls.
    if player_rundle_history is None:
        player_rundle_history = []
    if match_participation is None:
        match_participation = []
    if pp is None:
        pp = {}

    url = rundle_md_url(rundle, season, md)
    soup = BeautifulSoup(sess.get(url).text, 'html.parser')

    # Pick the container of match rows according to the season's page layout.
    if season <= 38:  # LL30-38
        matches = pre_38(soup, rundle, season, md)
    elif season == 51 and md >= 7:  # LL51 uses a different table class from match day 7 on
        matches = soup.find('table', {'class': 'tblResults2'}).find_all('tr')
    elif season <= 51:  # LL39-51
        matches = soup.find('table', {'class': 'tblResults'}).find_all('tr')
    else:  # LL52-present
        matches = soup.find('table', {'class': 'gamelinetbl'}).find_all('tr')

    for match in matches:
        if season <= 38:
            match_id, p1, p2 = pre_38(match)
        elif season == 51 and md >= 7:
            match_id, p1, p2 = special_51(match)
        elif season <= 51:
            match_id, p1, p2 = classic_39_51(match)
        else:
            match_id, p1, p2 = post_52(match)

        # `== False` (not `is False`) is kept so falsy-equal values behave as before.
        if p1['input'] == False:
            p1['input'], p2['input'] = fucky(soup, str(p1['id']), str(p2['id']))
        p1_result, p2_result = format_match_result(p1['input'], p2['input'])

        if str(p1['id']).isnumeric():
            # Numeric ids map to themselves in `pp`.
            p1['id'], p2['id'] = int(p1['id']), int(p2['id'])
            p1['old_id'], p2['old_id'] = p1['id'], p2['id']
            pp.setdefault(p1['id'], p1['id'])
            pp.setdefault(p2['id'], p2['id'])

        if md == 1:
            if (season == 30 or rundle[0] == 'R') and not str(p1['id']).isnumeric():
                # Normalise both names first (whitespace -> '_'), then resolve each.
                p1['id'], p2['id'] = '_'.join(p1['id'].split()), '_'.join(p2['id'].split())
                _resolve_profile_id(p1, season, pp)
                _resolve_profile_id(p2, season, pp)
            player_rundle_history.append([p1['id'], season, md, rundle[0], rundle[1], rundle[2]])
            player_rundle_history.append([p2['id'], season, md, rundle[0], rundle[1], rundle[2]])

        match_participation.append([match_id, p1['id'], p1_result])
        match_participation.append([match_id, p2['id'], p2_result])
    return player_rundle_history, match_participation, pp
# get_rundle_matchday(['D', 'Magnolia', 1], 80, 21)

In [75]:
# Scrape every rundle/match day for seasons 30-93 with `get_rundle_matchday`,
# accumulating into three shared containers, then save each container as JSON.
player_rundle_history, match_participation, player_profile = [], [], dict()
with open('rundles_by_season.json', 'r') as fp:
    fr = json.load(fp)
full_rundles = []
# for key in fr.keys():
#     full_rundles.extend(fr[key]['rundles'])
# full_rundles
import time
st = time.time()
for season in range(30, 94):
    this_season_rundles = fr[str(season)]['rundles']
    # Seasons up to 38 are iterated over 20 match days, later seasons over 25.
    for md in range(1, 21 if season <= 38 else 26):
        for rundle in this_season_rundles:
            try:
                player_rundle_history, match_participation, player_profile = get_rundle_matchday(
                    rundle, season, md, player_rundle_history, match_participation, player_profile
                )
            except Exception as exc:
                # `Exception` rather than a bare `except:` so that interrupting
                # the kernel actually stops this long-running loop, and the
                # cause of each failure is reported.
                print(f'Failed on LL{season}, md {md}, rundle {rundle}, url: {rundle_md_url(rundle, season, md)}, error: {exc!r}')
    # `st` is reset below, so the reported time is per season.
    print(f'completed LL{season}, total time: {time.time()-st}')
    st = time.time()

# Save each accumulator independently so one failed dump does not block the others.
for out_path, payload, label in (
    ('match_participation.json', match_participation, 'match participation'),
    ('player_rundle_history.json', player_rundle_history, 'player rundle history'),
    ('player_profile_old_ids.json', player_profile, 'profile'),
):
    try:
        with open(out_path, 'w') as a:
            json.dump(payload, a)
    except Exception as exc:
        print(f'could not save {label} data: {exc!r}')

completed LL30, total time: 63.94884777069092
completed LL31, total time: 7.904952049255371
completed LL32, total time: 16.15006685256958
completed LL33, total time: 16.884157180786133
completed LL34, total time: 32.05800104141235
completed LL35, total time: 33.9595308303833
completed LL36, total time: 34.343120098114014
completed LL37, total time: 32.774303913116455
completed LL38, total time: 37.54530692100525
completed LL39, total time: 36.277426958084106
completed LL40, total time: 33.288006067276
completed LL41, total time: 44.730040073394775
completed LL42, total time: 54.36037826538086
completed LL43, total time: 62.04588174819946
completed LL44, total time: 66.23294591903687
completed LL45, total time: 70.78969597816467
Failed on LL46, md 1, rundle ['R', 'Atlantic', 0], url: https://www.learnedleague.com/ll46/questions/ll46md01R_Atlanticresults.shtml
completed LL46, total time: 73.28983497619629
completed LL47, total time: 94.33732295036316
completed LL48, total time: 105.09880