In [242]:
from bs4 import BeautifulSoup
import configparser
import requests
import json
import pickle
import pandas as pd
import datetime
from constants_funcs import get_session, int_or_float_or_str
with open('constants.json', 'r') as d:
    constants = json.load(d)

In [6]:
LLHEADER = 'https://www.learnedleague.com'
LOGINFILE = LLHEADER + '/ucp.php?mode=login'
USER_DATA = LLHEADER + '/profiles/previous.php?%s'
QHIST = LLHEADER + '/profiles/qhist.php?%s'
MATCH_DATA = LLHEADER + '/match.php?%s'
ONEDAYS = LLHEADER + '/oneday'
STANDINGS = '/standings.php?'
LLSTANDINGS = LLHEADER + STANDINGS
ARUNDLE = LLSTANDINGS + '%d&A_%s'
INPUTDATA = 'logindata.ini'
TOTAL_MATCHES_PER_SEASON = 25

In [11]:
sess = get_session()
sess

creating new login session...


<requests.sessions.Session at 0x7fdf80c4dc10>

In [None]:
class Question:
    def __init__(self, text, category, season, day, qnum, correct) -> None:
        self.text = text
        self.category = category
        self.season = season
        self.day = day
        self.qnum = qnum
        self.correct = correct
full_question_history = []

In [459]:
import re
def get_mcw(qid: str):
    season, md, qnum = qid.split('-')
    if int(season) < 52: # MCW not available for LL51 and earlier.
        return '', None
    if int(season) >= 56:
        url = constants['LLHEADER']+f'/question.php?{season}&{md}&{qnum}'
        ids_html = sess.get(url)
        html_text = ids_html.text
        soup = BeautifulSoup(html_text, 'html.parser')
        txt_arr = re.split('[\n\t]+', soup.find('div', attrs={'class': 'indivqContent'}).find('div').find_all('div')[4].text)
        mcw_idx = txt_arr.index('Most Common Wrong Answer')
        mcw = txt_arr[mcw_idx + 1]
    else:
        zfilled_md = str(md).zfill(2)
        url = constants['LLHEADER']+f'/ll{season}/questions/md{zfilled_md}/md{zfilled_md}q{qnum}.php'
        ids_html = sess.get(url)
        html_text = ids_html.text
        soup = BeautifulSoup(html_text, 'html.parser')
        txt_arr = soup.find(text='Most Common Wrong Answer').parent.parent.find('div', {'class': 'indivqBRA-inner'})
        mcw = txt_arr.text.strip()
    if mcw.find('No answer') > -1 or mcw.find('None') > -1:
        return '', None
    text = ' '.join(mcw.split()[:-1])
    pct = int(mcw.split()[-1][1:-2])
    return text, pct

In [None]:
question_history = dict() # format: {season-md-qnum  (str): {


In [413]:
# Get the questions and answers for each matchday in past seasons.
#  season: int, matchday: int, qnum: int, question: str, answer: str, category: str, forfeit: float, tot_correct: int
#   A_correct: int, B_correct: int, C_correct: int, D_correct: int, E_correct: int, R_correct: int, defense: float
# }}
# a dict of dicts.
# Note: all x_correct may not be filled, early seasons often do not have D or E rundles.
def get_question(season: int, md: int) -> dict:
    if not isinstance(season, int) or not 30 <= season <= 94:
        print('valid seasons are integer 30-93')
        return
    elif not isinstance(md, int) or not 1 <= md <= 25:
        print('valid matchdays are integers 1-25 (1-20 for LL38 and earlier)')
        return
    elif season <= 38 and md > 20:
        print('LL38 and earlier only have 20 match days per season.')
        return
    # early template (seasons 52-56): https://learnedleague.com/ll55/questions/md01.php
    # late template (seasons 57-present): https://learnedleague.com/match.php?60&1
    zfilled_md = str(md).zfill(2)
    if season <= 38:
        url = f'https://www.learnedleague.com/ll{season}/questions/md{zfilled_md}.shtml'
    elif season <= 51:
        url = f"https://www.learnedleague.com/ll{season}/questions/ll{season}md{zfilled_md}questions.htm"
    elif season <= 56:
        url = f"https://www.learnedleague.com/ll{season}/questions/md{zfilled_md}.php"
    else:
        url = f"https://www.learnedleague.com/match.php?{season}&{md}"
    main_data = sess.get(url)
    html_text = main_data.text
    soup = BeautifulSoup(html_text, 'html.parser')
    if season <= 32: # LL30-32
        qs = soup.find('table').find_all('tr')[2:]
        md_date = soup.find('h1', {'class': 'matchday'}).text.split(':')[0].strip()
        if season == 31 and md >= 15:
            metrics_data = sess.get(f'https://www.learnedleague.com/ll{season}/questions/md{zfilled_md}.shtml')
            some_match = soup.find('div', {'class': 'sbGame'})['onclick'].split("'")[1]
            metrics_data = sess.get(f'https://www.learnedleague.com{some_match}')
            metrics_soup = BeautifulSoup(metrics_data.text, 'html.parser')
            qs = metrics_soup.find('table').find_all('tr')[8:14]
            for idx, q in enumerate(qs[:6]):
                if len(q.contents) == 10:
                    qnum, txt, _, _, defense, a, b, c, r, tot = list(map(lambda c: c.text, q.contents))
                    txt = ' '.join(list(map(
                    lambda t: t.text.strip() if hasattr(t, 'text') else str(t).strip(),
                    q.contents[1].contents[0:-1])))
                    ans = q.contents[1].find('span').text
                    try:
                        cat = q.contents[1].contents[0].text
                    except:
                        cat = txt.split(' - ')[0]
                else:
                    qnum, txt, ans, _, _, defense, a, b, c, r, tot = list(map(lambda c: c.text, q.contents))
                    cat = txt.split('-')[0].strip()
                qnum = int(qnum[0])
                txt = '-'.join(txt.split('-')[1:]).strip()
                defense, a, b, c, r, tot = list(map(int_or_float_or_str, [defense, a, b, c, r, tot]))
                question_history[f"{season}-{md}-{qnum}"] = {
                    'season': season, 'matchday': md, 'qnum': qnum, 'question': txt, 'answer': ans, 'category': cat,
                    'A_correct': a, 'B_correct': b, 'C_correct': c, 'R_correct': r,
                    'tot_correct': tot, 'defense': defense, 'forfeit': None, 'mcw': '', 'mcw_pct': None,
                    'date': md_date
                }
        else:
            metrics_data = sess.get(f'https://www.learnedleague.com/ll{season}/questions/md{zfilled_md}a.shtml')
            metrics_soup = BeautifulSoup(metrics_data.text, 'html.parser')
            if metrics_soup.find('table', {'class': 'ResBox'}): metrics = metrics_soup.find('table', {'class': 'ResBox'}).find_all('tr')
            else: metrics = metrics_soup.find('table', {'class': 'ind-boxATbl'}).find_all('tr')
            avg_def = list(map(lambda td: float(td.text), metrics[1].find_all('td')[1:7]))
            tot_correct = list(map(lambda td: int(td.text), metrics[3].find_all('td')[1:7]))
            A_correct = list(map(lambda td: int(td.text), metrics[4].find_all('td')[1:7]))
            B_correct = list(map(lambda td: int(td.text), metrics[5].find_all('td')[1:7]))
            C_correct = list(map(lambda td: int(td.text), metrics[6].find_all('td')[1:7]))
            R_correct = list(map(lambda td: int(td.text), metrics[7].find_all('td')[1:7]))
            for idx, q in enumerate(qs[:6]):
                row = list(map(lambda td: td.text.strip(), q.find_all('td')))
                qnum = int(row[0][:-1])
                if qnum == 1: # it does some weird shit
                    txt = row[1].split('\n\n')[0].strip()
                else:
                    txt = row[1]
                ans = row[2].strip()
                cat = txt.split('-')[0].strip()
                txt = '-'.join(txt.split('-')[1:]).strip()
                question_history[f"{season}-{md}-{qnum}"] = {
                    'season': season, 'matchday': md, 'qnum': qnum, 'question': txt, 'answer': ans, 'category': cat,
                    'A_correct': A_correct[idx], 'B_correct': B_correct[idx], 'C_correct': C_correct[idx], 'R_correct': R_correct[idx],
                    'tot_correct': tot_correct[idx], 'defense': avg_def[idx], 'forfeit': None, 'mcw': '', 'mcw_pct': None,
                    'date': md_date
                }
    elif season <= 38: # LL33-38
        md_date = soup.find('h1', {'class': 'matchday'}).text.split(':')[0].strip()
        some_match = soup.find('div', {'class': 'sbGame'})['onclick'].split("'")[1]
        metrics_data = sess.get(f'https://www.learnedleague.com{some_match}')
        metrics_soup = BeautifulSoup(metrics_data.text, 'html.parser')
        qs = metrics_soup.find('table').find_all('tr')[8:14]
        for idx, q in enumerate(qs[:6]):
            if len(q.contents) == 10:
                qnum, txt, _, _, defense, a, b, c, r, tot = list(map(lambda c: c.text, q.contents))
                txt = ' '.join(list(map(
                lambda t: t.text.strip() if hasattr(t, 'text') else str(t).strip(),
                q.contents[1].contents[0:-1])))
                ans = q.contents[1].find('span').text
                try:
                    cat = q.contents[1].contents[0].text
                except:
                    cat = txt.split(' - ')[0]
            else:
                qnum, txt, ans, _, _, defense, a, b, c, r, tot = list(map(lambda c: c.text, q.contents))
                cat = txt.split('-')[0].strip()
            qnum = int(qnum[0])
            defense, a, b, c, r, tot = list(map(int_or_float_or_str, [defense, a, b, c, r, tot]))
            txt = '-'.join(txt.split('-')[1:]).strip()
            question_history[f"{season}-{md}-{qnum}"] = {
                'season': season, 'matchday': md, 'qnum': qnum, 'question': txt, 'answer': ans, 'category': cat,
                'A_correct': a, 'B_correct': b, 'C_correct': c, 'R_correct': r,
                'tot_correct': tot, 'defense': defense, 'forfeit': None, 'mcw': '', 'mcw_pct': None,
                'date': md_date
            }
    elif season <= 42: # LL39-42
        date_data = sess.get(f'https://www.learnedleague.com/ll{season}/questions/md{zfilled_md}.shtml')
        md_date = BeautifulSoup(date_data.text, 'html.parser').find('h1', {'class': 'matchday'}).text.split(':')[0].strip()
        qs = soup.find('table', {'class': 'ind-boxATbl'}).find_all('tr')[2:8]
        for idx, q in enumerate(qs[:6]):
            row = q.find_all('td')[:2]
            qnum = int(row[0].text[0])
            ans = row[1].find('span')
            ans_idx = row[1].index(ans)
            ans = ans.text.strip()
            txt = ' '.join(list(map(
                lambda t: t.text.strip() if hasattr(t, 'text') else str(t).strip(),
                row[1].contents[0:ans_idx])))
            cat = txt.split('-')[0].strip()
            txt = '-'.join(txt.split('-')[1:]).strip()
            a, b, c, r, tot, defense = list(map(lambda c: int_or_float_or_str(c.text), row[1].contents[-6:]))
            question_history[f"{season}-{md}-{qnum}"] = {
                'season': season, 'matchday': md, 'qnum': qnum, 'question': txt, 'answer': ans, 'category': cat,
                'A_correct': a, 'B_correct': b, 'C_correct': c, 'R_correct': r,
                'tot_correct': tot, 'defense': defense, 'forfeit': None, 'mcw': '', 'mcw_pct': None,
                'date': md_date
            }
    elif season <= 50: # LL43-50
        date_data = sess.get(f'https://www.learnedleague.com/ll{season}/questions/md{zfilled_md}.shtml')
        md_date = BeautifulSoup(date_data.text, 'html.parser').find('h1', {'class': 'matchday'}).text.split(':')[0].strip()
        qs = soup.find('div', {'class': 'ind-boxATbl'}).find_all('p')
        metrics = soup.find('table', {'class': 'ind-boxATbl2'}).find_all('tr')
        d, champ = None, None # rundles championship and d introduced in LL48
        if season <= 47: a, b, c, r, tot, defense = list(map(lambda m: m.contents[1:], metrics[-6:]))
        else: champ, a, b, c, d, r, tot, defense = list(map(lambda m: m.contents[1:], metrics[-8:]))
        forfeit = float(tot[0].text.strip())
        for idx, q in enumerate(qs[:6]):
            qnum = int(q.contents[0].text[0])
            ans = q.contents[-1].text
            txt = ' '.join(list(map(
                lambda t: t.text.strip() if hasattr(t, 'text') else str(t).strip(),
                q.contents[1:-1])))
            cat = txt.split('-')[0].strip()
            txt = '-'.join(txt.split('-')[1:]).strip()
            question_history[f"{season}-{md}-{qnum}"] = {
                'season': season, 'matchday': md, 'qnum': qnum, 'question': txt, 'answer': ans, 'category': cat,
                'A_correct': int(a[qnum].text.strip()), 'B_correct': int(b[qnum].text.strip()), 'C_correct': int(c[qnum].text.strip()), 'R_correct': int(r[qnum].text.strip()),
                'tot_correct': int(tot[qnum].text.strip()), 'defense': float(defense[qnum].text.strip()), 'forfeit': forfeit, 'mcw': '', 'mcw_pct': None,
                'date': md_date
            }
            if d: question_history[f"{season}-{md}-{qnum}"]['D_correct'] = int(d[qnum].text.strip())
            if champ: question_history[f"{season}-{md}-{qnum}"]['Champ_correct'] = int(champ[qnum].text.strip())
    # # correct stuff below.
    else:
        if season == 51:
            date_data = sess.get(f'https://www.learnedleague.com/ll{season}/questions/md{zfilled_md}.shtml')
            md_date = BeautifulSoup(date_data.text, 'html.parser').find('h1', {'class': 'matchday'}).text.split(':')[0].strip()
        else: md_date = soup.find('h1', {'class': 'matchday'}).text.split(':')[0].strip()
        qs = soup.findAll('div', attrs={'class': 'ind-Q20'})
        if season == 51: anss = soup.findAll('div', attrs={'class': 'answer3'})
        for idx, q in enumerate(qs[:6]):
            txt = q.text
            # qnum = int(txt[2])
            qnum = idx + 1
            txt = txt[4:].strip()
            cat = txt.split('-')[0].strip()
            txt = '-'.join(txt.split('-')[1:]).strip()
            extra = ''
            if season > 56: extra = '1'
            elif idx >= 6: extra = str(idx//6 + 1)
            if season == 51: ans = anss[idx].text
            else: ans = soup.find('div', attrs={'id': f"Q{idx%6+1}{extra}ANS"}).text.strip()
            # if season <= 55:
            #     ans = soup.find('div', attrs={'id': f"Q{idx+1}ANS"}).text.strip()
            # else:
            #     ans = soup.find('div', attrs={'id': f"Q{idx+1}1ANS"}).text.strip()
                # ans = anss[idx].text.split('\n\t')[1].strip()
            mcw, mcw_pct = get_mcw(f'{season}-{md}-{qnum}')
            question_history[f"{season}-{md}-{qnum}"] = {
                'season': season, 'matchday': md, 'qnum': qnum, 'question': txt, 'answer': ans, 'category': cat,
                'mcw': mcw, 'mcw_pct': mcw_pct, 'date': md_date
            }
        if season >= 60: # newest seasons
            metrics = soup.find('div', attrs={'id': 'rght'})
            for lvl in ['A', 'B', 'C', 'D', 'E', 'R']:
                lvl_data = [x.text for x in metrics.findAll('td', attrs={'class': f'level{lvl}'})]
                for idx in range(1,7):
                    question_history[f"{season}-{md}-{idx}"][f"{lvl}_correct"] = int(lvl_data[idx+1])
            leaguewide, defense = metrics.find('tfoot').findAll('tr')
            leaguewide, defense = leaguewide.findAll('td'), defense.findAll('td')
            for idx in range(1,7):
                question_history[f"{season}-{md}-{idx}"]['tot_correct'] = int(leaguewide[idx+1].text)
                question_history[f"{season}-{md}-{idx}"]['forfeit'] = float(leaguewide[1].text)
                question_history[f"{season}-{md}-{idx}"]['defense'] = float(defense[idx+1].text)
        elif season >= 52: # new-ish seasons
            for row in soup.find('table', attrs={'class': 'ind-boxATbl2'}).findAll('tr'):
                row = [x.text.strip() for x in row.findAll('td')]
                if row[0].startswith('All ') and (len(row[0]) == 5 or row[0].endswith('(Class 1)')):
                    for idx in range(1,7):
                        question_history[f"{season}-{md}-{idx}"][f"{row[0][4]}_correct"] = int(row[idx+1])
                elif row[0] == 'Totals' or row[0] == 'Leaguewide' or row[0] == 'Leaguewide: Class 1':
                    for idx in range(1,7):
                        question_history[f"{season}-{md}-{idx}"]["tot_correct"] = int(row[idx+1])
                        question_history[f"{season}-{md}-{idx}"]["forfeit"] = float(row[1])
                elif row[0] == 'Defense' or row[0] == 'Defense: Class 1':
                    for idx in range(1,7):
                        question_history[f"{season}-{md}-{idx}"]["defense"] = float(row[idx+1])
                elif row[0].startswith('Champ'):
                    for idx in range(1,7):
                        question_history[f"{season}-{md}-{idx}"]["Champ_correct"] = int(row[idx+1])
        elif season == 51:
            metrics = soup.find('table', {'class': 'ind-boxATbl2'}).find_all('tr')
            champ, a, b, c, d, r, tot, defense = list(map(lambda m: m.contents[1:], metrics[-8:]))
            forfeit = float(tot[0].text.strip())
            for idx in range(1,7):
                question_history[f"{season}-{md}-{idx}"]['Champ_correct'] = int(champ[idx].text.strip())
                question_history[f"{season}-{md}-{idx}"]['A_correct'] = int(a[idx].text.strip())
                question_history[f"{season}-{md}-{idx}"]['B_correct'] = int(b[idx].text.strip())
                question_history[f"{season}-{md}-{idx}"]['C_correct'] = int(c[idx].text.strip())
                question_history[f"{season}-{md}-{idx}"]['D_correct'] = int(d[idx].text.strip())
                question_history[f"{season}-{md}-{idx}"]['R_correct'] = int(r[idx].text.strip())
                question_history[f"{season}-{md}-{idx}"]['tot_correct'] = int(tot[idx].text.strip())
                question_history[f"{season}-{md}-{idx}"]['defense'] = float(defense[idx].text.strip())
                question_history[f"{season}-{md}-{idx}"]['forfeit'] = forfeit
    return [question_history[f"{season}-{md}-{x}"] for x in range(1,7)]

In [452]:
for seas in range(30, 95): # getting questions from seasons 30-94
    for day in range(1, 26 if seas > 38 else 20): # matchdays 1-25
        get_question(seas, day)
    print(f"done with season {seas}")

done with season 52
done with season 53
done with season 54
done with season 55


In [453]:
len(question_history)

9366

In [454]:
with open('questions.pkl', 'wb') as fp:
    pickle.dump(question_history, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [455]:
with open('questions.pkl', 'rb') as fp:
    full_question_history = pickle.load(fp)

In [456]:
df = pd.DataFrame.from_dict(question_history, orient='index')
df['D_correct'] = df['D_correct'].astype('Int64')
df['E_correct'] = df['E_correct'].astype('Int64')
df['Champ_correct'] = df['Champ_correct'].astype('Int64')
df['mcw_pct'] = df['mcw_pct'].astype('Int64')
reordered_columns = ['season', 'matchday', 'qnum', 'question', 'answer', 'category',
    'A_correct', 'B_correct', 'C_correct', 'D_correct', 'E_correct', 'R_correct',  'Champ_correct', 'tot_correct',
    'defense', 'forfeit', 'mcw', 'mcw_pct', 'date']
df = df[reordered_columns]
df.sort_values(by=['season', 'matchday', 'qnum'], ascending=True, inplace=True)
df.head()

Unnamed: 0,season,matchday,qnum,question,answer,category,A_correct,B_correct,C_correct,D_correct,E_correct,R_correct,Champ_correct,tot_correct,defense,forfeit,mcw,mcw_pct,date
30-1-1,30,1,1,Mesothelioma is a form of cancer which is virt...,ASBESTOS,SCIENCE,71,69,41,,,71,,61,1.5,,,,"July 10, 2006"
30-1-2,30,1,2,Playwright Jonathan Larson died unexpectedly i...,RENT,THEATRE,88,86,56,,,71,,75,1.5,,,,"July 10, 2006"
30-1-3,30,1,3,"Former caterer, chef, author, entrepreneur, co...",BAREFOOT CONTESSA,FOOD/DRINK,50,47,28,,,29,,40,1.7,,,,"July 10, 2006"
30-1-4,30,1,4,"According the U.S. census of 1900, what were t...","NEW YORK, CHICAGO, PHILADELPHIA",AMER HIST,54,39,28,,,29,,38,1.8,,,,"July 10, 2006"
30-1-5,30,1,5,Identify the American novelist and author of t...,SUE GRAFTON,LITERATURE,67,58,44,,,43,,54,1.7,,,,"July 10, 2006"


In [458]:
df.to_csv('FULL_QUESTION_HISTORY_v2.csv')