In [1]:
import os
import sys
import logging
from tqdm import tqdm
import requests
import lxml
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Scratch space the parsers write into ('url', 'body_content', 'contestants',
# 'name_check') so the page being processed can be inspected after a failure.
debugger_dict = {}

logger = logging.getLogger('Jeopardy Parser')
# logging.getLogger returns the same object for the same name, so re-running
# this cell would otherwise attach another FileHandler and write every
# message to technical_issues.log once per attached handler.
if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
    f_handler = logging.FileHandler('technical_issues.log')
    f_handler.setLevel(logging.INFO)
    f_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
    f_handler.setFormatter(f_format)
    logger.addHandler(f_handler)

def get_categories(jeopardy_round):
    """Return the category names of one round, title-cased, in page order.

    jeopardy_round: parsed element of a round; every ``td`` with class
        ``category_name`` found inside it contributes one entry.
    """
    return [
        cell.get_text().title()
        for cell in jeopardy_round.find_all('td', class_='category_name')
    ]
    
def clue_parser(jeopardy_round, contestants):
    """Collect one row per clue from a parsed round.

    The round is scanned in four separate passes (clue text, answer panel,
    clue header, selection order) and the results are joined purely by
    position, so the passes must yield the same number of elements in the
    same order.

    jeopardy_round: parsed element of the round.
    contestants: mapping from on-screen nickname to a list whose first item
        is the name to report for a correct response.

    Returns a list of rows laid out as
    [category, clue, correct response, names that answered correctly or None,
     triple stumper flag, four-wrong-cells flag, clue value, daily double flag,
     selection order].
    """
    categories = get_categories(jeopardy_round)

    # The character two positions before the end of the cell id is used as the
    # 1-based category index (presumably ids look like "clue_J_<col>_<row>").
    rows_to_add = [
        [categories[int(cell.get('id')[-3:-2]) - 1], cell.get_text()]
        for cell in jeopardy_round.find_all('td', class_='clue_text')
    ]

    for i, div in enumerate(jeopardy_round.find_all('div', onmouseover=True)):
        # The answer panel is markup stored inside the attribute value, so it
        # needs its own parse; name the parser explicitly like the rest of the
        # file instead of letting BeautifulSoup guess (and warn).
        response = BeautifulSoup(div.get('onmouseover'), 'lxml')
        row = rows_to_add[i]
        row.append(response.find('em', class_='correct_response').get_text())

        right_cells = response.find_all('td', class_='right')
        if right_cells:
            nicknames = [n.get_text().replace("\\'", "'") for n in right_cells]
            row.append(' and '.join(contestants[n][0] for n in nicknames))
        else:
            row.append(None)

        wrong_cells = response.find_all('td', class_='wrong')
        # Triple Stumper: the last "wrong" cell carries that exact label
        row.append(bool(wrong_cells) and wrong_cells[-1].get_text() == 'Triple Stumper')
        # All Wrong Answers: exactly four "wrong" cells are present
        row.append(len(wrong_cells) == 4)

    for i, header in enumerate(jeopardy_round.find_all('table', class_='clue_header')):
        value_cell = header.find('td', class_='clue_value')
        if value_cell:
            rows_to_add[i].append(value_cell.get_text())
            rows_to_add[i].append(False)
        else:
            # No regular value cell means a daily double; the first four
            # characters of its text are dropped (presumably a "DD: " prefix).
            rows_to_add[i].append(header.find('td', class_='clue_value_daily_double').get_text()[4:])
            rows_to_add[i].append(True)

    for i, order_cell in enumerate(jeopardy_round.find_all('td', class_='clue_order_number')):
        rows_to_add[i].append(order_cell.get_text())

    return rows_to_add
    
def round_parser(show_body, show_round, contestants, episode, date):
    """Build the clue table for one round of a show.

    show_body: parsed content element of the show page.
    show_round: id of the round's ``div`` (e.g. 'jeopardy_round').
    contestants: nickname lookup forwarded to ``clue_parser``.
    episode, date: values written to every row of the result.

    Returns a DataFrame with the nine clue columns plus 'Episode',
    'Date First Aired' and 'Round'. When the round's ``div`` is absent a
    warning is logged and an empty DataFrame (no columns) is returned.

    Raises ValueError if a parsed row does not have exactly nine values.
    """
    jeopardy_round = show_body.find('div', id=show_round)
    logger = logging.getLogger('Jeopardy Parser')

    if jeopardy_round is None:
        logger.warning(f"Issues found while parsing clues in {show_round}. Div tag for round not detected. {debugger_dict['url']}")
        return pd.DataFrame()

    rows_to_add = clue_parser(jeopardy_round, contestants)

    colnames = ['Category', 'Clue', 'Correct Response', 'Answered Correctly', 'Triple Stumper', 'All Wrong Answers', 'Clue Value', 'Daily Double',
                'Selection Order']

    # Build the frame in one go: DataFrame.append copied the whole frame per
    # row and no longer exists in pandas 2.x. Wrapping each row in a Series
    # keeps the strict length check (a short row raises instead of being
    # silently padded).
    show_df = pd.DataFrame(
        [pd.Series(row, index=colnames) for row in rows_to_add],
        columns=colnames,
    )

    show_df['Episode'] = episode
    show_df['Date First Aired'] = date
    show_df['Round'] = jeopardy_round.find('h2').get_text()

    return show_df
    
def get_nickname(show_contestants, nickname, name_check, matched):
    """Resolve the scoreboard name for a contestant whose first name is not on it.

    show_contestants: parsed content element of the show page.
    nickname: first name taken from the contestant's bio line.
    name_check: scoreboard names not yet matched to a contestant (the caller
        passes a copy); when empty, ``nickname`` is returned unchanged.
    matched: first names already matched, excluded from the candidates.

    Returns the scoreboard name chosen for this contestant. If nothing can be
    paired, a warning is logged and ``nickname`` itself is returned so the
    caller never receives ``None`` to use as a dictionary key.
    """
    debugger_dict['name_check'] = name_check
    if not name_check:
        return nickname

    logger = logging.getLogger('Jeopardy Parser')

    # First names from the <p> elements of the second sibling after the first
    # table cell (assumed to be the contestant bios — TODO confirm against the page).
    lookup = {p.get_text().split(', ', 1)[0].split()[0] for p in show_contestants.table.td.next_sibling.next_sibling.find_all('p')}
    lookup.difference_update(matched)

    # Scoreboard names with no matching first name
    board_only = name_check.difference(lookup)
    if len(board_only) == 1:
        return board_only.pop()

    # Several candidates: pair leftovers from both sides in sorted order and
    # pick the scoreboard name paired with this contestant's first name.
    bio_only = sorted(lookup.difference(name_check))
    for board_name, bio_name in zip(sorted(board_only), bio_only):
        if nickname == bio_name:
            logger.warning(f"Issues found with contestant names. ({board_name}, {bio_name}) {debugger_dict['url']}.")
            return board_name

    # Previously this fell off the end and returned None implicitly.
    logger.warning(f"Issues found with contestant names. Unable to match {nickname}. {debugger_dict['url']}.")
    return nickname

def team_contestants_parser(show_contestants, episode, date, name_check, terminate_control):
    """Build the contestants lookup for a team game.

    show_contestants: parsed content element of the show page.
    episode, date: values stored on each team leader's row.
    name_check, terminate_control: only consulted to decide whether to log
        that the teams could not be verified; they do not affect the result.

    Returns a dict keyed by first name. Every player gets
    ['<name> (<team>)']; each leader's entry (second word of every ``h3`` in
    ``contestants_table``) is extended with the team roster, occupations,
    locations, episode, date, the Final Jeopardy fields and the scores
    gathered from the three rounds (``None`` placeholders when a round
    cannot be parsed).
    """
    contestants = {}
    logger = logging.getLogger('Jeopardy Parser')

    if terminate_control:
        try:
            name_check = {s for s in show_contestants.find('div', id='double_jeopardy_round').table.find_next_sibling('table').tr.stripped_strings}
            terminate_control = False
        except AttributeError:
            logger.warning(f"Issues found while searching for teams. Unable to verify teams. {debugger_dict['url']}")

    team_headers = show_contestants.find(id='contestants_table').find_all('h3')
    # Each team name is repeated three times so it lines up with the players
    # when zipped against the contestant paragraphs (assumes three per team).
    teams = [tn for t in team_headers for tn in [t.get_text().split(' (')[0]]*3]
    leaders = [tn.get_text().split()[1] for tn in team_headers]
    member_lookup = {}
    occupation_lookup = {}
    place_lookup = {}

    for c, team in zip(show_contestants.find_all('p', class_='contestants'), teams):
        show_round, bio = c.get_text().split(': ', 1)
        name, bio = bio.split(', ', 1)
        contestants[name.split(' ')[0]] = [f'{name} ({team})']
        member_lookup[team] = {**member_lookup.get(team, {}), **{show_round.split(' the ', 1)[1]: name}}
        occupation_lookup[team] = {**occupation_lookup.get(team, {}), **{name: bio.split(' from ', 1)[0]}}
        place_lookup[team] = {**place_lookup.get(team, {}), **{name: bio.split(' from ', 1)[1]}}

    for leader in leaders:
        contestants[leader].append({'Team '+leader: member_lookup['Team '+leader]})
        contestants[leader].append(occupation_lookup['Team '+leader])
        contestants[leader].append(place_lookup['Team '+leader])
        contestants[leader].extend([episode, date])

    debugger_dict['contestants'] = contestants
    final_round = show_contestants.find('div', id='final_jeopardy_round')

    if final_round:
        # The response panel is markup stored in the attribute; parse it once
        # with the same parser used for the pages themselves.
        response_soup = BeautifulSoup(final_round.find('div', onmouseover=True).get('onmouseover'), 'lxml')
        wagers = [i.get_text().replace("\\'", "'") for i in response_soup.table.find_all('td')]

        # Cells are consumed in groups of three: name, then two values
        for c, a, w in zip(wagers[::3], wagers[1::3], wagers[2::3]):
            # "<name> (Team X)" -> "X": the leader key owning the team's row
            team_row = contestants[contestants[c][0].split('(')[1][5:-1]]
            team_row.append(final_round.find('td', class_='category_name').get_text().title())
            team_row.append(final_round.find('td', class_='clue_text').get_text())
            team_row.append(response_soup.find('em').get_text())
            team_row.extend([a, w])
    else:
        for leader in leaders:
            contestants[leader].extend([None, None, None, None, None])
        # Logged once per show rather than once per team
        logger.warning(f"Issues found while parsing final jeopardy round. Div tag for round missing {debugger_dict['url']}")

    for show_round in ['jeopardy_round', 'double_jeopardy_round', 'final_jeopardy_round']:
        try:
            for c in show_contestants.find('div', id=show_round).table.find_next_siblings('table'):
                for name, score in zip(c.tr.find_all('td'), c.tr.find_next_sibling('tr').find_all('td')):
                    contestants[name.get_text().split()[1]].append(score.get_text())
        except Exception:
            # Any parsing failure falls back to placeholders; unlike a bare
            # except, KeyboardInterrupt/SystemExit still propagate.
            for leader in leaders:
                if show_round != 'double_jeopardy_round':
                    contestants[leader].extend([None, None])
                else:
                    contestants[leader].append(None)
            logger.warning(f"Issues found while parsing {show_round}. Div tag for round missing. {debugger_dict['url']}")

    return contestants
    
def contestants_parser(show_contestants, episode, date):
    """Build the contestants lookup for a show.

    show_contestants: parsed content element of the show page.
    episode, date: values stored on every contestant's row.

    Returns a dict keyed by scoreboard nickname. Each value starts as
    [name, occupation, place, episode, date] and, unless no scoreboard could
    be found at all, is extended with the Final Jeopardy category, clue and
    correct response, the two cells following the contestant's name in the
    response table, and the scores read from each round (``None``
    placeholders where something is missing). Team games are delegated to
    ``team_contestants_parser``.
    """
    contestants, matched = {}, []
    terminate_control = False
    logger = logging.getLogger('Jeopardy Parser')

    # Scoreboard names come from the first score table of the Jeopardy round,
    # falling back to the Double Jeopardy round when that one is missing.
    try:
        name_check = {s for s in show_contestants.find('div', id='jeopardy_round').table.find_next_sibling('table').tr.stripped_strings}
    except AttributeError:
        try:
            name_check = {s for s in show_contestants.find('div', id='double_jeopardy_round').table.find_next_sibling('table').tr.stripped_strings}
        except AttributeError:
            name_check = set()
            terminate_control = True
            logger.warning(f"Issues found while searching for contestants names. Unable to verify contestant names. {debugger_dict['url']}")

    team_names = {n.get_text().split(' (')[0] for n in show_contestants.find(id='contestants_table').find_all('h3')}
    # Without scoreboard names the team path is never taken (the old dict
    # sentinel could not compare equal to a set); keep that explicit.
    if not terminate_control and name_check == team_names:
        return team_contestants_parser(show_contestants, episode, date, name_check, terminate_control)

    for c in show_contestants.find_all('p', class_='contestants'):
        name, bio = c.get_text().split(', ', 1)
        nickname = name.split()[0]

        if name_check and nickname in name_check:
            name_check.discard(nickname)
            matched.append(nickname)
        else:
            nickname = get_nickname(show_contestants, nickname, name_check.copy(), matched)

        if 'originally' in bio:
            occupation, place = bio.split(' originally from ')
        else:
            occupation, place = bio.split(' from ', 1)
        if '(' in place:
            place = place.split(' (', 1)[0]

        contestants[nickname] = [name, occupation, place, episode, date]

    debugger_dict['contestants'] = contestants

    if terminate_control:
        return contestants

    name_lookup = set(contestants.keys())
    final_round = show_contestants.find('div', id='final_jeopardy_round')

    if final_round:
        # The response panel is markup stored in the attribute; parse it once
        # (with an explicit parser) instead of once per contestant.
        response_soup = BeautifulSoup(final_round.find('div', onmouseover=True).get('onmouseover'), 'lxml')

        for row in contestants.values():
            row.append(final_round.find('td', class_='category_name').get_text().title())
            row.append(final_round.find('td', class_='clue_text').get_text())
            row.append(response_soup.find('em').get_text())

        wagers = [i.get_text().replace("\\'", "'") for i in response_soup.table.find_all('td')]

        # Cells are consumed in groups of three: name, then two values
        for c, a, w in zip(wagers[::3], wagers[1::3], wagers[2::3]):
            name_lookup.discard(c)
            contestants[c].extend([a, w])

        # Contestants absent from the response table still need both slots
        for c in name_lookup:
            contestants[c].extend([None, None])
    else:
        for row in contestants.values():
            row.extend([None, None, None, None, None])
        logger.warning(f"Issues found while parsing final jeopardy round. Div tag for round missing {debugger_dict['url']}")

    for show_round in ['jeopardy_round', 'double_jeopardy_round', 'final_jeopardy_round']:
        try:
            for c in show_contestants.find('div', id=show_round).table.find_next_siblings('table'):
                for name, score in zip(c.tr.find_all('td'), c.tr.find_next_sibling('tr').find_all('td')):
                    contestants[name.get_text()].append(score.get_text())
        except Exception:
            # Any parsing failure falls back to placeholders; unlike a bare
            # except, KeyboardInterrupt/SystemExit still propagate.
            for row in contestants.values():
                if show_round != 'double_jeopardy_round':
                    row.extend([None, None])
                else:
                    row.append(None)
            logger.warning(f"Issues found while parsing {show_round}. Div tag for round missing. {debugger_dict['url']}")

    return contestants

def show_parser(contestants_df, season_df, url):
    """Download one show page and add its clues and contestants to the tables.

    contestants_df: running contestants table; one row is added per
        contestant entry that holds at least 14 values.
    season_df: running clue table for the season; the rows of the Jeopardy
        and Double Jeopardy rounds are appended.
    url: address of the show page.

    Returns the updated ``(contestants_df, season_df)`` pair. The inputs are
    not modified in place.
    """
    debugger_dict['url'] = url
    show_page = requests.get(url)
    parsed_show = BeautifulSoup(show_page.content, 'lxml')
    show_body = parsed_show.find('div', id='content')
    episode, date = show_body.find('h1').get_text().split(' - ')

    debugger_dict['body_content'] = show_body
    logger = logging.getLogger('Jeopardy Parser')

    contestants = contestants_parser(show_body, episode, date)

    for show_round in ['jeopardy_round', 'double_jeopardy_round']:
        round_df = round_parser(show_body, show_round, contestants, episode, date)
        if not round_df.empty:
            season_df = pd.concat([season_df, round_df], ignore_index=True)

    new_rows = []
    for row in contestants.values():
        # Team entries start with a "<name> (Team X)" label that is not a
        # column. Drop it BEFORE the length check so a 14-item team entry is
        # reported instead of producing a 13-value row that cannot be stored.
        values = row[1:] if '(Team' in row[0] else row
        if len(values) >= 14:
            new_rows.append(values[:14])
        else:
            logger.warning(f"Issues found with contestants info. Contestants' data not parsed. {url}")

    if new_rows:
        # One concat per show; DataFrame.append no longer exists in pandas 2.x
        new_df = pd.DataFrame(new_rows, columns=contestants_df.columns)
        contestants_df = pd.concat([contestants_df, new_df], ignore_index=True)

    return contestants_df, season_df

def season_parser(contestants_df, season_link):
    """Scrape every show linked from a season page and write the results to CSV.

    contestants_df: running contestants table, extended by each show.
    season_link: address of the season page; the text after its last '='
        names the season in the progress message and the output file.

    Writes ./data/jeopardy_season_<season>.csv and rewrites
    ./data/contestants.csv, then returns the updated contestants table.
    """
    season_page = requests.get(season_link)
    parsed_season = BeautifulSoup(season_page.content, 'lxml')
    season_body = parsed_season.find('div', id='content')

    show_links = [
        a['href']
        for a in season_body.find_all('a', href=True)
        if a.text and a['href'].startswith('https://www.j-archive.com/showgame.php?game_id')
    ]

    colnames = ['Category', 'Clue', 'Correct Response', 'Answered Correctly', 'Triple Stumper', 'All Wrong Answers', 'Clue Value', 'Daily Double',
                'Selection Order', 'Episode', 'Date First Aired', 'Round']
    season_df = pd.DataFrame(columns=colnames)

    season_id = season_link.split('=')[-1]
    print(f"Scraping Season {season_id}")

    # Links are processed in the reverse of their order on the page
    for url in tqdm(show_links[::-1], 'Season Parser Progress'):
        contestants_df, season_df = show_parser(contestants_df, season_df, url)
        # Drop the per-show debugging state before moving to the next page
        debugger_dict.clear()

    file_path = './data'

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(file_path, exist_ok=True)

    season_df.to_csv(file_path+'/jeopardy_season_'+season_id+'.csv', index=False)
    contestants_df.to_csv(file_path+'/contestants.csv', index=False)

    return contestants_df

def jeopardy_parser():
    """Scrape every season listed on the j-archive season index.

    Collects the links with visible text from the index page, then hands
    them to ``season_parser`` one by one, starting at the second-to-last
    link and walking back to the first. A single contestants table is
    threaded through all seasons. Returns None.
    """
    main_page = requests.get('https://j-archive.com/listseasons.php')
    parsed_main = BeautifulSoup(main_page.content, 'lxml')
    main_body = parsed_main.find('div', id = 'content')

    season_links = [
        'https://j-archive.com/' + anchor['href']
        for anchor in main_body.find_all('a', href=True)
        if anchor.text
    ]

    contestant_columns = [
        'Contestant Name', 'Occupation', 'Location', 'Episode', 'Date First Aired',
        'Final Jeopardy Category', 'Final Jeopardy Clue', 'Correct Answer',
        'Contestant Written Response', 'Contestant Wager', 'First Break Score',
        'Jeopardy Round Score', 'Double Jeopardy Round Score', 'Final Jeopardy Score',
    ]
    contestants_df = pd.DataFrame(columns=contestant_columns)

    # The last link on the page is skipped; the rest run in reverse page order
    for season_url in season_links[-2::-1]:
        contestants_df = season_parser(contestants_df, season_url)

    return None
        

In [3]:
# Run the full scrape: fetches the season index and parses every season in
# turn (season_parser writes the CSV files under ./data as it goes).
jeopardy_parser()

Season Parser Progress:   0%|          | 0/52 [00:00<?, ?it/s]

Scraping Season 1


Season Parser Progress: 100%|██████████| 52/52 [01:17<00:00,  1.49s/it]
Season Parser Progress:   0%|          | 0/77 [00:00<?, ?it/s]

Scraping Season 2


Season Parser Progress: 100%|██████████| 77/77 [01:45<00:00,  1.37s/it]
Season Parser Progress:   0%|          | 0/141 [00:00<?, ?it/s]

Scraping Season 3


Season Parser Progress: 100%|██████████| 141/141 [03:30<00:00,  1.50s/it]
Season Parser Progress:   0%|          | 0/168 [00:00<?, ?it/s]

Scraping Season 4


Season Parser Progress: 100%|██████████| 168/168 [04:21<00:00,  1.56s/it]
Season Parser Progress:   0%|          | 0/151 [00:00<?, ?it/s]

Scraping Season 5


Season Parser Progress: 100%|██████████| 151/151 [03:54<00:00,  1.55s/it]
Season Parser Progress:   0%|          | 0/169 [00:00<?, ?it/s]

Scraping Season 6


Season Parser Progress: 100%|██████████| 169/169 [03:53<00:00,  1.38s/it]
Season Parser Progress:   0%|          | 0/13 [00:00<?, ?it/s]

Scraping Season superjeopardy


Season Parser Progress: 100%|██████████| 13/13 [00:20<00:00,  1.62s/it]
Season Parser Progress:   0%|          | 0/93 [00:00<?, ?it/s]

Scraping Season 7


Season Parser Progress: 100%|██████████| 93/93 [02:56<00:00,  1.90s/it]
Season Parser Progress:   0%|          | 0/115 [00:00<?, ?it/s]

Scraping Season 8


Season Parser Progress: 100%|██████████| 115/115 [02:56<00:00,  1.53s/it]
Season Parser Progress:   0%|          | 0/106 [00:00<?, ?it/s]

Scraping Season 9


Season Parser Progress: 100%|██████████| 106/106 [02:58<00:00,  1.68s/it]
Season Parser Progress:   0%|          | 0/91 [00:00<?, ?it/s]

Scraping Season 10


Season Parser Progress: 100%|██████████| 91/91 [02:24<00:00,  1.59s/it]
Season Parser Progress:   0%|          | 0/87 [00:00<?, ?it/s]

Scraping Season 11


Season Parser Progress: 100%|██████████| 87/87 [02:16<00:00,  1.56s/it]
Season Parser Progress:   0%|          | 0/178 [00:00<?, ?it/s]

Scraping Season 12


Season Parser Progress: 100%|██████████| 178/178 [04:54<00:00,  1.65s/it]
Season Parser Progress:   0%|          | 0/215 [00:00<?, ?it/s]

Scraping Season 13


Season Parser Progress: 100%|██████████| 215/215 [05:04<00:00,  1.41s/it]
Season Parser Progress:   0%|          | 0/229 [00:00<?, ?it/s]

Scraping Season 14


Season Parser Progress: 100%|██████████| 229/229 [05:00<00:00,  1.31s/it]
Season Parser Progress:   0%|          | 0/229 [00:00<?, ?it/s]

Scraping Season 15


Season Parser Progress: 100%|██████████| 229/229 [04:36<00:00,  1.21s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 16


Season Parser Progress: 100%|██████████| 230/230 [04:40<00:00,  1.22s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 17


Season Parser Progress: 100%|██████████| 230/230 [04:37<00:00,  1.21s/it]
Season Parser Progress:   0%|          | 0/229 [00:00<?, ?it/s]

Scraping Season 18


Season Parser Progress: 100%|██████████| 229/229 [05:50<00:00,  1.53s/it]
Season Parser Progress:   0%|          | 0/228 [00:00<?, ?it/s]

Scraping Season 19


Season Parser Progress: 100%|██████████| 228/228 [04:40<00:00,  1.23s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 20


Season Parser Progress: 100%|██████████| 230/230 [05:38<00:00,  1.47s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 21


Season Parser Progress: 100%|██████████| 230/230 [06:58<00:00,  1.82s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 22


Season Parser Progress: 100%|██████████| 230/230 [06:30<00:00,  1.70s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 23


Season Parser Progress: 100%|██████████| 230/230 [06:47<00:00,  1.77s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 24


Season Parser Progress: 100%|██████████| 230/230 [06:48<00:00,  1.77s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 25


Season Parser Progress: 100%|██████████| 230/230 [06:39<00:00,  1.74s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 26


Season Parser Progress: 100%|██████████| 230/230 [07:25<00:00,  1.94s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 27


Season Parser Progress: 100%|██████████| 230/230 [07:23<00:00,  1.93s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 28


Season Parser Progress: 100%|██████████| 230/230 [07:29<00:00,  1.96s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 29


Season Parser Progress: 100%|██████████| 230/230 [07:33<00:00,  1.97s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 30


Season Parser Progress: 100%|██████████| 230/230 [07:10<00:00,  1.87s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 31


Season Parser Progress: 100%|██████████| 230/230 [08:01<00:00,  2.09s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 32


Season Parser Progress: 100%|██████████| 230/230 [08:08<00:00,  2.12s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 33


Season Parser Progress: 100%|██████████| 230/230 [08:02<00:00,  2.10s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 34


Season Parser Progress: 100%|██████████| 230/230 [07:29<00:00,  1.95s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 35


Season Parser Progress: 100%|██████████| 230/230 [07:44<00:00,  2.02s/it]
Season Parser Progress:   0%|          | 0/190 [00:00<?, ?it/s]

Scraping Season 36


Season Parser Progress: 100%|██████████| 190/190 [07:04<00:00,  2.24s/it]
Season Parser Progress:   0%|          | 0/8 [00:00<?, ?it/s]

Scraping Season goattournament


Season Parser Progress: 100%|██████████| 8/8 [00:15<00:00,  1.88s/it]
Season Parser Progress:   0%|          | 0/230 [00:00<?, ?it/s]

Scraping Season 37


Season Parser Progress: 100%|██████████| 230/230 [08:49<00:00,  2.30s/it]
Season Parser Progress:   0%|          | 0/14 [00:00<?, ?it/s]

Scraping Season 38


Season Parser Progress: 100%|██████████| 14/14 [00:29<00:00,  2.09s/it]
