In [3]:
import sys
import os

PACKAGE_PARENT = '..'
SCRIPT_DIR = os.path.abspath('')
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

In [4]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd
import re
import pickle

import importlib
import utils
importlib.reload(utils)
from tqdm import tqdm

from utils import get_parsed, get_parsed_remove_comment, baseball_base_url, ordinal, clean_name

In [5]:
df_all_leaders = pd.read_csv('./all_leaders.csv')

In [6]:
df_all_leaders.head(3)

Unnamed: 0,Statistic,Single-Season,Career,Active,Progressive,Yearly League,Year-by-Year Top-Tens,Category
0,Wins Above Replacement,/leaders/WAR_season.shtml,/leaders/WAR_career.shtml,/leaders/WAR_active.shtml,/leaders/WAR_progress.shtml,/leaders/WAR_leagues.shtml,/leaders/WAR_top_ten.shtml,Player Value Leaderboards
1,WAR Position Players,/leaders/WAR_bat_season.shtml,/leaders/WAR_bat_career.shtml,/leaders/WAR_bat_active.shtml,/leaders/WAR_bat_progress.shtml,/leaders/WAR_bat_leagues.shtml,/leaders/WAR_bat_top_ten.shtml,Batting Leaderboards
2,Offensive WAR,/leaders/WAR_off_season.shtml,/leaders/WAR_off_career.shtml,/leaders/WAR_off_active.shtml,/leaders/WAR_off_progress.shtml,/leaders/WAR_off_leagues.shtml,/leaders/WAR_off_top_ten.shtml,Batting Leaderboards


In [7]:
df_yby = df_all_leaders[['Statistic', 'Year-by-Year Top-Tens', 'Category']]
df_yby.head(2)

Unnamed: 0,Statistic,Year-by-Year Top-Tens,Category
0,Wins Above Replacement,/leaders/WAR_top_ten.shtml,Player Value Leaderboards
1,WAR Position Players,/leaders/WAR_bat_top_ten.shtml,Batting Leaderboards


In [8]:
def process_statistic(table):
    
    def process_grid_box(box):
        table = box.find('table')
        year = table.caption.text
        result = []
        for tr in table.find_all('tr'):
            cell_info = [element.text for element in tr.find_all('td')]
            if cell_info[0].strip() == '':
                cell_info[0] = result[-1][1]
            temp = [year] + cell_info
            result.append(temp)
        return result
    
    all_grid_boxes = table.find_all(attrs={'class': 'data_grid_box'})
    data = []
    for box in all_grid_boxes:
        result = process_grid_box(box)
        data.extend(result)
    
    columns_name = ['Year', 'Rank', 'Name', 'Stat']
    df = pd.DataFrame(data, columns=columns_name)
    return df

In [10]:
all_yby = []
for i in tqdm(range(min(df_yby.shape[0], 1000))):
    
    content = get_parsed(baseball_base_url + df_yby.loc[i, 'Year-by-Year Top-Tens'])
    table = content.find(attrs={'data-entry-type': 'Leaderboards'})
    df = process_statistic(table)
    
    df['Rank'] = df['Rank'].map(lambda x: x[:-1]).astype(int)
    cat = df_yby.loc[i, 'Category']
    cat = " ".join(cat.split(" ")[:-1])
    all_yby.append([(cat, df_yby.loc[i, 'Statistic']), df])

100%|████████████████████████████████████████████████████████████████████████████████| 179/179 [01:55<00:00,  1.56it/s]


In [15]:
testid = 134
print(all_yby[testid][0])
test_df = all_yby[testid][1]
test_df.head(2)

('Fielding', 'Putouts as P')


Unnamed: 0,Year,Rank,Name,Stat
0,2020,1,Lance McCullers Jr.,11
1,2020,1,Pablo Lopez,11


In [17]:
test_df[test_df.loc[:, 'Year'] == '1988'].head(2)

Unnamed: 0,Year,Rank,Name,Stat
368,1988,1,Orel Hershiser,32
369,1988,2,Jack Morris,31


In [18]:
with open('pkl_leaders_year_by_year.pkl', 'wb') as f:
    pickle.dump(all_yby, f)
with open('pkl_leaders_year_by_year.pkl', 'rb') as f:
    all_yby = pickle.load(f)

In [45]:
all_yby[0][1]['Year'].unique().shape

(150,)

In [58]:
wrong_count = 4
columns = ['Question', 'Correct'] + ['Wrong_' + str(i + 1) for i in range(wrong_count)] +  ['tags']
ranking_tag = 'Year by year Ranking'
years_range = range(1980, 2021)
question_list = []

import random
random.seed(1000)
for (cat, stat), df_main in tqdm(all_yby[:min(len(all_yby), 1000)]):
    for year in years_range:
        df_that_year = df_main[df_main['Year'] == str(year)]
        if df_that_year.shape[0] == 0:
            continue
        
        
        # who is number 1st
        rank = 1
        player_or_manager = 'manager' if 'Manager' in cat else 'player'
        question = f'Which of the following {player_or_manager}s is ranked #{rank} ' + \
            f'in the leaderboard for \'{stat} ({cat})\' in {year}?'
        
        correct_list = df_that_year[df_that_year['Rank'] == rank]['Name'].tolist()
        correct = correct_list[0]
        
        # find options
        options_list = df_main[df_main['Rank'] > rank]['Name'].tolist()
        cur = 0
        opt = []
        while len(opt) < wrong_count:
            temp_opt = options_list[cur]
            if temp_opt not in correct_list:
                opt.append(temp_opt)
            cur += 1
        wrongs = opt
        
        # if there is repeating names
        all_options = [clean_name(wrong) for wrong in [correct] + wrongs]
        if len(set(all_options)) != len(all_options):
            continue
        if len(all_options) == 1 + wrong_count:
            tags = [ranking_tag, player_or_manager]
            question_list.append([question] + all_options + [tags])



        # who is ranked highest of the following palyers (basically number one and the rest)

        rank = 2
        player_or_manager = 'manager' if 'Manager' in cat else 'player'
        rank_2_players = df_that_year[df_that_year['Rank'] == rank]['Name'].tolist()
        if len(rank_2_players) == 0:
            continue
        player_name = rank_2_players[0]
        question = f'Which of the following {player_or_manager}s ranks the highest' + \
            f' in the leaderboard for \'{stat} ({cat})\'? in {year}'
        correct = player_name

        # all none ranking 2 players
        wrongs = df_that_year[df_that_year['Rank'] > rank]['Name'].tolist()
        random.shuffle(wrongs)
        wrongs = wrongs[:wrong_count]
        if len(wrongs) < wrong_count:
            continue
        
        all_options = [correct] + wrongs
        if len(set(all_options)) != len(all_options):
            continue
            
        if len(all_options) == 1 + wrong_count:
            tags = [ranking_tag, player_or_manager]
            question_list.append([question] + all_options + [tags])



  0%|                                                                                          | 0/179 [00:00<?, ?it/s][A[A

  1%|▉                                                                                 | 2/179 [00:00<00:16, 10.93it/s][A[A

  2%|█▊                                                                                | 4/179 [00:00<00:15, 11.07it/s][A[A

  3%|██▋                                                                               | 6/179 [00:00<00:15, 11.10it/s][A[A

  4%|███▋                                                                              | 8/179 [00:00<00:15, 11.05it/s][A[A

  6%|████▌                                                                            | 10/179 [00:00<00:14, 11.33it/s][A[A

  7%|█████▍                                                                           | 12/179 [00:01<00:14, 11.19it/s][A[A

  8%|██████▎                                                                          | 14/179 [00:01<00:14, 

 72%|█████████████████████████████████████████████████████████▏                      | 128/179 [00:11<00:04, 11.26it/s][A[A

 73%|██████████████████████████████████████████████████████████                      | 130/179 [00:11<00:04, 11.12it/s][A[A

 74%|██████████████████████████████████████████████████████████▉                     | 132/179 [00:11<00:04, 10.95it/s][A[A

 75%|███████████████████████████████████████████████████████████▉                    | 134/179 [00:11<00:04, 11.06it/s][A[A

 76%|████████████████████████████████████████████████████████████▊                   | 136/179 [00:12<00:03, 11.20it/s][A[A

 77%|█████████████████████████████████████████████████████████████▋                  | 138/179 [00:12<00:03, 11.40it/s][A[A

 78%|██████████████████████████████████████████████████████████████▌                 | 140/179 [00:12<00:03, 11.22it/s][A[A

 79%|███████████████████████████████████████████████████████████████▍                | 142/179 [00:12<00:03, 11

In [60]:
yby_ranking_questions = pd.DataFrame(question_list, columns=columns)

In [61]:
with open('questions_yby_ranking.pkl', 'wb') as f:
    pickle.dump(yby_ranking_questions, f)
with open('questions_yby_ranking.pkl', 'rb') as f:
    yby_ranking_questions = pickle.load(f)

In [62]:
yby_ranking_questions.shape

(13739, 7)