In [1]:
import os
import re
import pandas as pd
import numpy as np
from tika import parser

In [2]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and parsing
# warnings raised by any later cell.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Let pandas print up to 100 rows before truncating the display
# (presumably so a full 100-row board is shown untruncated; confirm).
pd.set_option('display.max_rows', 100)

In [4]:
# List the files available for parsing. os.listdir returns entries in
# arbitrary order, which is why the listing below is not sorted.
os.listdir('../processed_pdf')

['2020.pdf', '2021.pdf', '2023.pdf', '2022.pdf']

## Read Raw Data

In [5]:
# Extract the 2020 PDF with tika's parser. The result is indexed with
# ['content'] in the following cell, so `raw` is a mapping at this point.
raw = parser.from_file('../processed_pdf/2020.pdf')
# Uncomment to inspect the extracted text:
# print(raw['content'])

In [6]:
# Replace the parse result with its text content split into lines.
# NOTE(review): `raw` changes from a mapping to a list here, so this cell
# cannot be re-run on its own without re-running the parse cell first.
raw = raw['content'].split('\n')

In [7]:
# Drop rows that are exactly the empty string, printing the row count before
# and after. Whitespace-only rows are not removed by this comparison.
print(len(raw))
raw = [row for row in raw if row != '']
print(len(raw))

5646
4200


## Get Tier

In [8]:
def get_tier(raw):
    """Collect the consecutive ``TIER ...`` heading rows and return the rest.

    Rows are scanned in order. Every row that starts with ``'TIER'`` is split
    on its first ``': '``; the text after it, cut at the first ``'…'``
    character, is stored under ``'TIER <k>'`` where ``k`` counts the headings
    found so far (starting at 1, independent of any number in the row itself).
    The stored text is not stripped, so whitespace before the ``'…'`` leader
    is preserved. Scanning stops at the first non-heading row that follows at
    least one heading.

    Parameters
    ----------
    raw : list of str
        Text rows to scan.

    Returns
    -------
    tuple of (dict, list of str)
        The tier mapping and the rows starting at the first row after the
        headings. When the headings run to the end of ``raw`` the remainder is
        ``[]``; when no heading exists at all the mapping is empty and a copy
        of ``raw`` is returned, so callers can always unpack the result.

    Raises
    ------
    ValueError
        If a heading row contains no ``': '`` separator.
    """
    tier_dict = {}

    for i, row in enumerate(raw):
        if row.startswith('TIER'):
            # maxsplit=1 keeps descriptions that themselves contain ': ' intact.
            _, value = row.split(': ', 1)
            tier_dict[f'TIER {len(tier_dict) + 1}'] = value.split('…')[0]
        elif tier_dict:
            # First row after the block of headings: everything from here on
            # belongs to the rest of the document.
            return tier_dict, raw[i:]

    # Reached the end without a trailing non-heading row.
    return tier_dict, ([] if tier_dict else list(raw))

In [9]:
# Split off the tier headings; `raw` is rebound to the rows that follow them,
# so re-running this cell operates on the already-trimmed list.
tier_dict, raw = get_tier(raw)

In [10]:
# Display the parsed tier mapping.
tier_dict

{'TIER 1': 'VERY REAL SUPERSTAR UPSIDE',
 'TIER 2': 'REALIZABLE ALL-STAR UPSIDE',
 'TIER 3': 'HIGH LEVERAGE STARTERS',
 'TIER 4': 'UPSIDE SWINGS, POTENTIAL STARTERS ',
 'TIER 5': 'ROTATION PLAYERS',
 'TIER 6': 'SECOND ROUND FLIERS AND PRIORITY TWO WAYS',
 'TIER 7': 'LOWER TIER TWO WAYS AND UNDRAFTED FLIERS'}

## Get Big Board

In [11]:
def get_raw_big_board(raw, last_rank='100'):
    """Extract the header row and the ranked rows of the big board.

    Rows before the first row starting with ``'Rank'`` are ignored. Each row
    starting with ``'Rank'`` (re)sets the header; if it occurs more than once
    the last occurrence wins. After a header has been seen, rows are collected
    until one whose first space-separated token equals ``last_rank``; that row
    is included and ends the board.

    Parameters
    ----------
    raw : list of str
        Text rows to scan.
    last_rank : str or int, optional
        First token of the row that terminates the board. Defaults to
        ``'100'``.

    Returns
    -------
    tuple of (dict, list of str)
        ``{'columns': <header row>, 'data': [<rows>]}`` and the rows that
        follow the terminating row.

    Raises
    ------
    ValueError
        If no header row or no terminating row is found. Failing here gives a
        descriptive message instead of the ``None``-unpacking error callers
        would otherwise hit.
    """
    raw_bb_dict = {}
    bb_data = []
    header_seen = False
    final_token = str(last_rank)

    for i, row in enumerate(raw):
        if row.startswith('Rank'):
            raw_bb_dict['columns'] = row
            header_seen = True
            continue
        if not header_seen:
            continue

        if row.split(' ')[0] == final_token:
            # The terminating row is always kept, whatever else it contains.
            bb_data.append(row)
            raw_bb_dict['data'] = bb_data
            return raw_bb_dict, raw[i + 1:]

        # Rows containing a free-standing ' 4 ' are dropped.
        # NOTE(review): presumably page furniture in the 2020 PDF; confirm
        # this filter against the other years before reusing it.
        if ' 4 ' not in row:
            bb_data.append(row)

    if not header_seen:
        raise ValueError("no row starting with 'Rank' was found; cannot locate the big board")
    raise ValueError(
        f"no row whose first token is {final_token!r} was found after the 'Rank' header"
    )

In [12]:
# Pull out the board's header row and ranked rows; `raw` is rebound to the
# rows that follow the board.
raw_bb_dict, raw = get_raw_big_board(raw)

In [13]:
def find_nth(haystack, needle, n):
    """Return the index of the ``n``-th non-overlapping ``needle`` in ``haystack``.

    Occurrences are counted from 1. The result is ``-1`` when ``haystack``
    holds fewer than ``n`` occurrences. For ``n`` of 1 or less the index of
    the first occurrence (or ``-1``) is returned.
    """
    position = haystack.find(needle)
    hops_left = n - 1

    # Each hop resumes the search just past the previous match, so matches
    # never overlap; stop early once the needle can no longer be found.
    while hops_left > 0 and position != -1:
        position = haystack.find(needle, position + len(needle))
        hops_left -= 1

    return position

In [14]:
# NOTE(review): this cell reads raw_bb_dict['data'] and then overwrites it, so
# running it a second time inserts a second ', ' into every row.
data_adj = []

# Replace the third space of every row with ', '. This assumes the first three
# tokens are rank, first name and last name (TODO confirm), which puts the
# comma between the player and the team.
for row in raw_bb_dict['data']:
    idx = find_nth(row, ' ', 3)
    # NOTE(review): find_nth returns -1 for a row with fewer than three
    # spaces; the slices below then build a malformed row instead of raising.
    row_adj = row[:idx] + ', ' + row[idx+1:]
    data_adj.append(row_adj.strip())

# Manual override of the row at index 11 with a hand-written line whose player
# name has three words ("Kira Lewis Jr."); presumably the third-space rule put
# the comma in the wrong place for it. NOTE(review): the index is hard-coded,
# so verify it still points at this player if the upstream filtering changes.
data_adj[11] = '12 Kira Lewis Jr., Alabama G 19 6-3 175 6-6 20'

# Store the adjusted rows back on the dict (replaces the original rows).
raw_bb_dict['data'] = data_adj

In [15]:
def build_big_board(raw_bb_dict):
    """Turn the raw big-board rows into a DataFrame.

    Column names come from the header row: it is stripped, split on single
    spaces, ``'.'`` and ``','`` are removed from each token, and the last two
    tokens are discarded.

    Only rows whose first space-separated token is at most three characters
    long are parsed. Each such row is split on its first ``', '``:

    * before the separator: the first token is the rank, the remaining tokens
      joined by spaces are the player;
    * after the separator: the last six tokens are kept as individual fields
      and everything before them, joined by spaces, is the team.

    Splitting on the first ``', '`` only means a later ``', '`` in the row no
    longer aborts parsing; it stays part of the team text.

    Parameters
    ----------
    raw_bb_dict : dict
        Mapping with a ``'columns'`` header string and a ``'data'`` list of
        row strings.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, using the derived column names.

    Raises
    ------
    ValueError
        If a parsed row contains no ``', '`` separator; the message names the
        offending row.
    """
    columns = [col.replace('.', '').replace(',', '')
               for col in raw_bb_dict['columns'].strip().split(' ')][:-2]

    processed_data = []
    for row in raw_bb_dict['data']:
        # Skip rows whose leading token is too long to be a rank.
        if len(row.split(' ')[0]) > 3:
            continue

        front, sep, back = row.partition(', ')
        if not sep:
            raise ValueError(
                f"big board row has no ', ' between player and team: {row!r}"
            )

        rank, *name_tokens = front.split(' ')
        back_tokens = back.split(' ')
        # Splitting on single spaces keeps an empty token for a missing value
        # (two adjacent spaces), so the six trailing fields stay aligned.
        processed_data.append(
            [rank, ' '.join(name_tokens), ' '.join(back_tokens[:-6])]
            + back_tokens[-6:]
        )

    return pd.DataFrame(processed_data, columns=columns)

In [16]:
# Build the DataFrame from the adjusted rows and display it as the cell output.
big_board = build_big_board(raw_bb_dict)
big_board

Unnamed: 0,Rank,Player,Team,Position,Age,Height,Weight,Wingspan,Scouting
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,175,6-9,6
1,2,James Wiseman,Memphis,C,19,7-1,245,7-6,7
2,3,Anthony Edwards,Georgia,G,19,6-5,225,6-10,8
3,4,Onyeka Okongwu,USC,F/C,19,6-9,245,7-1,10
4,5,Isaac Okoro,Auburn,W,19,6-6,225,6-9,11
5,6,Deni Avdija,Maccabi Tel Aviv,W/F,19,6-9,225,6-10,12
6,7,Tyrese Haliburton,Iowa State,G,20,6-5,175,6-7,14
7,8,Obi Toppin,Dayton,F/C,22,6-9,230,,15
8,9,Patrick Williams,Florida State,F,19,6-8,225,6-11,16
9,10,Killian Hayes,Ratiopharm Ulm,G,19,6-5,215,6-6,17
