In [1]:
import os
import re
import pandas as pd
import numpy as np
from tika import parser

In [2]:
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides any pandas warnings raised by the cells further down.
warnings.filterwarnings('ignore')

In [3]:
# Let pandas render up to 100 rows of a frame before truncating the display.
pd.set_option('display.max_rows', 100)

In [4]:
# Show which files are available in the input directory.
os.listdir('../processed_pdf')

['2020.pdf', '2021.pdf', '2023.pdf', '2022.pdf']

## Read Raw Data

In [5]:
# Parse the 2023 PDF with tika. The extracted text is read from the
# 'content' entry of the result in the next cell.
raw = parser.from_file('../processed_pdf/2023.pdf')
# print(raw['content'])

In [6]:
# Split the extracted text into a list of lines.
# NOTE: this rebinds `raw` from the parser result to a list of strings, so the
# cell cannot be re-run on its own without re-running the parse cell first.
raw = raw['content'].split('\n') 

In [7]:
# Drop empty lines, printing the line count before and after the filter.
print(len(raw))
raw = [row for row in raw if row != '']
print(len(raw))

9401
7066


## Get Tier

In [8]:
def get_tier(raw):
    """Extract the tier legend from the first run of tier rows in ``raw``.

    A tier row is a row that starts with ``'Tier'`` and contains ``': '``;
    the text before the first ``': '`` becomes the key and everything after
    it the value (so a description may itself contain ``': '``). Scanning
    stops at the first non-tier row that follows at least one tier row.

    Parameters
    ----------
    raw : list of str
        Text rows to scan.

    Returns
    -------
    tuple of (dict, list of str)
        The ``{tier key: description}`` mapping and the rows of ``raw`` from
        the first row after the tier run onwards (an empty list when the tier
        rows run to the end of ``raw``).

    Raises
    ------
    ValueError
        If ``raw`` contains no tier row at all.
    """
    tier_dict = {}

    for i, row in enumerate(raw):
        # partition splits on the FIRST ': ' only, so colons inside the
        # description cannot break the key/value unpacking.
        key, sep, value = row.partition(': ')
        if row.startswith('Tier') and sep:
            tier_dict[key] = value
        elif tier_dict:
            # First row after the tier run: everything from here on is
            # handed back to the caller untouched.
            return tier_dict, raw[i:]

    if not tier_dict:
        raise ValueError("no row of the form 'Tier <name>: <description>' found")

    # The tier rows ran to the end of the input; nothing is left over.
    return tier_dict, raw[len(raw):]

In [9]:
# Pull the tier legend out of the text; `raw` is rebound to the rows that
# follow it, so this cell is not idempotent.
tier_dict, raw = get_tier(raw)

In [10]:
# Display the extracted tier legend.
tier_dict

{'Tier Vic': 'Victor Wembanyama',
 'Tier 1': 'Projectable All-NBA Upside',
 'Tier 2': 'Projectable All-Star Upside',
 'Tier 3': 'High-Leverage Starters',
 'Tier 4': 'Starter/All-Star Tool Swings',
 'Tier 6': 'Second-Round Guarantee Swings',
 'Tier 7': 'Priority Two-Ways',
 'Tier 8': 'Two-Ways, Stashes, Exhibit 10s',
 'Tier 5': 'Rotation Players and Upside Swings'}

## Get Big Board

In [11]:
def get_raw_big_board(raw, last_rank='100'):
    """Slice the big-board table (header row plus data rows) out of ``raw``.

    Collection starts after the first row that begins with ``'RANK'`` and
    ends with the first row whose leading space-separated token equals
    ``last_rank``. Any later row that begins with ``'RANK'`` replaces the
    stored header and is not added to the data rows.

    Parameters
    ----------
    raw : list of str
        Text rows to scan.
    last_rank : str or int, optional
        Rank token that marks the final row of the board. Defaults to
        ``'100'``, the value that used to be hard-coded.

    Returns
    -------
    tuple of (dict, list of str)
        ``{'columns': <header row>, 'data': [<rows>]}`` and the rows of
        ``raw`` that follow the final board row.

    Raises
    ------
    ValueError
        If no header row is found, or no row with rank ``last_rank`` follows
        it.
    """
    last_rank = str(last_rank)
    raw_bb_dict = {}
    bb_data = []

    for i, row in enumerate(raw):
        if row.startswith('RANK'):
            raw_bb_dict['columns'] = row
        elif 'columns' in raw_bb_dict:
            # Every row after the header is kept verbatim; non-table rows are
            # left in for the caller to filter.
            bb_data.append(row)
            if row.split(' ')[0] == last_rank:
                raw_bb_dict['data'] = bb_data
                return raw_bb_dict, raw[i+1:]

    raise ValueError(
        f"big board not found: need a row starting with 'RANK' followed by "
        f"a row whose first token is {last_rank!r}"
    )

In [12]:
# Cut the big-board rows out of the text; `raw` is rebound to the rows that
# follow the board, so this cell is not idempotent.
raw_bb_dict, raw = get_raw_big_board(raw)

In [13]:
def build_big_board(raw_bb_dict):
    """Turn the raw big-board rows into a DataFrame.

    ``raw_bb_dict['columns']`` is split on single spaces and each token has
    its '.' and ',' characters removed to form the column names.

    Rows in ``raw_bb_dict['data']`` whose first space-separated token is
    longer than three characters are discarded. Each remaining row is split
    on ', ': the part before it gives the first token and the rest of that
    part as two fields; the part after it gives everything but its last five
    tokens as one field, followed by those five tokens.
    """
    columns = []
    for token in raw_bb_dict['columns'].split(' '):
        columns.append(token.replace('.', '').replace(',', ''))

    records = []
    for line in raw_bb_dict['data']:
        # Skip rows that do not start with a short (<= 3 character) token.
        if len(line.split(' ')[0]) > 3:
            continue

        front, back = line.split(', ')
        front_tokens = front.split(' ')
        back_tokens = back.split(' ')

        record = [front_tokens[0], ' '.join(front_tokens[1:]), ' '.join(back_tokens[:-5])]
        record.extend(back_tokens[-5:])
        records.append(record)

    return pd.DataFrame(records, columns=columns)

In [14]:
# Build the big-board frame from the raw rows and display it.
big_board = build_big_board(raw_bb_dict)
big_board

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER
0,1,Victor Wembanyama,Metropolitans 92,C,19,7-4,8-0,Vic
1,2,Scoot Henderson,G League Ignite,G,19,6-2,,1
2,3,Cam Whitmore,Villanova,W,18,6-6,6-9,2
3,4,Brandon Miller,Alabama,W/F,20,6-8,,2
4,5,Amen Thompson,Overtime Elite,G,20,6-6,7-0,2
5,6,Jarace Walker,Houston,F,19,6-7,7-3,3
6,7,Taylor Hendricks,UCF,W/F,19,6-8,7-1,3
7,8,Anthony Black,Arkansas,G/W,19,6-6,6-8,3
8,9,Ausar Thompson,Overtime Elite,W,20,6-6,7-0,3
9,10,Dereck Lively II,Duke,C,19,7-1,,4


In [15]:
# Manual spelling corrections for three board names (presumably so they match
# the spelling used in the scouting-report headings -- confirm against the PDF).
name_fixes = {
    'G.G. Jackson': 'Gregory “G.G.” Jackson',
    "Sir'Jabari Rice": "Sir’Jabari Rice",
    'Tosan Evbuomwam': 'Tosan Evbuomwan',
}
big_board['PLAYER'] = big_board['PLAYER'].apply(lambda name: name_fixes.get(name, name))

## Get Scouting Report

### Build Raw Dict

In [16]:
# Replace every tab character in the remaining rows with a single space.
raw = [row.replace('\t', ' ') for row in raw]

In [17]:
def identify_scouting_report(raw, big_board):
    """Split ``raw`` into one block of rows per player on the big board.

    For each name in ``big_board.PLAYER`` the first row of ``raw`` that ends
    with that name is taken as the start of the player's report. The report
    runs up to (not including) the next report start in document order, or to
    the end of ``raw`` for the last one. Players with no matching row are
    skipped.

    Section ends are resolved by position in ``raw`` rather than by board
    order, so a heading that appears out of board order no longer produces an
    empty section for one player and an over-long one for another.

    Parameters
    ----------
    raw : list of str
        Text rows that follow the big board.
    big_board : pandas.DataFrame
        Frame with a ``PLAYER`` column of names to look for.

    Returns
    -------
    dict
        ``{player: [rows]}`` in board order, with leading spaces stripped
        from every row.
    """
    idx_breaks = []
    players_identified = []

    for player in big_board.PLAYER.values:
        for i, row in enumerate(raw):
            if row.endswith(player):
                idx_breaks.append(i)
                players_identified.append(player)
                break

    # Map each section start to the next start in DOCUMENT order; the last
    # section ends at the end of the text.
    ordered_breaks = sorted(set(idx_breaks)) + [len(raw)]
    next_break = dict(zip(ordered_breaks, ordered_breaks[1:]))

    sr_dict = {}
    for player, start in zip(players_identified, idx_breaks):
        end = next_break[start]
        sr_dict[player] = [row.lstrip(' ') for row in raw[start:end]]

    # report
    print(f'# of players identified: {len(players_identified)}')

    return sr_dict

In [18]:
# Group the remaining rows into one raw report per player on the board.
sr_dict = identify_scouting_report(raw, big_board)

# of players identified: 75


In [19]:
def check_categories(sc):
    """Print each expected section heading absent from ``sc`` and return how many are missing.

    The headings checked, in order, are 'BACKGROUND', 'STRENGTHS',
    'WEAKNESSES' and 'SUMMARY'; membership is tested with ``in``.
    """
    expected = ('BACKGROUND', 'STRENGTHS', 'WEAKNESSES', 'SUMMARY')
    missing = [heading for heading in expected if heading not in sc]
    for heading in missing:
        print(heading)
    return len(missing)

In [20]:
# Print the name of every player whose raw report lacks at least one heading
# (check_categories prints the missing headings themselves first).
for player, report in sr_dict.items():
    n_missing = check_categories(report)
    if n_missing >= 1:
        print(player)

### Clean Scouting Report

In [21]:
def clean_scouting_report(sr):
    """Collapse one player's raw report rows into text per section.

    The school is read from the second row of ``sr`` (the second ' | '
    separated field). The first row equal to each of 'BACKGROUND',
    'STRENGTHS', 'WEAKNESSES' and 'SUMMARY' starts that section; a section
    runs up to the next heading that is present, or to the end of ``sr``.

    Within a section only rows ending in ' ', '.' or '-' are kept, leading
    and trailing '-' characters are stripped, rows matching
    ``^20..-.. <school>`` are dropped, and the remainder is concatenated
    without a separator.

    Parameters
    ----------
    sr : list of str
        Raw report rows for a single player.

    Returns
    -------
    dict
        ``{section heading: text}`` for all four headings; a heading that
        does not occur in ``sr`` maps to ``''``.

    Raises
    ------
    IndexError
        If ``sr`` has fewer than two rows or its second row has no ' | '.
    """
    school = sr[1].split(' | ')[1]
    cats = ['BACKGROUND', 'STRENGTHS', 'WEAKNESSES', 'SUMMARY']

    # Escape the school so names containing regex metacharacters
    # (parentheses, '+', ...) are matched literally.
    stat_row = re.compile(f'^20..-.. {re.escape(school)}')

    # Position of the first occurrence of each heading.
    heading_idx = {}
    for i, row in enumerate(sr):
        if row in cats and row not in heading_idx:
            heading_idx[row] = i

    # Only headings that were actually found take part in the slicing, so a
    # missing heading cannot shift the remaining sections onto the wrong key.
    present = [cat for cat in cats if cat in heading_idx]
    bounds = [heading_idx[cat] for cat in present] + [len(sr)]

    output_dict = {cat: '' for cat in cats}

    for cat, start, end in zip(present, bounds, bounds[1:]):
        content = [row.strip('-') for row in sr[start:end]
                   if row.endswith((' ', '.', '-'))]
        content = [row for row in content if stat_row.search(row) is None]
        output_dict[cat] = ''.join(content)

    return output_dict

In [22]:
# Spot-check the cleaner on a single player's raw report.
clean_scouting_report(sr_dict['Andre Jackson'])

{'BACKGROUND': 'Parents are Tricia and Andre Sr. Both of his parents played basketball in college at the community college level. Has two younger siblings. Played at Albany Academy and was wildly successful. Has essentially always been a winner and played winning basketball. Jackson led his school to three straight state title games in New York from 2017 to 2019, winning in 2017 and 2019. Was considered a borderline three-star/four-star recruit until the summer before his senior season, where he played with City Rocks AAU team on the Nike EYBL circuit and played exceedingly well at Peach Jam. That blew him up into a genuinely high-end recruit. His senior season tournament was cancelled due to the COVID-19 pandemic, but his team was again ranked as the  No. 1 team in the state in his classification. He was a do-it-all wing who averaged 18 points, 10 rebounds, five assists and three steals in his senior season while shooting an absurd 68 percent from the field. Was named his classificati

In [23]:
# Replace each player's raw rows with the cleaned per-section dict.
# NOTE: this overwrites sr_dict in place, so the cell is not idempotent --
# a second run would pass the cleaned dicts back into clean_scouting_report.
for key in sr_dict:
    sr_dict[key] = clean_scouting_report(sr_dict[key])

## Assemble Data

In [24]:
# Flag the players that have a scouting report (1) or not (0), then keep only
# the flagged rows. The explicit .copy() makes the filtered frame independent
# of the original, so the column assignments in the following cells write to
# a real frame rather than to a possible view (chained assignment).
big_board['is_sr'] = big_board['PLAYER'].isin(list(sr_dict.keys())).astype(int)
big_board = big_board[big_board['is_sr'] == 1].copy()
big_board.shape

(75, 9)

### Add Scouting Report

In [25]:
# Add one text column per report section, looked up by player name, then
# drop the helper flag column.
cats = ['BACKGROUND', 'STRENGTHS', 'WEAKNESSES', 'SUMMARY']
for cat in cats:
    section_text = big_board['PLAYER'].map(lambda name: sr_dict[name][cat])
    big_board[cat] = section_text

big_board = big_board.drop(columns=['is_sr'])

### Add Tier Description

In [26]:
# Look up each row's tier description under the key 'Tier <TIER value>'.
# Raises KeyError if a TIER value has no matching entry in tier_dict.
big_board['TIER_DESCRIP'] = big_board['TIER'].apply(lambda x: tier_dict[f'Tier {x}'])

In [27]:
# Preview the first rows of the assembled table.
big_board.head()

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,BACKGROUND,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP
0,1,Victor Wembanyama,Metropolitans 92,C,19,7-4,8-0,Vic,Wembanyama has long been considered one of the...,Wembanyama has elite length for a center. He s...,"There are a few, but not many. Wembanyama has ...",Wembanyama is the highest-upside prospect to e...,Victor Wembanyama
1,2,Scoot Henderson,G League Ignite,G,19,6-2,,1,Real name is Sterling. Went by Scoota at one p...,Elite athlete for the lead guard position. He’...,The real one is that Henderson is a bit small ...,"In many other drafts, Henderson would be the N...",Projectable All-NBA Upside
2,3,Cam Whitmore,Villanova,W,18,6-6,6-9,2,Parents are Beth and Myron. Father was in the ...,"Elite intersection of size, frame, athleticism...",Whitmore has all the physical tools to be an N...,Whitmore is a pure upside play. If you buy int...,Projectable All-Star Upside
3,4,Brandon Miller,Alabama,W/F,20,6-8,,2,Parents are Darrell and Yolanda. Darrell was a...,Great size for a floor-spacing wing at 6-foot-...,The critical concern here is strength in Mille...,It’s hard to look past Miller’s combination of...,Projectable All-Star Upside
4,5,Amen Thompson,Overtime Elite,G,20,6-6,7-0,2,Twin brother of fellow 2023 prospect Ausar Tho...,Amen is the elite of the elite athletically. W...,The critical question here is shooting. Amen h...,Thompson has all the physical tools you could ...,Projectable All-Star Upside


## Save output

In [28]:
# Write the assembled table to CSV, leaving out the index column.
big_board.to_csv('../processed_csv/2023.csv', index=False)