In [1]:
import os
import re
import pandas as pd
import numpy as np
from tika import parser

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_rows', 100)

In [4]:
os.listdir('../processed_pdf')

['2020.pdf', '2021.pdf', '2023.pdf', '2022.pdf']

## Read Raw Data

In [5]:
raw = parser.from_file('../processed_pdf/2022.pdf')
# print(raw['content'])

In [6]:
raw = raw['content'].split('\n') 

In [7]:
print(len(raw))
raw = [row for row in raw if row != '']
print(len(raw))

6983
5060


## Get Tier

In [8]:
def get_tier(raw):
    tier_dict = {}
    tier_flag = 0
    
    for i, row in enumerate(raw, 0):
        if row.startswith('Tier'):
            key, value = row.split(': ')
            tier_dict[key] = value
            tier_flag = 1
        else:
            if tier_flag == 1 and row != '2022NBA DRAFT GUIDE':
                return tier_dict, raw[i:]

In [9]:
tier_dict, raw = get_tier(raw)

In [10]:
tier_dict

{'Tier 2': 'All-Star Upside',
 'Tier 3': 'High-Leverage Starters',
 'Tier 4': 'Starters and Difference-Makers',
 'Tier 5': 'Late First-Round, Guaranteed Contract Guys',
 'Tier 6': 'Two-Way Contracts'}

## Get Big Board

In [11]:
def get_raw_big_board(raw):
    raw_bb_dict = {}
    bb_flag = 0
    bb_data = []
    
    for i, row in enumerate(raw, 0):
        if row.startswith('RANK'):
            raw_bb_dict['columns'] = row
            bb_flag = 1
        else:
            if bb_flag == 1:
                if row.split(' ')[0] != '100':
                    bb_data.append(row)
                else:
                    bb_data.append(row)
                    raw_bb_dict['data'] = bb_data
                    return raw_bb_dict, raw[i+1:]

In [12]:
raw_bb_dict, raw = get_raw_big_board(raw)

In [13]:
# Minor Edit
raw_bb_dict['data'][-2] = '99 Justin Bean, Utah State F 25 6-7 N/A 7'
raw_bb_dict['data'][-1] = '100 Tyson Etienne, Wichita State G 22 6-2 N/A 7'

In [14]:
def build_big_board(raw_bb_dict):
    columns = [col.replace('.', '').replace(',', '')
               for col in raw_bb_dict['columns'].split(' ')]
    
    data = [row for row in raw_bb_dict['data']
            if len(row.split(' ')[0]) <= 3]
    
    processed_data = []
    for row in data:
        front, back = row.split(', ')
        row_adj = [front.split(' ')[0]] + [' '.join(front.split(' ')[1:])] + \
                  [' '.join(back.split(' ')[:-5])] + back.split(' ')[-5:]
        processed_data.append(row_adj)
    
    return pd.DataFrame(processed_data, columns=columns)

In [15]:
big_board = build_big_board(raw_bb_dict)
big_board

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER
0,1,Chet Holmgren,Gonzaga,C,20,7-0,7-6,2
1,2,Jabari Smith Jr.,Auburn,F,19,6-10,7-1,2
2,3,Paolo Banchero,Duke,F,19,6-10,7-1,2
3,4,Jaden Ivey,Purdue,G,20,6-4,,2
4,5,Bennedict Mathurin,Arizona,W,20,6-6,6-9,3
5,6,Jeremy Sochan,Baylor,W/F,19,6-9,7-0,3
6,7,Dyson Daniels,G League Ignite,G,19,6-7,6-11,3
7,8,Keegan Murray,Iowa,F,21,6-8,6-11,3
8,9,Shaedon Sharpe,Kentucky,W,19,6-5,7-0,3
9,10,Johnny Davis,Wisconsin,G,20,6-6,6-9,4


In [16]:
big_board['PLAYER'] = big_board['PLAYER'].apply(lambda x: 'Dominick Barlow' if x == 'Dom Barlow' else x)

## Get Scouting Report

### Build Raw Dict

In [17]:
raw = [row.replace('\t', ' ') for row in raw]

In [18]:
def identify_scouting_report(raw, big_board):
    sr_dict = {}
    idx_breaks = []
    players_identified = []

    for player in big_board.PLAYER.values:
        for i, row in enumerate(raw, 0):
            if row.endswith(player):
                idx_breaks.append(i)
                players_identified.append(player)
                break
    
    idx_breaks = idx_breaks + [len(raw)]

    for i, player in enumerate(players_identified, 0):
        start = idx_breaks[i]
        end = idx_breaks[i+1]
        sr_dict[player] = [row.lstrip(' ') for row in raw[start:end]]
        
    # report
    print(f'# of players identified: {len(players_identified)}')
    
    return sr_dict

In [57]:
sr_dict = identify_scouting_report(raw, big_board)

# of players identified: 75


In [58]:
def check_categories(sc):
    n_err = 0
    for cat in ['STRENGTHS', 'WEAKNESSES', 'SUMMARY']:
        if cat not in sc:
            print(cat)
            n_err += 1
    return n_err

In [59]:
for player in sr_dict.keys():
    if check_categories(sr_dict[player]) >= 1:
        print(player)

### Clean Scouting Report

In [60]:
def clean_scouting_report(sr):
    school = sr[1].split(' | ')[1]
    idx_breaks = []
    
    cats = ['STRENGTHS', 'WEAKNESSES', 'SUMMARY']
    
    for cat in cats:
        for i, row in enumerate(sr, 0):
            if row == cat:
                idx_breaks.append(i)
                break
                
    idx_breaks = idx_breaks + [len(sr)]
    
    output_dict = {}
    
    for i, cat in enumerate(cats, 0):
        start = idx_breaks[i]
        end = idx_breaks[i+1]
        content = [row.strip('-') for row in sr[start:end] 
                   if row.endswith(' ') or row.endswith('.') or row.endswith('-')]
        content = [row for row in content
                   if re.search(f'^20..-.. {school}', row) is None 
                   and re.search('^20..-..', row) is None
                   and row.strip(' ') != 'NCAA']
        output_dict[cat] = ''.join(content)

    return output_dict

In [61]:
clean_scouting_report(sr_dict['Jeremy Sochan'])

{'STRENGTHS': 'Very well-traveled in his youth. Sochan was born in Oklahoma to an American father and a Polish mother and grew up in England. Played at La Lumiere for high school and was considered a terrific prospect, but during the pandemic, he decided to leave the United States and went to play with a German team Orange Academy. Committed to Baylor as a slight mystery because of that but exploded on the scene as one of the best defensive prospects in the country. Won the Big 12 Sixth Man of the Year award and made the Big-12 All-Freshman team. Has represented the Polish national team at youth and senior levels.Great size for a versatile wing forward at 6-foot-9 with a 7-foot-plus wingspan. Very coordinated athlete with great lateral speed and quickness. Light on his feet. The term “functional” comes to mind. Not wildly explosive off two feet but has some twitch and moves well.Where Sochan’s athletic gifts show up best right now is on defense. Has more upside defensively than any non

In [62]:
for key in sr_dict:
    sr_dict[key] = clean_scouting_report(sr_dict[key])

## Assemble Data

In [65]:
big_board['is_sr'] = big_board['PLAYER'].apply(lambda x: 1 if x in sr_dict.keys() else 0)
big_board = big_board[big_board['is_sr'] == 1]
big_board.shape

(75, 9)

### Add Scouting Report

In [67]:
cats = ['STRENGTHS', 'WEAKNESSES', 'SUMMARY']
for cat in cats:
    big_board[cat] = big_board['PLAYER'].apply(lambda x: sr_dict[x][cat])
    
big_board = big_board.drop('is_sr', axis=1)

### Add Tier Description

In [68]:
big_board['TIER_DESCRIP'] = big_board['TIER'].apply(lambda x: tier_dict[f'Tier {x}'])

In [69]:
big_board.head()

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP
0,1,Chet Holmgren,Gonzaga,C,20,7-0,7-6,2,The consensus top recruit in the 2021 recruiti...,"End of the day, it basically comes down to one...",The word “unique” is overused in scouting circ...,All-Star Upside
1,2,Jabari Smith Jr.,Auburn,F,19,6-10,7-1,2,"Elite consensus five-star, and a top-10 recrui...",While Smith is extremely functional athletical...,Much like the other two prospects in the class...,All-Star Upside
2,3,Paolo Banchero,Duke,F,19,6-10,7-1,2,"A clear five-star, top-five recruit in the cou...",More of a fluid athlete than an explosive one....,Where you are on Banchero depends entirely on ...,All-Star Upside
3,4,Jaden Ivey,Purdue,G,20,6-4,,2,Was a consensus four-star recruit who was a to...,I don’t think I’d call Ivey a negative in term...,Ivey is all about how much you value athletic ...,All-Star Upside
4,5,Bennedict Mathurin,Arizona,W,20,6-6,6-9,3,Originally from Montreal and played basketball...,A lot of rawness in his game on both ends. The...,There is a high floor for Mathurin because of ...,High-Leverage Starters


## Save output

In [70]:
big_board.to_csv('../processed_csv/2022.csv', index=False)