In [1]:
import os
import re
import pandas as pd
import numpy as np
from tika import parser

In [2]:
import warnings
# Suppress every warning raised from this point on in the notebook.
warnings.filterwarnings('ignore')

In [3]:
# Let pandas render up to 100 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 100)

In [4]:
os.listdir('../processed_pdf')

['2020.pdf', '2021.pdf', '2023.pdf', '2022.pdf']

## Read Raw Data

In [5]:
# Parse the 2020 PDF with Tika; the returned object is indexed by 'content'
# in the next cell to get the extracted text.
raw = parser.from_file('../processed_pdf/2020.pdf')
# Debug aid: uncomment to dump the extracted text.
# print(raw['content'])

In [6]:
# Break the extracted text into one string per line. `raw` is rebound from the
# Tika result to a list here, so this cell only works right after the load cell.
raw = raw['content'].split('\n') 

In [7]:
# Drop rows that are exactly the empty string (whitespace-only rows are kept);
# the two prints show the row count before and after.
print(len(raw))
raw = [row for row in raw if row != '']
print(len(raw))

5646
4200


## Get Tier

In [8]:
def get_tier(raw):
    """Collect tier descriptions from the first run of rows starting with 'TIER'.

    Rows are scanned in order. Each row that starts with 'TIER' is split on
    its first ': '; the text after it, cut at the first '…' and stripped of
    surrounding whitespace, is stored under 'TIER 1', 'TIER 2', ... in the
    order the rows are met (the number printed in the row is not used).
    Scanning stops at the first non-'TIER' row seen after at least one
    'TIER' row.

    Parameters
    ----------
    raw : list of str
        Text rows to scan.

    Returns
    -------
    tuple of (dict, list of str)
        The tier descriptions and the rows from the stopping row onward.
        If the input ends while still inside the 'TIER' run the remainder is
        an empty list; if no 'TIER' row exists at all the dict is empty and
        the remainder is a copy of ``raw``.

    Raises
    ------
    ValueError
        If a row starting with 'TIER' contains no ': ' separator.
    """
    tier_dict = {}

    for i, row in enumerate(raw):
        if row.startswith('TIER'):
            # maxsplit=1 keeps descriptions that themselves contain ': '.
            _, value = row.split(': ', 1)
            # Everything from the first '…' on is discarded; strip removes
            # the padding left in front of it.
            tier_dict[f'TIER {len(tier_dict) + 1}'] = value.split('…')[0].strip()
        elif tier_dict:
            return tier_dict, raw[i:]

    # Reached the end of the input without a row closing the tier run.
    remainder = [] if tier_dict else list(raw)
    return tier_dict, remainder

In [9]:
# Pull the tier descriptions out and rebind `raw` to the rows that follow
# them; because `raw` is overwritten, re-running this cell scans the
# already-shortened list.
tier_dict, raw = get_tier(raw)

In [10]:
tier_dict

{'TIER 1': 'VERY REAL SUPERSTAR UPSIDE',
 'TIER 2': 'REALIZABLE ALL-STAR UPSIDE',
 'TIER 3': 'HIGH LEVERAGE STARTERS',
 'TIER 4': 'UPSIDE SWINGS, POTENTIAL STARTERS ',
 'TIER 5': 'ROTATION PLAYERS',
 'TIER 6': 'SECOND ROUND FLIERS AND PRIORITY TWO WAYS',
 'TIER 7': 'LOWER TIER TWO WAYS AND UNDRAFTED FLIERS'}

## Get Big Board

In [11]:
def get_raw_big_board(raw, last_rank='100'):
    """Extract the big-board header row and data rows from the text rows.

    Every row starting with 'Rank' is stored as the header (a later one
    overwrites an earlier one) and switches collection on. From then on each
    row is collected unless it contains ' 4 '. Collection ends with the row
    whose first space-separated token equals ``last_rank``; that row is always
    collected.

    Parameters
    ----------
    raw : list of str
        Text rows to scan.
    last_rank : str, optional
        First token of the final table row. Defaults to '100'.

    Returns
    -------
    tuple of (dict, list of str)
        A dict with the header under 'columns' (absent if no 'Rank' row was
        seen) and the collected rows under 'data', plus the rows after the
        final table row. If the final row is never found, everything collected
        so far is returned with an empty remainder; if no 'Rank' row exists,
        'data' is empty and the remainder is a copy of ``raw``.
    """
    raw_bb_dict = {}
    bb_data = []
    in_table = False

    for i, row in enumerate(raw):
        if row.startswith('Rank'):
            raw_bb_dict['columns'] = row
            in_table = True
            continue
        if not in_table:
            continue

        if row.split(' ')[0] == last_rank:
            bb_data.append(row)
            raw_bb_dict['data'] = bb_data
            return raw_bb_dict, raw[i + 1:]

        # NOTE(review): rows containing ' 4 ' are dropped — presumably a
        # page header/footer line; confirm against the source PDF.
        if ' 4 ' not in row:
            bb_data.append(row)

    # The closing rank never appeared: hand back what was collected instead
    # of falling off the end and returning None.
    raw_bb_dict['data'] = bb_data
    remainder = [] if in_table else list(raw)
    return raw_bb_dict, remainder

In [12]:
# Split off the big-board table and rebind `raw` to the rows after it; as
# `raw` is overwritten, this cell is not repeatable without re-running the
# cells above.
raw_bb_dict, raw = get_raw_big_board(raw)

In [13]:
def find_nth(haystack, needle, n):
    """Return the index of the n-th non-overlapping `needle` in `haystack`.

    Occurrences are counted from 1; any n below 1 behaves like n == 1.
    Returns -1 when there are fewer than n occurrences.
    """
    step = len(needle)
    hops_left = n - 1
    pos = haystack.find(needle)
    # Jump from one hit to the next until enough have been passed or the
    # search runs dry.
    while hops_left > 0 and pos != -1:
        pos = haystack.find(needle, pos + step)
        hops_left -= 1
    return pos

In [14]:
# Replace the third space of every big-board row with ', ' so that
# build_big_board can split each row on ', '. For a row shaped like
# "<rank> <first> <last> <rest>" the third space is the one after the name.
data_adj = []

for row in raw_bb_dict['data']:
    idx = find_nth(row, ' ', 3)
    # NOTE(review): find_nth returns -1 for a row with fewer than three
    # spaces; such a row is not skipped or reported here.
    row_adj = row[:idx] + ', ' + row[idx+1:]
    data_adj.append(row_adj.strip())
    
# Minor Edit
# Hard-coded replacement of the row at list index 11 — presumably needed
# because the three-token name "Kira Lewis Jr." puts the third space inside
# the name; verify the index whenever the input changes.
data_adj[11] = '12 Kira Lewis Jr., Alabama G 19 6-3 175 6-6 20'

# Update
# The adjusted rows overwrite the originals, so re-running this cell applies
# the comma insertion a second time to rows that already have it.
raw_bb_dict['data'] = data_adj

In [15]:
def build_big_board(raw_bb_dict):
    """Turn the raw header and comma-adjusted rows into a DataFrame.

    Column names are the space-separated header tokens with '.' and ','
    removed, minus the last two tokens. Rows whose first token is longer than
    three characters are ignored. Each remaining row is split on ', ' into a
    front part (first token, then the rest joined as one field) and a back
    part (everything but the last six tokens joined as one field, followed by
    those six tokens as separate fields).
    """
    header_tokens = raw_bb_dict['columns'].strip().split(' ')
    columns = []
    for token in header_tokens[:-2]:
        columns.append(token.replace('.', '').replace(',', ''))

    records = []
    for line in raw_bb_dict['data']:
        # Anything with a leading token longer than three characters is not
        # treated as a table row.
        if len(line.split(' ')[0]) > 3:
            continue
        front, back = line.split(', ')
        front_tokens = front.split(' ')
        back_tokens = back.split(' ')
        records.append([
            front_tokens[0],
            ' '.join(front_tokens[1:]),
            ' '.join(back_tokens[:-6]),
            *back_tokens[-6:],
        ])

    return pd.DataFrame(records, columns=columns)

In [16]:
# Build the board DataFrame from the comma-adjusted rows.
big_board = build_big_board(raw_bb_dict)

In [17]:
# Players at which the tier number steps up, in board order. The trailing ''
# keeps tier_breaks[idx] valid after the last real name has been matched.
tier_breaks = ['LaMelo Ball', 'Onyeka Okongwu', 'Kira Lewis Jr.',
               'Tre Jones', 'Ty-Shon Alexander', 'Nate Knight', '']

tier_rec = []
tier = 1
idx = 0

for player in big_board.Player.values:
    # The tier is bumped before it is recorded, so a break player is the
    # first one to receive the new tier number.
    if player == tier_breaks[idx]:
        idx += 1
        tier += 1
    tier_rec.append(tier)

big_board['Tier'] = tier_rec

In [18]:
# Upper-case every column name, then discard the SCOUTING column.
big_board = (
    big_board
    .rename(columns={name: name.upper() for name in big_board.columns})
    .drop(columns='SCOUTING')
)
big_board

Unnamed: 0,RANK,PLAYER,TEAM,POSITION,AGE,HEIGHT,WEIGHT,WINGSPAN,TIER
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,175,6-9,2
1,2,James Wiseman,Memphis,C,19,7-1,245,7-6,2
2,3,Anthony Edwards,Georgia,G,19,6-5,225,6-10,2
3,4,Onyeka Okongwu,USC,F/C,19,6-9,245,7-1,3
4,5,Isaac Okoro,Auburn,W,19,6-6,225,6-9,3
5,6,Deni Avdija,Maccabi Tel Aviv,W/F,19,6-9,225,6-10,3
6,7,Tyrese Haliburton,Iowa State,G,20,6-5,175,6-7,3
7,8,Obi Toppin,Dayton,F/C,22,6-9,230,,3
8,9,Patrick Williams,Florida State,F,19,6-8,225,6-11,3
9,10,Killian Hayes,Ratiopharm Ulm,G,19,6-5,215,6-6,3


## Get Scouting Report

In [19]:
# Turn every tab into a single space in the rows left after the big board.
raw = [row.replace('\t', ' ') for row in raw]

In [20]:
def identify_scouting_report(raw, big_board):
    """Slice `raw` into one block of rows per player found in it.

    Players are taken in the order of ``big_board.PLAYER``. For each one the
    rows after the previous player's start row (or all rows, while nothing
    has been found yet) are searched for "<any character><space><name>"; the
    first hit becomes that player's start row. A player with no hit is
    skipped. Each player's block runs from its start row up to the next
    identified player's start row, or to the end of `raw` for the last one.

    Parameters
    ----------
    raw : list of str
        Text rows containing the scouting reports.
    big_board : pandas.DataFrame
        Must expose a ``PLAYER`` column with the names to look for.

    Returns
    -------
    dict
        Player name -> list of that player's rows with leading spaces
        removed.

    Notes
    -----
    Prints the number of players identified.
    """
    idx_breaks = []
    players_identified = []

    for player in big_board.PLAYER.values:
        # The name is escaped so characters such as '.', '(' or '+' in it are
        # matched literally; the leading '.' is kept as "any one character".
        pattern = re.compile('. ' + re.escape(str(player)))
        # Only rows past the previous start can qualify, so begin there.
        first_row = idx_breaks[-1] + 1 if idx_breaks else 0
        for i in range(first_row, len(raw)):
            if pattern.search(raw[i]):
                idx_breaks.append(i)
                players_identified.append(player)
                break

    # Sentinel end so the last player's block extends to the final row.
    idx_breaks.append(len(raw))

    sr_dict = {}
    for player, start, end in zip(players_identified, idx_breaks, idx_breaks[1:]):
        sr_dict[player] = [row.lstrip(' ') for row in raw[start:end]]

    # report
    print(f'# of players identified: {len(players_identified)}')

    return sr_dict

In [21]:
# Map each board player to the rows of their scouting report.
sr_dict = identify_scouting_report(raw, big_board)

# of players identified: 100


In [22]:
def check_categories(sc):
    """Print each section name absent from the report and return how many.

    The rows of `sc` are joined with single spaces and the names
    'STRENGTHS', 'WEAKNESSES', 'SUMMARY' and 'GRADE' are looked up, in that
    order, as plain substrings of the joined text.
    """
    joined = ' '.join(sc)
    missing = [cat for cat in ['STRENGTHS', 'WEAKNESSES', 'SUMMARY', 'GRADE']
               if cat not in joined]
    for cat in missing:
        print(cat)
    return len(missing)

In [23]:
# Names of players whose report lacks at least one section name;
# check_categories prints the missing names, then the player is printed.
players_removed = []

for player in sr_dict:
    if check_categories(sr_dict[player]) < 1:
        continue
    print(player)
    players_removed.append(player)
    print()

STRENGTHS
WEAKNESSES
SUMMARY
Saben Lee

STRENGTHS
WEAKNESSES
SUMMARY
Nate Hinton

STRENGTHS
WEAKNESSES
SUMMARY
Kenyon Martin

STRENGTHS
WEAKNESSES
SUMMARY
Abdoulaye N’Doye

STRENGTHS
WEAKNESSES
SUMMARY
Nate Knight

STRENGTHS
WEAKNESSES
SUMMARY
Trevelin Queen

STRENGTHS
WEAKNESSES
SUMMARY
Nick Richards

STRENGTHS
WEAKNESSES
SUMMARY
Marko Simonovic

STRENGTHS
WEAKNESSES
SUMMARY
Trent Forrest

STRENGTHS
WEAKNESSES
SUMMARY
Jalen Harris

STRENGTHS
WEAKNESSES
SUMMARY
Kaleb Wesson

STRENGTHS
WEAKNESSES
SUMMARY
Myles Powell

STRENGTHS
WEAKNESSES
SUMMARY
Jordan Ford

STRENGTHS
WEAKNESSES
SUMMARY
Nate Darling

STRENGTHS
WEAKNESSES
SUMMARY
Malik Fitts

STRENGTHS
WEAKNESSES
SUMMARY
Lamine Diane

STRENGTHS
WEAKNESSES
SUMMARY
Ashton Hagans

STRENGTHS
WEAKNESSES
SUMMARY
Paul Eboua

STRENGTHS
WEAKNESSES
SUMMARY
Josh Hall

STRENGTHS
WEAKNESSES
SUMMARY
Kristian Doolittle

STRENGTHS
WEAKNESSES
SUMMARY
Jake Toolson

STRENGTHS
WEAKNESSES
SUMMARY
Freddie Gillespie

STRENGTHS
WEAKNESSES
SUMMARY
Mamadi Diakit

In [24]:
# Remove those players
# Keep only the players that were not flagged, renumbering the rows from 0.
big_board = (
    big_board
    .loc[~big_board['PLAYER'].isin(players_removed)]
    .reset_index(drop=True)
)
big_board

Unnamed: 0,RANK,PLAYER,TEAM,POSITION,AGE,HEIGHT,WEIGHT,WINGSPAN,TIER
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,175,6-9,2
1,2,James Wiseman,Memphis,C,19,7-1,245,7-6,2
2,3,Anthony Edwards,Georgia,G,19,6-5,225,6-10,2
3,4,Onyeka Okongwu,USC,F/C,19,6-9,245,7-1,3
4,5,Isaac Okoro,Auburn,W,19,6-6,225,6-9,3
5,6,Deni Avdija,Maccabi Tel Aviv,W/F,19,6-9,225,6-10,3
6,7,Tyrese Haliburton,Iowa State,G,20,6-5,175,6-7,3
7,8,Obi Toppin,Dayton,F/C,22,6-9,230,,3
8,9,Patrick Williams,Florida State,F,19,6-8,225,6-11,3
9,10,Killian Hayes,Ratiopharm Ulm,G,19,6-5,215,6-6,3


In [25]:
# Remove those players
# Drop the flagged players from sr_dict; the prints show the size before and
# after. pop() with a default (rather than del) keeps this cell re-runnable:
# a name that is already gone is skipped instead of raising KeyError.
print(len(sr_dict))
for player in players_removed:
    sr_dict.pop(player, None)
print(len(sr_dict))

100
60


### Clean Scouting Report

In [26]:
def clean_scouting_report(sr):
    """Split one player's report rows into cleaned text per section.

    The first row starting with 'STRENGTHS:', 'WEAKNESSES:', 'SUMMARY:' and
    'GRADE:' marks where each section begins; a section runs up to the start
    of the next one in that list (the last one runs to the end of `sr`).
    Within a section only rows ending in ' ', '.' or '-' are kept, with
    leading and trailing '-' removed. Rows that start with a "20??-??" prefix,
    rows equal to 'NCAA' once spaces are stripped, and rows made only of
    digits are then discarded. The section label is removed from the first
    surviving row and the rows are concatenated without a separator.

    Parameters
    ----------
    sr : list of str
        Rows of a single scouting report.

    Returns
    -------
    dict
        Section name -> cleaned text; an empty string when no row of the
        section survives the filters.

    Raises
    ------
    IndexError
        If any of the four section labels has no row starting with it.
    """
    cats = ['STRENGTHS', 'WEAKNESSES', 'SUMMARY', 'GRADE']

    idx_breaks = []
    for cat in cats:
        for i, row in enumerate(sr):
            if row.startswith(f'{cat}:'):
                idx_breaks.append(i)
                break
    idx_breaks.append(len(sr))

    # One fixed pattern covers every "20??-?? ..." row. A second pattern
    # built from the school name would only match a subset of these rows and
    # would break on names containing regex metacharacters, so none is used.
    season_row = re.compile('^20..-..')

    output_dict = {}
    for i, cat in enumerate(cats):
        section = sr[idx_breaks[i]:idx_breaks[i + 1]]
        content = [row.strip('-') for row in section
                   if row.endswith((' ', '.', '-'))]
        content = [row for row in content
                   if season_row.search(row) is None
                   and row.strip(' ') != 'NCAA'
                   and not row.strip().isdigit()]
        # A section can end up with no rows at all; leave it as ''.
        if content:
            content[0] = content[0].replace(f'{cat}:', '').lstrip()
        output_dict[cat] = ''.join(content)

    return output_dict

In [27]:
clean_scouting_report(sr_dict['Tyrese Haliburton'])

{'STRENGTHS': 'His feel for the game is just absolutely off the charts. Has a case for having the highest basketball IQ in this draft class. Makes the right decision all the time. Keeps the offense in flow. That starts as a ballhandler. Haliburton is an extremely high-level passer out of all situations. Great with head-man passes in transition. Sees everything and plays unselfishly. The game is not about his numbers despite the fact that he put up great numbers this year. Despite not exactly being a high-level pick-and-roll scorer, he can make every pass in the book as a pick-and-roll passer. He reads the defense exceptionally well. Takes advantage of exactly what he’s presented and has an incredible internal process for finding the right decision. His poise is remarkable for a guy who played both of his seasons essentially as a teenager. There’s a real patience and calm to everything he does. Defenses can’t really speed him up and his size for the position makes it tougher to defend h

In [28]:
for key in sr_dict:
    sr_dict[key] = clean_scouting_report(sr_dict[key])

In [29]:
sr_dict['LaMelo Ball']

{'STRENGTHS': 'Everything starts with Ball’s elite-level feel for the game. He sees and understands the game in a way that few teenagers do. That displays itself most in his passing ability. Ball is tremendous at reading the second and third levels of the defense and making anticipatory passing reads based off how those defenders play in help. He sees nearly every pass that is available, and on top of it has the talent to execute said passes from a wide variety of angles. He’s an elite-level live-dribble passer. He can throw one-handed whip passes from either hand. His ability to hit the cross-corner kickout will translate well given how available that pass tends to be in the NBA. He can hit pocket bounce passes to rollers and weights lobs perfectly. Additionally, despite a relatively wild style of play, Ball tends to relatively limit turnovers because of that basketball IQ and the way that he can control the ball.  Indeed, Ball is a tremendous ballhandler who can create at an extremel

## Assemble Data

### Add Scouting Report

In [30]:
# Copy each player's cleaned STRENGTHS / WEAKNESSES / SUMMARY text onto the
# board, one column per section. The section names are already upper-case,
# so no renaming is needed afterwards.
cats = ['STRENGTHS', 'WEAKNESSES', 'SUMMARY']
for cat in cats:
    big_board[cat] = big_board['PLAYER'].apply(lambda x: sr_dict[x][cat])

### Add Tier Description

In [31]:
tier_dict

{'TIER 1': 'VERY REAL SUPERSTAR UPSIDE',
 'TIER 2': 'REALIZABLE ALL-STAR UPSIDE',
 'TIER 3': 'HIGH LEVERAGE STARTERS',
 'TIER 4': 'UPSIDE SWINGS, POTENTIAL STARTERS ',
 'TIER 5': 'ROTATION PLAYERS',
 'TIER 6': 'SECOND ROUND FLIERS AND PRIORITY TWO WAYS',
 'TIER 7': 'LOWER TIER TWO WAYS AND UNDRAFTED FLIERS'}

In [32]:
# Look up the description of each row's tier number in tier_dict.
big_board['TIER_DESCRIP'] = big_board['TIER'].apply(
    lambda tier_no: tier_dict['TIER ' + str(tier_no)]
)

In [33]:
big_board.head()

Unnamed: 0,RANK,PLAYER,TEAM,POSITION,AGE,HEIGHT,WEIGHT,WINGSPAN,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,175,6-9,2,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,REALIZABLE ALL-STAR UPSIDE
1,2,James Wiseman,Memphis,C,19,7-1,245,7-6,2,Physical tools are off the charts. Has a 7-foo...,Has done a great job working on his physical s...,Wiseman is a prospect that has elite skills an...,REALIZABLE ALL-STAR UPSIDE
2,3,Anthony Edwards,Georgia,G,19,6-5,225,6-10,2,Utterly elite physical tools for the shooting ...,I’m not entirely sure where to put Edwards’ sh...,This is all about tools in the case of Edwards...,REALIZABLE ALL-STAR UPSIDE
3,4,Onyeka Okongwu,USC,F/C,19,6-9,245,7-1,3,All starts with his mentality. This dude plays...,"While he’s effective in the roles noted above,...",It’s hard to find centers who can close games ...,HIGH LEVERAGE STARTERS
4,5,Isaac Okoro,Auburn,W,19,6-6,225,6-9,3,The place to start is he just makes teams bett...,The jump shot is kind of a mess. It’s inconsis...,"Ultimately, everything hinges on the shot. If ...",HIGH LEVERAGE STARTERS


## Save output

In [34]:
# Write the assembled board to CSV, leaving out the DataFrame index.
big_board.to_csv('../processed_csv/2020.csv', index=False)