In [1]:
import os
import re
import pandas as pd
import numpy as np
from tika import parser

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_rows', 100)

In [4]:
os.listdir('../processed_pdf')

['2020.pdf', '2021.pdf', '2023.pdf', '2022.pdf']

## Read Raw Data

In [5]:
# Parse the 2021 draft-guide PDF with tika. The extracted text is read from the
# 'content' entry of the result in the next cell.
raw = parser.from_file('../processed_pdf/2021.pdf')
# Debug aid: uncomment to inspect the full extracted text.
# print(raw['content'])

In [6]:
# Turn the extracted text into a list of rows, one per line.
# NOTE(review): `raw` is rebound from the parser result to a list of str here,
# so this cell cannot be re-run without re-running the cell above first.
raw = raw['content'].split('\n') 

In [7]:
# Discard completely empty rows; the row count is printed before and after.
print(len(raw))
raw = list(filter(lambda line: line != '', raw))
print(len(raw))

9526
7502


## Get Tier

In [8]:
def get_tier(raw):
    """Extract the tier legend (``'Tier N: description'`` rows) from the guide.

    Rows before the first tier entry are skipped. The legend ends at the first
    following row that is not a tier entry (a row that starts with ``'Tier'``
    but has no ``': '`` separator is not treated as an entry).

    Parameters
    ----------
    raw : list of str
        Text rows of the guide, in document order.

    Returns
    -------
    tuple of (dict, list of str)
        The dict maps each tier label (text before the first ``': '``) to its
        description (text after it), both stripped of surrounding whitespace.
        The list holds the rows starting at the first row after the legend; it
        is empty when the legend runs to the end of ``raw``.

    Raises
    ------
    ValueError
        If ``raw`` contains no tier entry at all.
    """
    tier_dict = {}

    for i, row in enumerate(raw):
        if row.startswith('Tier'):
            # partition() splits on the first ': ' only, so a description that
            # itself contains ': ' no longer breaks the unpacking.
            key, sep, value = row.partition(': ')
            if sep:
                tier_dict[key.strip()] = value.strip()
                continue
        if tier_dict:
            # First non-tier row after the legend: hand back the remainder.
            return tier_dict, raw[i:]

    if not tier_dict:
        raise ValueError("no 'Tier N: description' rows found in raw")
    # The legend reached the end of the input; nothing is left to return.
    return tier_dict, []

In [9]:
tier_dict, raw = get_tier(raw)

In [10]:
tier_dict

{'Tier 1': 'Superstar Upside',
 'Tier 2': 'All-Star Upside',
 'Tier 3': 'High Leverage Starters',
 'Tier 4': 'Potential Starters',
 'Tier 5': 'Rotation Players',
 'Tier 6': '2nd Rd. Fliers ',
 'Tier 7': 'Undrafted Fliers ',
 'Tier 8': 'Exhibit 10'}

## Get Big Board

In [11]:
def get_raw_big_board(raw, last_rank='100',
                      page_header='2 0 2 1  N B A  D R A F T  G U I D E 4 '):
    """Collect the raw big-board rows that follow the ``'RANK'`` header row.

    Collection starts after the first row beginning with ``'RANK'`` and stops,
    inclusively, at the first row whose first space-separated token equals
    ``last_rank``. Any later row beginning with ``'RANK'`` replaces the stored
    header and is not added to the data. Rows equal to ``page_header`` are
    skipped.

    Parameters
    ----------
    raw : list of str
        Text rows of the guide, in document order.
    last_rank : str or int, optional
        Rank label of the final big-board row (default ``'100'``).
    page_header : str, optional
        Exact text of the page-header row to drop from the collected data.

    Returns
    -------
    tuple of (dict, list of str)
        The dict has ``'columns'`` (the header row) and ``'data'`` (the
        collected rows). The list holds the rows after the final board row.

    Raises
    ------
    ValueError
        If no header row is found, or no row with ``last_rank`` follows it.
    """
    last_rank = str(last_rank)
    raw_bb_dict = {}
    bb_data = []
    in_board = False

    for i, row in enumerate(raw):
        if row.startswith('RANK'):
            raw_bb_dict['columns'] = row
            in_board = True
            continue
        if not in_board:
            continue
        if row.split(' ')[0] == last_rank:
            # The final ranked row closes the board.
            bb_data.append(row)
            raw_bb_dict['data'] = bb_data
            return raw_bb_dict, raw[i + 1:]
        if row != page_header:
            bb_data.append(row)

    if not in_board:
        raise ValueError("no row starting with 'RANK' found in raw")
    raise ValueError(f"no row with rank {last_rank!r} found after the 'RANK' header")

In [12]:
raw_bb_dict, raw = get_raw_big_board(raw)

In [13]:
def adjust_name(n):
    """Lower-case every character after the first in each word of ``n``.

    ``n`` is stripped and split on single spaces. The first character of each
    word is kept as-is and the rest of the word is lower-cased, e.g.
    ``'ROBINSON-EARL'`` becomes ``'Robinson-earl'``. Empty pieces produced by
    consecutive spaces (or by an empty input) are dropped, so they no longer
    raise ``IndexError`` and the result is single-spaced.

    Parameters
    ----------
    n : str
        Name to normalise.

    Returns
    -------
    str
        The normalised name; ``''`` when ``n`` is empty or only whitespace.
    """
    comps = [comp for comp in n.strip().split(' ') if comp]
    return ' '.join(f'{comp[0]}{comp[1:].lower()}' for comp in comps)

In [14]:
def build_big_board(raw_bb_dict):
    """Turn the raw big-board rows into a DataFrame.

    Column names come from the space-separated tokens of
    ``raw_bb_dict['columns']`` with ``'.'`` and ``','`` removed and empty
    tokens dropped. Only data rows whose first space-separated token has at
    most three characters are used. Each such row is split on ``', '`` into a
    front part (first token, then the player name) and a back part (the
    school/team name followed by five single-token fields). Player and school
    names are passed through ``adjust_name``.

    Parameters
    ----------
    raw_bb_dict : dict
        Must provide ``'columns'`` (str) and ``'data'`` (list of str).

    Returns
    -------
    pandas.DataFrame
        One row per kept data row, with all values as strings.
    """
    cleaned_tokens = (token.replace('.', '').replace(',', '')
                      for token in raw_bb_dict['columns'].split(' '))
    columns = [token for token in cleaned_tokens if token != '']

    records = []
    for line in raw_bb_dict['data']:
        # Skip rows that do not start with a short (rank-sized) token.
        if len(line.split(' ')[0]) > 3:
            continue

        front, back = line.split(', ')
        front_tokens = front.split(' ')
        back_tokens = back.strip().split(' ')

        rank = front_tokens[0]
        player = adjust_name(' '.join(front_tokens[1:]))
        # Everything before the last five tokens is the school/team name.
        school = adjust_name(' '.join(back_tokens[:-5]))

        records.append([rank, player, school] + back_tokens[-5:])

    return pd.DataFrame(records, columns=columns)

In [15]:
big_board = build_big_board(raw_bb_dict)

In [16]:
# Manual corrections for player names whose capitalisation or punctuation
# adjust_name() cannot reproduce (initials, suffixes, inner capitals).
# NOTE(review): some keys use a straight apostrophe while the replacement uses
# a curly one; confirm the keys match what the PDF extraction yields.
names_dict = {
    'Jeremiah Robinson-earl': 'Jeremiah Robinson-Earl',
    "Nah'shon “bones” Hyland": "Nah’Shon “Bones” Hyland",
    "Day'ron Sharpe": "Day’Ron Sharpe",
    'B.j. Boston': 'B.J. Boston',
    'Jt Thor': 'JT Thor',
    'Ej Onu': 'EJ Onu',
    'Dj Steward': 'DJ Steward',
    'Aj Lawson': 'A.J. Lawson',
    'D.j. Stewart Jr.': 'D.J. Stewart Jr.',
    'Rj Nembhard': 'RJ Nembhard',
    'M.j. Walker': 'M.J. Walker',
    'Jaquori Mclaughlin': 'JaQuori McLaughlin',
    'Macio Teague': 'MaCio Teague',
    'Dejon Jarreau': 'DeJon Jarreau',
    'Raiquan Gray': 'RaiQuan Gray',
    'Mckinley Wright Iv': 'McKinley Wright IV',
    'Miles Mcbride': 'Miles McBride',
    'Trey Murphy Iii': 'Trey Murphy III',
}
# Names without an entry are left unchanged.
big_board['PLAYER'] = big_board['PLAYER'].map(lambda name: names_dict.get(name, name))

In [17]:
# Manual corrections for school names that are acronyms or carry a state tag.
schools_dict = {
    'Tcu': 'TCU',
    'Lsu': 'LSU',
    'Vcu': 'VCU',
    'Usc': 'USC',
    'Uc Santa Barbara': 'UC Santa Barbara',
    'Ucla': 'UCLA',
    'Loyola (md)': 'Loyola (MD)',
}
# Schools without an entry are left unchanged.
big_board['SCHOOL/TEAM'] = big_board['SCHOOL/TEAM'].map(
    lambda school: schools_dict.get(school, school))

In [18]:
big_board

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER
0,1,Cade Cunningham,Oklahoma State,G/W,19,6-8,7-1,1
1,2,Jalen Suggs,Gonzaga,G,20,6-4,6-6,2
2,3,Evan Mobley,USC,C,20,7-0,7-4,2
3,4,Jalen Green,G League Ignite,G,19,6-5,6-8,2
4,5,Jonathan Kuminga,G League Ignite,W,18,6-8,7-0,3
5,6,Scottie Barnes,Florida State,F,19,6-9,7-3,3
6,7,Moses Moody,Arkansas,W,19,6-6,7-1,4
7,8,Alperen Sengun,Besiktas,C,19,6-10,7-0,4
8,9,James Bouknight,Connecticut,W,20,6-5,6-8,4
9,10,Davion Mitchell,Baylor,G,22,6-1,6-4,4


## Get Scouting Report

### Build Raw Dict

In [19]:
# Drop rows that are exactly one space, then replace tabs with spaces in the
# remaining rows; the row count is printed before and after.
print(len(raw))
non_blank_rows = (row for row in raw if row != ' ')
raw = [row.replace('\t', ' ') for row in non_blank_rows]
print(len(raw))

7306
6088


In [20]:
def identify_scouting_report(raw, big_board):
    """Split ``raw`` into one block of rows per big-board player.

    Players are taken in big-board order. A player's block starts at the first
    row, located after the previous player's start row, whose stripped text
    ends with the player's name. It runs up to the next identified player's
    start row, or to the end of ``raw`` for the last one. A player with no
    matching row is skipped, so that player's text stays in the preceding
    block; skipped names are printed so this does not go unnoticed.

    Parameters
    ----------
    raw : list of str
        Text rows that follow the big board.
    big_board : pandas.DataFrame
        Must contain a ``'PLAYER'`` column of names.

    Returns
    -------
    dict
        Maps each identified player name to the list of rows in the block,
        with leading spaces removed from every row.
    """
    # Strip once up front instead of re-stripping every row for every player.
    stripped_rows = [row.strip() for row in raw]

    idx_breaks = []
    players_identified = []
    players_missing = []

    # Each search resumes just after the previous hit. This matches the
    # original "index greater than the last break" rule without rescanning
    # the rows before it.
    search_from = 0
    for player in big_board['PLAYER'].values:
        for i in range(search_from, len(stripped_rows)):
            if stripped_rows[i].endswith(player):
                idx_breaks.append(i)
                players_identified.append(player)
                search_from = i + 1
                break
        else:
            players_missing.append(player)

    idx_breaks.append(len(raw))

    sr_dict = {}
    for k, player in enumerate(players_identified):
        start, end = idx_breaks[k], idx_breaks[k + 1]
        sr_dict[player] = [row.lstrip(' ') for row in raw[start:end]]

    # report
    print(f'# of players identified: {len(players_identified)}')
    if players_missing:
        print(f'players not found: {players_missing}')

    return sr_dict

In [21]:
sr_dict = identify_scouting_report(raw, big_board)

# of players identified: 100


In [22]:
def check_categories(sc):
    """Count the section headings missing from a scouting report.

    A heading counts as present when some row of ``sc``, stripped of
    surrounding whitespace, equals it exactly. Each missing heading is
    printed on its own line.

    Parameters
    ----------
    sc : list of str
        Rows of one player's scouting report.

    Returns
    -------
    int
        How many of 'Background', 'Strengths', 'Weaknesses' and 'Summary'
        were not found.
    """
    headings_present = {line.strip() for line in sc}
    expected = ('Background', 'Strengths', 'Weaknesses', 'Summary')
    missing = [heading for heading in expected if heading not in headings_present]
    for heading in missing:
        print(heading)
    return len(missing)

In [23]:
# Manual patch for the 2021 guide: overwrite the third row of Scottie Barnes'
# report with the 'Background ' heading so that check_categories() finds it.
# NOTE(review): presumably the heading was garbled by the PDF extraction — confirm.
sr_dict['Scottie Barnes'][2] = 'Background '

In [24]:
# Collect every player whose report lacks at least one section heading.
# check_categories() prints the missing headings; the player's name and a
# blank separator line are printed after them.
players_removed = []

for player, report in sr_dict.items():
    if check_categories(report) > 0:
        print(player)
        players_removed.append(player)
        print()

Strengths
Weaknesses
EJ Onu

Strengths
Weaknesses
Eugene Omoruyi

Strengths
Weaknesses
A.J. Lawson

Strengths
Weaknesses
Marcus Zegarowski

Strengths
Weaknesses
Jose Alvarado

Strengths
Weaknesses
Derrick Alston

Strengths
Weaknesses
Jordan Schakel

Strengths
Weaknesses
D.J. Stewart Jr.

Strengths
Weaknesses
RJ Nembhard

Strengths
Weaknesses
Mitch Ballock

Strengths
Weaknesses
Matt Mitchell

Strengths
Weaknesses
M.J. Walker

Strengths
Weaknesses
Chaundee Brown

Strengths
Weaknesses
Duane Washington Jr.

Strengths
Weaknesses
Javonte Smart

Strengths
Weaknesses
MaCio Teague

Strengths
Weaknesses
DeJon Jarreau

Strengths
Weaknesses
Arnas Velicka

Strengths
Weaknesses
JaQuori McLaughlin

Strengths
Weaknesses
Dru Smith

Strengths
Weaknesses
Ethan Thompson

Strengths
Weaknesses
Amar Sylla



In [25]:
# Drop the players with incomplete reports from the big board and renumber the
# rows. reset_index(drop=True) discards the old index directly instead of
# materialising it as a column and dropping that column again.
big_board = (
    big_board[~big_board['PLAYER'].isin(players_removed)]
    .reset_index(drop=True)
)
big_board

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER
0,1,Cade Cunningham,Oklahoma State,G/W,19,6-8,7-1,1
1,2,Jalen Suggs,Gonzaga,G,20,6-4,6-6,2
2,3,Evan Mobley,USC,C,20,7-0,7-4,2
3,4,Jalen Green,G League Ignite,G,19,6-5,6-8,2
4,5,Jonathan Kuminga,G League Ignite,W,18,6-8,7-0,3
5,6,Scottie Barnes,Florida State,F,19,6-9,7-3,3
6,7,Moses Moody,Arkansas,W,19,6-6,7-1,4
7,8,Alperen Sengun,Besiktas,C,19,6-10,7-0,4
8,9,James Bouknight,Connecticut,W,20,6-5,6-8,4
9,10,Davion Mitchell,Baylor,G,22,6-1,6-4,4


In [26]:
# Remove the incomplete reports as well. pop(..., None) keeps the cell safe to
# re-run: a player that was already removed is ignored instead of raising KeyError.
print(len(sr_dict))
for player in players_removed:
    sr_dict.pop(player, None)
print(len(sr_dict))

100
78


### Clean Scouting Report

In [27]:
def clean_scouting_report(sr, page_header='2 0 2 1  N B A  D R A F T  G U I D E'):
    """Extract the Strengths / Weaknesses / Summary text of one scouting report.

    A section starts at the first row that equals its heading once stripped
    and ends at the next section's heading row (the end of ``sr`` for
    'Summary'). The heading row itself is skipped explicitly, so the first
    content row is kept even when the heading has no trailing space.

    Within a section only rows ending in ``' '``, ``'.'`` or ``'-'`` are kept.
    Leading and trailing hyphens are removed from them. Rows matching
    ``'^20..-..'``, rows equal to ``'NCAA'`` apart from surrounding spaces,
    rows starting with ``page_header`` and digit-only rows are then dropped.
    The remaining rows are concatenated without a separator.

    Parameters
    ----------
    sr : list of str
        Rows of one player's scouting report.
    page_header : str, optional
        Prefix of the page-header rows to drop (defaults to the 2021 header).

    Returns
    -------
    dict
        Maps 'Strengths', 'Weaknesses' and 'Summary' to the joined text.

    Raises
    ------
    IndexError
        If any of the three headings is missing. The exception type matches
        the failure the previous implementation produced in that case.
    """
    cats = ['Strengths', 'Weaknesses', 'Summary']
    stripped = [row.strip() for row in sr]

    missing = [cat for cat in cats if cat not in stripped]
    if missing:
        raise IndexError(f'missing section heading(s): {missing}')

    # First occurrence of each heading, plus the end of the report as sentinel.
    idx_breaks = [stripped.index(cat) for cat in cats] + [len(sr)]

    output_dict = {}
    for cat, start, end in zip(cats, idx_breaks, idx_breaks[1:]):
        kept = []
        # start + 1 skips the heading row itself.
        for row in sr[start + 1:end]:
            if not row.endswith((' ', '.', '-')):
                continue
            row = row.strip('-')
            # Any row starting with a '20xx-yy' token is dropped. This also
            # covers the old school-specific pattern, which interpolated the
            # school name into a regex without escaping it.
            if re.search('^20..-..', row) is not None:
                continue
            if row.strip(' ') == 'NCAA':
                continue
            # Page headers and bare page numbers.
            if row.startswith(page_header) or row.strip().isdigit():
                continue
            kept.append(row)
        output_dict[cat] = ''.join(kept)

    return output_dict

In [28]:
clean_scouting_report(sr_dict['Sandro Mamukelashvili'])

{'Strengths': 'Versatile offensive player. Reputation coming into his senior year was as a shooter after the gaudy 43.4 percent field goal number as a sophomore, but he’s much more interesting as a passer and playmaker. Real grab-and-go threat off the defensive glass. Loves to push the pace in transition and has the fluidity to do so. Really covers ground quickly and makes quick reads. Processes the game at a high level. Knows how to drive and kick or find players filling transition lanes. Just plays with a laughable amount of confidence in his skills. Throws some absolutely ridiculous one-handed live-dribble passes both in the full court and half court. Will post up and throw cross-corner kickouts. Great at finding cutters from the top of the key. No look passes to cutters at the rim, creative dump-offs as a driver, bounce passes, frozen ropes to the corners, touch passes to the dunker spot. Seton Hall basically just gave him the ball in a variety of situations and said to make plays.

In [29]:
# Replace each raw report (a list of rows) with its cleaned section dict.
# Values that are no longer lists were already cleaned by an earlier run of
# this cell and are left alone, so re-running the cell is harmless.
for key, report in sr_dict.items():
    if isinstance(report, list):
        sr_dict[key] = clean_scouting_report(report)

## Assemble Data

### Add Scouting Report

In [30]:
# Attach each scouting-report section as an upper-case column. Writing straight
# to the final column name keeps the cell safe to re-run: a second run simply
# overwrites the columns instead of adding and renaming duplicates.
cats = ['Strengths', 'Weaknesses', 'Summary']
for cat in cats:
    big_board[cat.upper()] = big_board['PLAYER'].apply(lambda x: sr_dict[x][cat])

### Add Tier Description

In [31]:
# Look up the tier legend entry ('Tier <n>') for every player's tier value.
big_board['TIER_DESCRIP'] = big_board['TIER'].map(lambda tier: tier_dict[f'Tier {tier}'])

In [32]:
big_board.head()

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP
0,1,Cade Cunningham,Oklahoma State,G/W,19,6-8,7-1,1,Cunningham has elite-level size as a lead crea...,"Cunningham is a good athlete, but not a great ...",Cunningham is one of the more complete prospec...,Superstar Upside
1,2,Jalen Suggs,Gonzaga,G,20,6-4,6-6,2,Ideal size for a lead guard creator. Powerful ...,Sometimes gets a bit wild. Turned it over thre...,Few things are more valuable in the NBA than a...,All-Star Upside
2,3,Evan Mobley,USC,C,20,7-0,7-4,2,Terrific physical tools for the modern center ...,"Mobley is still very skinny, and his frame isn...",Mobley has all the tools to be an All-NBA big ...,All-Star Upside
3,4,Jalen Green,G League Ignite,G,19,6-5,6-8,2,The most explosive athlete of this class. All ...,Two big questions. First comes on defense. Gre...,I was really impressed with what I saw from Gr...,All-Star Upside
4,5,Jonathan Kuminga,G League Ignite,W,18,6-8,7-0,3,Ideal measurements for a big wing initiator ty...,Kuminga is not a good defender right now. It’s...,I expected Kuminga to be the best of the group...,High Leverage Starters


## Save output

In [33]:
# Create the output folder if it is missing so the export does not fail after
# all the processing above has already run.
os.makedirs('../processed_csv', exist_ok=True)
big_board.to_csv('../processed_csv/2021.csv', index=False)