In [None]:
import pandas as pd
import seaborn as sns
import scipy.stats as st
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [None]:
#Extract Combine data from pro-football reference for years 2009 to 2020.
#One request is made per year (year_min == year_max in the query string).
#Years are kept as strings: they are only interpolated into the URL and messages.
years = [str(y) for y in range(2009, 2021)]

#Per-year dataframes, kept for potential reference later
dfs = []
for yr in years:
    #Adjust URL based on years fed in
    url = f"https://www.pro-football-reference.com/play-index/nfl-combine-results.cgi?request=1&year_min={yr}&year_max={yr}&height_min=60&height_max=82&weight_min=120&weight_max=400&pos%5B%5D=WR&show=all&order_by=year_id"
    try:
        #read_html returns a list of the tables found on the page; the first one is used
        df = pd.read_html(url)
        #drop AV column
        new_df = df[0].drop(columns="AV")
    except Exception as err:
        #Notification of years that couldn't be read in.
        #Catching Exception (instead of a bare except) keeps the best-effort
        #behaviour but lets Ctrl-C / kernel interrupt stop the loop, and the
        #reason is reported so a failed year can be diagnosed.
        print(f"failed: {yr} ({type(err).__name__}: {err})")
        continue

    dfs.append(new_df)

    #Output success message
    print(f"success: {yr}")

#Fail loudly if nothing was downloaded, rather than leaving comb_df undefined
#(or stale from an earlier run) for the cells below.
if not dfs:
    raise RuntimeError("No combine tables could be read for any requested year")

#Build the combined dataframe once; concatenating inside the loop re-copies
#all previously collected rows on every iteration. The per-year row index is
#kept as-is (no ignore_index), matching the incremental concat it replaces.
comb_df = pd.concat(dfs)

In [None]:
#Split out draft info: the single "Drafted (tm/rnd/yr)" column is split on "/"
#into (at most) four pieces
comb_df[["Team","Round","pickNum","year"]] = comb_df["Drafted (tm/rnd/yr)"].str.split("/",n=3,expand=True)

#Extract just the number for pickNum and Round.
#Raw strings are used because '\d' in a normal string literal is an invalid
#escape sequence; expand=False returns a Series, which is what a single
#column assignment expects.
comb_df["pickNum"] = comb_df["pickNum"].str.extract(r'(\d+)', expand=False)
comb_df["Round"] = comb_df["Round"].str.extract(r'(\d+)', expand=False)

#Get rid of combine results of those not drafted.
#The filled column is assigned back instead of calling fillna(inplace=True) on
#the selected column: the in-place call on a column selection is chained
#assignment and does not update comb_df under pandas Copy-on-Write, which
#would let undrafted rows slip through the filter below.
comb_df["pickNum"] = comb_df["pickNum"].fillna("Not drafted")
drafteddf = comb_df[comb_df["pickNum"] != "Not drafted"]

#Remove rows whose Year cell is the literal text "Year"
#(presumably header rows repeated inside the scraped table body)
drafteddf = drafteddf[drafteddf["Year"] != "Year"]


#Remove columns that were split out and College stats link column
combine = drafteddf.drop(columns=['Drafted (tm/rnd/yr)','College'])


#Convert Height from "feet-inches" text to a single integer number of inches.
#The split pieces are held in a temporary frame (column 0 = feet, column 1 =
#inches) rather than being written into combine and dropped again afterwards.
height_parts = combine['Height'].str.split("-", n=1, expand=True).astype(int)
combine['height(in)'] = height_parts[0] * 12 + height_parts[1]

#Only the derived column is kept; the original text column is removed
combine = combine.drop(columns=['Height'])

#Separate out players first and last name for joining later
#(everything after the first space is treated as the last name)
combine[["FirstNm","Last"]] = combine["Player"].str.split(" ",n=1,expand=True)

#Upper-case both parts and get rid of , and . which may be inconsistent based
#on sample. regex=False makes the replacement literal on every pandas version;
#as a regular expression '.' would match any character.
for name_col in ("FirstNm", "Last"):
    combine[name_col] = (
        combine[name_col]
        .str.upper()
        .str.replace('.', '', regex=False)
        .str.replace(',', '', regex=False)
    )

#Add preceding , to common suffixes for parsing out.
#The suffix must be a separate trailing word: plain substring replacement
#would also hit letters inside a name (e.g. the "IV" in RIVERS) and would
#insert two commas for "III" because it contains "II".
combine['Last'] = combine['Last'].str.replace(r'\s+(JR|III|II|IV)$', r',\1', regex=True)

#parse out the suffix from the last name.
#The source is combine's own 'Last' column. partition always yields three
#columns (before, separator, after), so this works even when no row has a
#suffix; rows without a separator get a missing Suffix rather than ''.
name_parts = combine['Last'].str.partition(",")
combine['LastNm'] = name_parts[0]
combine['Suffix'] = name_parts[2].where(name_parts[1] != "")

#Write to CSV file
# combine.to_csv('combine.csv')