In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import feather

%matplotlib inline

In [4]:

def get_header():
    #Set initial colname values
    header = ['YEAR','PLAYER','GP']
    url = "http://espn.go.com/mens-college-basketball/team/stats/_/id/153/north-carolina-tar-heels"
    page = requests.get(url).text
    soup = BeautifulSoup(page, "html.parser")
    
    #Pull colnames from from 2nd table omitting player name
    table = soup.find_all('table')[1]
    #Skip first two rows
    cols = table.find_all('tr')[2]
    tds = cols.find_all('td')
    #Skip first column containing player name
    header += [c.text for c in tds[1:]]
    
    return header

def player_data_scrape(year):
    header = get_header()
    data = []
    data.append(header)
    if year == 2016:
        url = "http://espn.go.com/mens-college-basketball/team/stats/_/id/153/north-carolina-tar-heels"
    else:
        url = "http://espn.go.com/mens-college-basketball/team/stats/_/id/153/year/{}/north-carolina-tar-heels".format(year)

    page = requests.get(url).text
    soup = BeautifulSoup(page, "html.parser")
    
    #Scrape player-name/games-played from 1st table
    table1 = soup.find_all('table')[0]
    table2 = soup.find_all('table')[1]
    #Skip header row and last row in table
    trs = table1.find_all('tr')[2:-1]
    
    for tr in trs:
        row = []
        player = tr.find_all('td')[0].text.upper()
        gp = tr.find_all('td')[1].text
        row.append(int(year))
        row.append(player)
        row.append(int(gp))
        data.append(row)
    
    tots = table2.find_all('tr')[3:-1]

    for i,tr in enumerate(tots):
        tot_row = [int(c.text) for c in tr.find_all('td')[1:]]
        data[i+1] += tot_row
      
    return data


In [5]:
#Scrape our data for years 2003-2016
unc = []
for year in range(2003,2016):
    print("Getting data for year: {}".format(str(year)))
    data = player_data_scrape(year)
    unc.append(data)
    
print("Done!")
len(unc)

Getting data for year: 2003
Getting data for year: 2004
Getting data for year: 2005
Getting data for year: 2006
Getting data for year: 2007
Getting data for year: 2008
Getting data for year: 2009
Getting data for year: 2010
Getting data for year: 2011
Getting data for year: 2012
Getting data for year: 2013
Getting data for year: 2014
Getting data for year: 2015
Done!


13

In [6]:
#Combine our data into a single Pandas dataframe
df = pd.concat(pd.DataFrame(year[1:]) for year in unc)
#Write our dataframe header
df.columns = unc[0][0]
df.set_index(['YEAR','PLAYER'])
df.head()


Unnamed: 0,YEAR,PLAYER,GP,MIN,FGM,FGA,FTM,FTA,3PM,3PA,PTS,OFFR,DEFR,REB,AST,TO,STL,BLK
0,2003,RASHAD MCCANTS,35,1046,215,438,92,132,72,174,594,58,104,162,51,81,49,19
1,2003,JAWAD WILLIAMS,34,1137,182,406,97,129,45,136,506,54,137,191,67,60,35,37
2,2003,RAYMOND FELTON,35,1240,151,379,79,114,69,193,450,29,114,143,235,130,56,9
3,2003,SEAN MAY,11,308,51,108,23,40,0,4,125,25,64,89,11,21,17,20
4,2003,JACKIE MANUEL,35,877,85,202,65,106,19,72,254,52,87,139,71,87,46,28


In [7]:
#Confirm dataframe dimensions
df.shape

#Serialize our dataframe to Feather object for future analysis
feather_path = "uncb_data_2003_2016.feather"
feather.write_dataframe(df, feather_path)
print("Feather file successfully written!")

#Confirm
df2 = feather.read_dataframe(feather_path)
df2.shape


Feather file successfully written!


(205, 18)