In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
# Empty accumulator tables that get_data() appends to, one row per stat line:
# `stats` takes single-value rows, `shots` takes made/attempted rows.

STATS_COLUMNS = ['stat', 'unc', 'opponent']
SHOTS_COLUMNS = ['stat', 'unc_made', 'unc_att', 'opp_made', 'opp_att']

stats = pd.DataFrame(columns=STATS_COLUMNS)
shots = pd.DataFrame(columns=SHOTS_COLUMNS)

In [3]:
# creating a function that takes in a url for the team stats of some game
# appends all the info, category name and team stats, to the dataframes

def get_data(url, timeout=30):
    """Scrape the team-stats table of one game page into `stats` and `shots`.

    Downloads `url`, locates the first ``<table class="mod-data">`` and reads
    every ``<tr class="highlight">`` row followed by every
    ``<tr class="indent">`` row.  In each row the last two
    whitespace-separated tokens are the two team values and everything before
    them is the category name.  The LAST value is written to the UNC columns
    and the one before it to the opponent columns.
    NOTE(review): that presumes UNC is always the right-hand team column on
    the page -- confirm for every game that is loaded.

    A value of the form ``made-attempted`` is split on ``-`` and the row is
    appended to the module-level `shots` frame; every other row is appended to
    the module-level `stats` frame.  All values are stored as strings.

    Rows are only appended after the whole page has been parsed, so a page
    that fails to parse leaves both frames unchanged.  Calling this twice with
    the same url appends the same rows twice.

    Args:
        url: Address of the matchup page to scrape.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``mod-data`` table, or a row has fewer
            than three tokens (a category plus two values).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', {'class': 'mod-data'})
    if table is None:
        raise ValueError("no 'mod-data' table found at %s" % url)

    # Same order as before: all 'highlight' rows first, then the 'indent' rows.
    rows = list(table.find_all('tr', {'class': 'highlight'}))
    rows.extend(table.find_all('tr', {'class': 'indent'}))

    shot_rows = []
    stat_rows = []
    for row in rows:
        # Tabs are removed (not turned into spaces) exactly as before, so the
        # tokenisation of existing pages does not change.
        tokens = row.text.replace("\n", " ").replace("\t", "").split()
        if len(tokens) < 3:
            raise ValueError("cannot parse stat row %r from %s" % (row.text, url))
        category = " ".join(tokens[:-2])
        unc = tokens[-1].split('-')
        opp = tokens[-2].split('-')
        if len(unc) == 2:
            shot_rows.append([category, unc[0], unc[1], opp[0], opp[1]])
        else:
            stat_rows.append([category, unc[0], opp[0]])

    # Append only once every row parsed, so a bad page never leaves half a
    # game in the tables.
    for record in shot_rows:
        shots.loc[len(shots)] = record
    for record in stat_rows:
        stats.loc[len(stats)] = record

In [4]:
# team-stats page labelled "gonzaga"
get_data(url="http://www.espn.com/mens-college-basketball/matchup?gameId=400949246")

In [5]:
# team-stats page labelled "oregon"
get_data(url="http://www.espn.com/mens-college-basketball/matchup?gameId=400948853")

In [6]:
# team-stats page labelled "kentucky"
get_data(url="http://www.espn.com/mens-college-basketball/matchup?gameId=400948726")

In [7]:
# team-stats page labelled "butler"
get_data(url="http://www.espn.com/mens-college-basketball/matchup?gameId=400947324")

In [8]:
# team-stats page labelled "arkansas"
get_data(url="http://www.espn.com/mens-college-basketball/matchup?gameId=400947185")

In [9]:
# team-stats page labelled "texas southern"
get_data(url="http://www.espn.com/mens-college-basketball/matchup?gameId=400946425")

In [12]:
# function that will convert the numbers from strings to floats

def to_floats(df):
    """Cast every column of `df` except the first one to float.

    The frame is modified in place and nothing is returned; the first column
    is left exactly as it was.
    """
    for position, name in enumerate(df.columns):
        if position == 0:
            continue  # the leading column is not converted
        df[name] = df[name].astype(float)

In [13]:
# convert the value columns of both scraped tables to floats, in place
for frame in (stats, shots):
    to_floats(frame)

In [14]:
# drop every row whose category name contains '%', then total the remaining
# stats per category
stats_sum = (
    stats
    .loc[lambda frame: ~frame['stat'].str.contains('%')]
    .groupby('stat')
    .sum()
    .reset_index()
)
stats_sum

Unnamed: 0,stat,unc,opponent
0,Assists,103.0,65.0
1,Blocks,29.0,22.0
2,Defensive Rebounds,177.0,154.0
3,Flagrant Fouls,0.0,0.0
4,Offensive Rebounds,93.0,57.0
5,Personal Fouls,106.0,121.0
6,Steals,47.0,35.0
7,Team Rebounds,0.0,0.0
8,Technical Fouls,0.0,0.0
9,Total Rebounds,270.0,211.0


In [15]:
# total makes and attempts per shot category over every row in `shots`
shots_sum = (
    shots
    .groupby(by='stat')
    .sum()
    .reset_index()
)
shots_sum

Unnamed: 0,stat,unc_made,unc_att,opp_made,opp_att
0,3PT Made-Attempted,37.0,126.0,45.0,140.0
1,FG Made-Attempted,171.0,394.0,144.0,364.0
2,FT Made-Attempted,111.0,163.0,90.0,118.0


In [16]:
# shooting accuracy as a fraction (made / attempted), not multiplied by 100
shots_sum['unc_pct'] = shots_sum['unc_made'].div(shots_sum['unc_att'])
shots_sum['opp_pct'] = shots_sum['opp_made'].div(shots_sum['opp_att'])

In [17]:
# show the shooting totals together with the unc_pct / opp_pct columns
shots_sum

Unnamed: 0,stat,unc_made,unc_att,opp_made,opp_att,unc_pct,opp_pct
0,3PT Made-Attempted,37.0,126.0,45.0,140.0,0.293651,0.321429
1,FG Made-Attempted,171.0,394.0,144.0,364.0,0.43401,0.395604
2,FT Made-Attempted,111.0,163.0,90.0,118.0,0.680982,0.762712


In [18]:
# write the summed stats to disk so the visuals can be created in tableau
# NOTE(review): with pandas' default index=True the row index is written as an
# extra unnamed first column -- confirm that is wanted.
# NOTE(review): the per-game columns are added to stats_sum in a later cell,
# so they are presumably not part of this file -- confirm.
stats_sum.to_csv(path_or_buf='unc_stats.csv')

In [20]:
# calculate per game numbers: each total divided by the number of games
# (6 matches the number of get_data() calls in this notebook; update it if
# games are added or removed)

games_played = 6
stats_sum['unc_pergame'] = stats_sum['unc'].div(games_played)
stats_sum['opp_pergame'] = stats_sum['opponent'].div(games_played)
stats_sum

Unnamed: 0,stat,unc,opponent,unc_pergame,opp_pergame
0,Assists,103.0,65.0,17.166667,10.833333
1,Blocks,29.0,22.0,4.833333,3.666667
2,Defensive Rebounds,177.0,154.0,29.5,25.666667
3,Flagrant Fouls,0.0,0.0,0.0,0.0
4,Offensive Rebounds,93.0,57.0,15.5,9.5
5,Personal Fouls,106.0,121.0,17.666667,20.166667
6,Steals,47.0,35.0,7.833333,5.833333
7,Team Rebounds,0.0,0.0,0.0,0.0
8,Technical Fouls,0.0,0.0,0.0,0.0
9,Total Rebounds,270.0,211.0,45.0,35.166667
