In [1]:
# Data tools
import pandas as pd

In [2]:
# Plotting tools
import plotly.graph_objs as go
import plotly.offline as pyoff
import cufflinks as cf

In [3]:
# Set up plotly's offline notebook rendering so that pyoff.iplot can draw figures inline.
# NOTE(review): connected=True is documented to load plotly.js from a CDN instead of
# embedding it in the notebook, so rendering needs internet access - confirm for the
# installed plotly version.
pyoff.init_notebook_mode(connected=True)

# Read Data

In [4]:
# Load the free-throw records; the path is relative, so it resolves against the
# directory the notebook kernel is running in.
df = pd.read_csv("../data/free_throws.csv")

In [5]:
# List every column's dtype to see which fields were parsed as numeric and which
# were left as generic objects.
df.dtypes

end_result     object
game           object
game_id       float64
period        float64
play           object
player         object
playoffs       object
score          object
season         object
shot_made       int64
time           object
dtype: object

In [6]:
# Preview only the first rows: the frame holds several hundred thousand records,
# so rendering it whole bloats the notebook without adding information.
df.head(10)

Unnamed: 0,end_result,game,game_id,period,play,player,playoffs,score,season,shot_made,time
0,106 - 114,PHX - LAL,261031013.0,1.0,Andrew Bynum makes free throw 1 of 2,Andrew Bynum,regular,0 - 1,2006 - 2007,1,11:45
1,106 - 114,PHX - LAL,261031013.0,1.0,Andrew Bynum makes free throw 2 of 2,Andrew Bynum,regular,0 - 2,2006 - 2007,1,11:45
2,106 - 114,PHX - LAL,261031013.0,1.0,Andrew Bynum makes free throw 1 of 2,Andrew Bynum,regular,18 - 12,2006 - 2007,1,7:26
3,106 - 114,PHX - LAL,261031013.0,1.0,Andrew Bynum misses free throw 2 of 2,Andrew Bynum,regular,18 - 12,2006 - 2007,0,7:26
4,106 - 114,PHX - LAL,261031013.0,1.0,Shawn Marion makes free throw 1 of 1,Shawn Marion,regular,21 - 12,2006 - 2007,1,7:18
5,106 - 114,PHX - LAL,261031013.0,1.0,Amare Stoudemire makes free throw 1 of 2,Amare Stoudemire,regular,33 - 20,2006 - 2007,1,3:15
6,106 - 114,PHX - LAL,261031013.0,1.0,Amare Stoudemire makes free throw 2 of 2,Amare Stoudemire,regular,34 - 20,2006 - 2007,1,3:15
7,106 - 114,PHX - LAL,261031013.0,2.0,Leandro Barbosa misses free throw 1 of 2,Leandro Barbosa,regular,43 - 29,2006 - 2007,0,10:52
8,106 - 114,PHX - LAL,261031013.0,2.0,Leandro Barbosa makes free throw 2 of 2,Leandro Barbosa,regular,44 - 29,2006 - 2007,1,10:52
9,106 - 114,PHX - LAL,261031013.0,2.0,Lamar Odom makes free throw 1 of 2,Lamar Odom,regular,44 - 30,2006 - 2007,1,10:37


## Describe potentially interesting columns

In [7]:
# Summarise the player column: row count, number of distinct players,
# the most frequent player and how often that player appears.
df["player"].describe()

count           618019
unique            1098
top       LeBron James
freq              8001
Name: player, dtype: object

In [8]:
# Summarise the time column: row count, number of distinct values,
# the most frequent value and its frequency.
df["time"].describe()

count     618019
unique       534
top         0:00
freq        2666
Name: time, dtype: object

In [9]:
# Summarise the season column: row count, number of distinct seasons,
# the most frequent season and its frequency.
df["season"].describe()

count          618019
unique             10
top       2006 - 2007
freq            67612
Name: season, dtype: object

# Analyse the number of shots made per season

In [65]:
def bar_scatter(inDf, name):
    """Build a dual-axis plotly figure from an aggregated shots table.

    The "sum" column is drawn as bars on the primary (left) y-axis and the
    "mean" column as a scatter trace on a secondary (right) y-axis whose
    range is fixed to [0, 1].

    Parameters
    ----------
    inDf : DataFrame-like
        Table providing "sum" and "mean" columns; its index supplies the
        x values of both traces.
    name : str
        Title placed on the figure.

    Returns
    -------
    plotly.graph_objs.Figure
        The assembled figure. Nothing is rendered here; the caller is
        responsible for displaying it.
    """
    x_values = inDf.index

    # The bar trace uses the default axis, the scatter trace is bound to "y2"
    sum_bars = go.Bar(x=x_values, y=inDf["sum"].values, name="sum")
    mean_line = go.Scatter(x=x_values, y=inDf["mean"].values, name="mean", yaxis="y2")

    layout = {
        "title": name,
        "xaxis": {"title": "Season"},
        "yaxis": {"title": 'Sum of shots', "rangemode": "tozero"},
        # Secondary axis is laid over the primary one and shown on the right
        "yaxis2": {
            "title": 'Mean of shots made',
            "overlaying": 'y',
            "side": 'right',
            "range": [0, 1],
        },
    }

    return go.Figure(data=[sum_bars, mean_line], layout=layout)

In [10]:
# For every season: share of free throws made (mean of shot_made)
# and number of free throws made (sum of shot_made).
season_shots = (
    df.groupby("season")["shot_made"]
    .agg(["mean", "sum"])
)
season_shots

Unnamed: 0_level_0,mean,sum
season,Unnamed: 1_level_1,Unnamed: 2_level_1
2006 - 2007,0.751952,50841
2007 - 2008,0.755634,49494
2008 - 2009,0.769949,50320
2009 - 2010,0.758487,49153
2010 - 2011,0.764192,49013
2011 - 2012,0.752512,36466
2012 - 2013,0.751696,43881
2013 - 2014,0.756574,47127
2014 - 2015,0.748449,45005
2015 - 2016,0.756075,46389


In [67]:
# Draw the per-season sums (bars) together with the per-season means (line)
fig = bar_scatter(inDf=season_shots, name="Shots per season")
pyoff.iplot(fig)

# Analyse the shots of the 5 most frequent players per season

In [53]:
# Get the 5 players with the most rows (free-throw attempts) in the dataset
top5 = df.player.value_counts().head(5)
top5

LeBron James     8001
Dwight Howard    7728
Kevin Durant     6030
Dwyane Wade      5594
Kobe Bryant      5594
Name: player, dtype: int64

In [83]:
# Get the mean and sum of shots per player per season
top5_stats = df[df.player.isin(top5.index)].groupby(["player", "season"])["shot_made"].agg(["mean", "sum"]).reset_index(level="player")
top5_stats

Unnamed: 0_level_0,player,mean,sum
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2006 - 2007,Dwight Howard,0.576102,405
2007 - 2008,Dwight Howard,0.585961,576
2008 - 2009,Dwight Howard,0.601113,648
2009 - 2010,Dwight Howard,0.580348,567
2010 - 2011,Dwight Howard,0.60338,607
2011 - 2012,Dwight Howard,0.491289,282
2012 - 2013,Dwight Howard,0.490092,371
2013 - 2014,Dwight Howard,0.554131,389
2014 - 2015,Dwight Howard,0.483945,211
2015 - 2016,Dwight Howard,0.480469,246


In [84]:
# One season-indexed stats table per top-5 player, keyed by player name;
# the now-redundant "player" column is dropped from each table.
df_dict = {
    player_name: top5_stats.loc[top5_stats["player"] == player_name].drop(columns=["player"])
    for player_name in top5.index
}

In [85]:
# Render one bar/line figure per player, titled with the player's name
for player_name, player_stats in df_dict.items():
    fig = bar_scatter(player_stats, player_name)
    pyoff.iplot(fig)