## Getting and Inspecting Data

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('../data/drive.csv')
df.head()

Unnamed: 0,Game Code,Drive Number,Team Code,Start Period,Start Clock,Start Spot,Start Reason,End Period,End Clock,End Spot,End Reason,Plays,Yards,Time Of Possession,Red Zone Attempt
0,299004720130829,1,299,1,900.0,75,KICKOFF,1,597.0,0,TOUCHDOWN,10,75,303.0,1
1,299004720130829,2,47,1,592.0,79,KICKOFF,1,370.0,0,TOUCHDOWN,10,79,222.0,1
2,299004720130829,3,299,1,370.0,75,KICKOFF,1,230.0,0,TOUCHDOWN,6,75,140.0,0
3,299004720130829,4,47,1,215.0,10,KICKOFF,1,134.0,5,FIELD GOAL,4,5,81.0,1
4,299004720130829,5,299,1,129.0,84,KICKOFF,1,74.0,82,PUNT,3,2,55.0,0


In [4]:
print(df.shape, df.columns, df.dtypes)
df['Game Code'][10:25]
df.describe()

(22212, 15) Index(['Game Code', 'Drive Number', 'Team Code', 'Start Period', 'Start Clock',
       'Start Spot', 'Start Reason', 'End Period', 'End Clock', 'End Spot',
       'End Reason', 'Plays', 'Yards', 'Time Of Possession',
       'Red Zone Attempt'],
      dtype='object') Game Code               int64
Drive Number            int64
Team Code               int64
Start Period            int64
Start Clock           float64
Start Spot              int64
Start Reason           object
End Period              int64
End Clock             float64
End Spot                int64
End Reason             object
Plays                   int64
Yards                   int64
Time Of Possession    float64
Red Zone Attempt        int64
dtype: object


Unnamed: 0,Game Code,Drive Number,Team Code,Start Period,Start Clock,Start Spot,End Period,End Clock,End Spot,Plays,Yards,Time Of Possession,Red Zone Attempt
count,22212.0,22212.0,22212.0,22212.0,22098.0,22212.0,22212.0,22098.0,22212.0,22212.0,22212.0,22098.0,22212.0
mean,443539300000000.0,13.900774,437.350486,2.465199,466.090415,70.038808,2.538133,394.889402,39.229831,5.647263,30.805871,137.158114,0.286872
std,273442900000000.0,8.079195,256.238626,1.119932,279.326128,17.140499,1.119984,274.374073,31.151988,3.336441,29.028121,91.296703,0.452311
min,2047220000000.0,1.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,-37.0,0.0,0.0
25%,231066400000000.0,7.0,234.0,1.0,220.0,64.0,2.0,142.0,0.0,3.0,5.0,72.0,0.0
50%,465046600000000.0,14.0,457.0,2.0,465.0,75.0,2.0,387.0,43.0,5.0,22.0,117.0,0.0
75%,671000500000000.0,20.0,670.0,3.0,709.0,80.0,4.0,632.0,67.0,8.0,57.0,186.0,1.0
max,2915005000000000.0,41.0,2915.0,5.0,900.0,100.0,5.0,900.0,100.0,24.0,99.0,730.0,1.0


## Cleaning data

In [5]:
df.isna().sum()

Game Code               0
Drive Number            0
Team Code               0
Start Period            0
Start Clock           114
Start Spot              0
Start Reason            0
End Period              0
End Clock             114
End Spot                0
End Reason              0
Plays                   0
Yards                   0
Time Of Possession    114
Red Zone Attempt        0
dtype: int64

In [6]:
df = df.dropna()
print(df.shape)

(22098, 15)


## Challenge: Counting Values

In [None]:
N = df['End Reason'].count()
df['End Reason'].value_counts()*100./N

## Challenge: Describing Only *Some* Data

In [None]:
td = df[df['End Reason']=='TOUCHDOWN']
td['Time Of Possession'].describe()

## Challenge: The Best Team?

In [None]:
df['Yards Per Play'] = df['Yards']/df['Plays']
teams_grouped = df.groupby(['Team Code'])
teams_grouped_mean = teams_grouped['Yards Per Play'].mean()
teams_grouped_mean.sort_values(ascending=False).head()