In [5]:
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
import numpy as np
import random as rnd
#from imblearn.pipeline import Pipeline
from collections import Counter

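# NOTE: the helper functions used further down (GameClockSeconds, PlayerHeightInches,
# labelRB, labelTE, labelWR, labelQB, labelOL) are not defined in this notebook.
# Import them from the project's functions .py file before running the cells that use them.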

In [6]:
# Raise pandas' display limits to 200 columns / 200 rows so frames are shown
# without truncation while exploring.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

In [7]:
# Load the raw data from 'bdb.csv' (path is relative to the notebook's working directory).
# low_memory=False makes pandas parse the file in a single pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
origin_df = pd.read_csv(
    'bdb.csv',
    low_memory=False,
)

In [8]:
# Work on an independent (deep) copy so origin_df stays untouched by the in-place
# edits made to working_df in the cells below.
working_df = origin_df.copy(deep=True)

In [9]:
# Reference frame of player-identity columns, taken before those columns are dropped
# from working_df. It keeps one row per row of working_df (no de-duplication is done here).

players = working_df[
    [
        'NflId',
        'DisplayName',
        'JerseyNumber',
        'PlayerBirthDate',
        'PlayerCollegeName',
        'Position',
    ]
]

In [10]:
# First round of column drops; the remaining unwanted columns are dropped at the end.
# The drop mutates working_df in place, so re-running this cell on the same frame raises
# KeyError because the columns are already gone.

working_df.drop(
    columns=[
        'DisplayName', 'JerseyNumber', 'PlayerBirthDate', 'PlayerCollegeName',
        'Stadium', 'Location', 'WindSpeed', 'WindDirection', 'GameId', 'Season',
    ],
    inplace=True,
)

In [11]:
# fix incorrect names
# Map the four alternate abbreviations found in PossessionTeam to the spellings on the
# right-hand side (presumably the ones used by HomeTeamAbbr / VisitorTeamAbbr, which the
# next cell compares PossessionTeam against -- confirm).
#
# The result is assigned back to the column instead of calling
# `working_df['PossessionTeam'].replace(..., inplace=True)`: that chained in-place form
# operates on a temporary Series and, under pandas Copy-on-Write, leaves working_df
# unchanged. The warning pandas emits about it is hidden by the notebook-wide
# warnings filter, so the failure would be silent.

team_abbr_fixes = {'ARZ': 'ARI', 'BLT': 'BAL', 'CLV': 'CLE', 'HST': 'HOU'}
working_df['PossessionTeam'] = working_df['PossessionTeam'].replace(to_replace=team_abbr_fixes)

In [12]:
# Reduce Team, PossessionTeam, HomeTeamAbbr and VisitorTeamAbbr to three consistent columns:
# offensiveTeam, defensiveTeam, and Team (all holding team abbreviations).

# The team in possession is the offense.
working_df['offensiveTeam'] = working_df['PossessionTeam']

# The defense is whichever of home/visitor does NOT have possession. Rows where
# PossessionTeam matches neither abbreviation are not assigned by these two statements.
home_has_ball = working_df['PossessionTeam'] == working_df['HomeTeamAbbr']
visitor_has_ball = working_df['PossessionTeam'] == working_df['VisitorTeamAbbr']
working_df.loc[home_has_ball, 'defensiveTeam'] = working_df['VisitorTeamAbbr']
working_df.loc[visitor_has_ball, 'defensiveTeam'] = working_df['HomeTeamAbbr']

# Replace the 'away' / 'home' flag in Team with the matching abbreviation
# (away first, then home; each mask is evaluated right before its assignment).
for side_flag, abbr_column in (('away', 'VisitorTeamAbbr'), ('home', 'HomeTeamAbbr')):
    working_df.loc[working_df['Team'] == side_flag, 'Team'] = working_df[abbr_column]

In [None]:
# convert gameclock (time left in quarter) to seconds
# NOTE(review): GameClockSeconds is not defined in the cells shown here -- presumably it
# comes from the project's helper .py file; confirm it is imported before this cell runs.
# The GameClock column is overwritten with the converted values, so re-running this cell
# would pass already-converted values back into GameClockSeconds.

working_df['GameClock'] = working_df['GameClock'].apply(GameClockSeconds)

In [None]:
# convert height to inches
# The converted values go into a new column, 'PlayerHeightInches'; the original
# 'PlayerHeight' column is left as it is.
# NOTE(review): the PlayerHeightInches helper (same name as the new column) is not defined
# in the cells shown here -- presumably it comes from the project's helper .py file; confirm
# it is imported before this cell runs.

working_df['PlayerHeightInches'] = working_df['PlayerHeight'].apply(PlayerHeightInches)

# FINAL CODE BLOCKS ONLY ABOVE!

In [14]:
# Collapse the free-text Turf values into three classes: "turf", "grass", "hybrid".
# The lists hold the raw spellings exactly as matched (including misspellings such as
# 'Artifical' and 'Naturall Grass'); values not listed are left unchanged.
turf_list = [
    'Field Turf', 'Artificial', 'FieldTurf', 'UBU Speed Series-S5-M',
    'A-Turf Titan', 'UBU Sports Speed S5-M', 'FieldTurf360', 'Twenty-Four/Seven Turf',
    'FieldTurf 360', 'Twenty Four/Seven Turf', 'Turf', 'Field turf',
    'UBU-Speed Series-S5-M', 'Artifical',
]
grass_list = [
    'Grass', 'Natural Grass', 'Natural', 'Naturall Grass', 'natural grass', 'grass',
    'Natural grass',
]
# NOTE(review): the empty string here labels blank Turf values as "hybrid" -- confirm intended.
hybrid_list = ['SISGrass', 'DD GrassMaster', '']

# Apply the relabelling class by class, in the order turf -> grass -> hybrid.
for surface_label, raw_spellings in (
    ("turf", turf_list),
    ("grass", grass_list),
    ("hybrid", hybrid_list),
):
    working_df.loc[working_df['Turf'].isin(raw_spellings), 'Turf'] = surface_label

In [13]:
# Final column drop: remove the raw team columns now that offensiveTeam / defensiveTeam
# have been derived from them. The drop mutates working_df in place, so re-running this
# cell on the same frame raises KeyError because the columns are already gone.

working_df.drop(
    columns=['Team', 'PossessionTeam', 'HomeTeamAbbr', 'VisitorTeamAbbr'],
    inplace=True,
)

In [None]:
# wrangle offensive personnel in 3 steps:
# 1. manually check suspicious groups, i.e. missing QB, having dbs, having excessive OL
# 2. (TODO: step not yet described)
# 3. (TODO: step not yet described)
#
# Step 1: each key is a raw OffensePersonnel string judged suspicious; each value is the
# hand-written replacement, spelled out as "QB, OL, RB, TE, WR" counts.
# Keys are matched exactly, including the missing space after the last comma
# (e.g. ',1 DL'), so they must not be reformatted.
#
# The result is assigned back to the column instead of calling
# `working_df['OffensePersonnel'].replace(..., inplace=True)`: that chained in-place form
# operates on a temporary Series and, under pandas Copy-on-Write, leaves working_df
# unchanged. The warning pandas emits about it is hidden by the notebook-wide
# warnings filter, so the failure would be silent.

personnel_fixes = {
    '1 RB, 2 TE, 1 WR,1 DL': '1 QB, 6 OL, 1 RB, 2 TE, 1 WR',
    '2 QB, 1 RB, 1 TE, 2 WR': '1 QB, 5 OL, 2 RB, 1 TE, 2 WR',
    '1 RB, 1 TE, 2 WR,1 DL': '1 QB, 6 OL, 1 RB, 1 TE, 2 WR',
    '2 QB, 2 RB, 1 TE, 1 WR': '1 QB, 5 OL, 3 RB, 1 TE, 1 WR',
    '1 RB, 3 TE, 0 WR,1 DL': '1 QB, 6 OL, 1 RB, 3 TE, 0 WR',
    '2 QB, 1 RB, 2 TE, 1 WR': '1 QB, 5 OL, 2 RB, 2 TE, 1 WR',
    '6 OL, 1 RB, 2 TE, 0 WR,1 LB': '1 QB, 6 OL, 2 RB, 2 TE, 0 WR',
    '6 OL, 1 RB, 2 TE, 0 WR,1 DL': '1 QB, 6 OL, 2 RB, 2 TE, 0 WR',
    '1 RB, 2 TE, 1 WR,1 LB': '1 QB, 5 OL, 2 RB, 2 TE, 1 WR',
    '1 RB, 1 TE, 2 WR,1 DB': '1 QB, 5 OL, 1 RB, 1 TE, 3 WR',
    '6 OL, 2 RB, 1 TE, 0 WR,1 DL': '1 QB, 6 OL, 3 RB, 1 TE, 0 WR',
    '2 QB, 1 RB, 0 TE, 3 WR': '1 QB, 5 OL, 2 RB, 0 TE, 3 WR',
    '2 QB, 6 OL, 1 RB, 1 TE, 1 WR': '1 QB, 6 OL, 2 RB, 1 TE, 1 WR',
    '6 OL, 1 RB, 1 TE, 1 WR,1 DL': '1 QB, 6 OL, 2 RB, 1 TE, 1 WR',
    '1 RB, 3 TE, 0 WR,1 LB': '1 QB, 5 OL, 2 RB, 3 TE, 0 WR',
}
working_df['OffensePersonnel'] = working_df['OffensePersonnel'].replace(to_replace=personnel_fixes)

# Build one column per offensive position group by passing each OffensePersonnel string
# to the matching label* helper. The helpers are defined outside the cells shown here;
# presumably each returns that position's count -- confirm against the helper module.
for position_column, labeler in (
    ('RB', labelRB),
    ('TE', labelTE),
    ('WR', labelWR),
    ('QB', labelQB),
    ('OL', labelOL),
):
    working_df[position_column] = working_df['OffensePersonnel'].apply(labeler)