In [120]:
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
import matplotlib
%matplotlib inline

In [79]:
def bdate_to_age(date:str, today:'datetime.date | None' = None) -> int:
    """Calculates the age of a person, in whole years, from their birth date.

    Args:
        date: Birth date as a string in 'YYYY-MM-DD' format.
        today: Date the age is measured at. Defaults to the current local
            date, which makes the result depend on the day the notebook is
            run; pass a fixed date to get reproducible output.

    Returns:
        The number of full years between the birth date and `today`.

    Raises:
        ValueError: If `date` does not match the '%Y-%m-%d' format.
    """
    if today is None:
        today = datetime.date.today()
    bdate = datetime.datetime.strptime(date, '%Y-%m-%d')
    # Take one year off when this year's birthday has not been reached yet.
    birthday_pending = (today.month, today.day) < (bdate.month, bdate.day)
    return today.year - bdate.year - int(birthday_pending)

In [106]:
def height_in_inches(height:str) -> float:
    """Converts the height to inches. Assumes height is in the format 'ft-in'.

    Args:
        height: Height string such as '6-2' (6 feet 2 inches). A string
            with no '-' (e.g. '6') is read as whole feet with 0 inches.

    Returns:
        The height in inches as a float. Anything that is not a non-empty
        string (None, NaN, '', or a number) is returned unchanged, so
        applying the function a second time to an already-converted column
        is a no-op instead of an error.

    Raises:
        ValueError: If a part of the string is not a valid number.
    """
    if not isinstance(height, str) or not height:
        return height
    parts = [float(i) for i in height.split("-")]
    feet = parts[0]
    # The inches part is optional, mirroring how age_str_to_float treats days.
    inches = parts[1] if len(parts) > 1 else 0.0
    return (feet * 12) + inches

In [114]:
def age_str_to_float(age:str) -> float:
    """Converts the age to a numeric value. Assumes age is in the format 'age-days'.

    Args:
        age: Age string such as '23-120' (23 years and 120 days). The days
            part is optional: '23' is read as exactly 23 years.

    Returns:
        The age in years as a float. Anything that is not a non-empty
        string (None, NaN, '', or a number) is returned unchanged, so
        applying the function a second time to an already-converted column
        is a no-op instead of an error.

    Raises:
        ValueError: If a part of the string is not a valid number.
    """
    if not isinstance(age, str) or not age:
        return age
    parts = [float(i) for i in age.split("-")]
    years = parts[0]
    extra_days = parts[1] if len(parts) > 1 else 0
    # A year is approximated as 365 days; leap days are ignored.
    return years + extra_days / 365

In [70]:
# Load the games table of the NFL player-stats dataset from its JSON export.
games = pd.read_json(
    path_or_buf="nfl-football-player-stats/games_1512362753.8735218.json"
)

In [71]:
# Load the profiles table of the NFL player-stats dataset from its JSON export.
profiles = pd.read_json(
    path_or_buf="nfl-football-player-stats/profiles_1512362725.022629.json"
)

In [73]:
# Combine the games and profiles frames. No join key is given, so pandas
# joins on every column name the two frames have in common, and only rows
# present in both are kept (inner join).
games_played = pd.merge(games, profiles, how="inner")

In [86]:
# Disabled check: names that appear in profiles but have no rows in games_played after the merge.
# set(profiles.name.unique()) - set(games_played.name.unique())

In [76]:
# Remove the Hall of Fame induction year column. errors='ignore' keeps the
# cell re-runnable: once the column is gone, running it again is a no-op
# rather than a KeyError.
games_played = games_played.drop(columns=['hof_induction_year'], errors='ignore')

In [80]:
# Spot-check the age conversion on the birth date of the row labelled 1.
bdate_to_age(games_played.loc[1, 'birth_date'])

51

In [81]:
# Derive each row's present-day player age from the birth_date column.
games_played['current_age'] = games_played['birth_date'].map(bdate_to_age)

In [108]:
# Replace the 'ft-in' height strings with numeric inches.
games_played['height'] = games_played['height'].apply(height_in_inches)

In [116]:
# Replace the 'age-days' strings with a fractional number of years.
games_played['age'] = games_played['age'].apply(age_str_to_float)

In [128]:
# Target column for the prediction task: True when the player's team
# scored strictly more than the opponent, so tied games come out as False.
# Open question: should the model predict a win probability instead of
# this hard True/False label?
games_played['win'] = games_played['player_team_score'].gt(games_played['opponent_score'])

In [129]:
# Preview the first five rows of the prepared frame.
games_played.head(n=5)

Unnamed: 0,age,date,defense_interception_touchdowns,defense_interception_yards,defense_interceptions,defense_sacks,defense_safeties,defense_tackle_assists,defense_tackles,field_goal_attempts,...,draft_round,draft_team,draft_year,height,high_school,name,position,weight,current_age,win
0,23.328767,1990-09-09,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,False
1,23.347945,1990-09-16,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,False
2,23.367123,1990-09-23,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,False
3,23.389041,1990-10-01,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,True
4,23.405479,1990-10-07,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,True


In [139]:
# TODO: decide whether games of players who have since died should be
# excluded from the dataset. If so, enable the filter below.
# games_played = games_played.loc[games_played.death_date.isna(), :]

Unnamed: 0,age,date,defense_interception_touchdowns,defense_interception_yards,defense_interceptions,defense_sacks,defense_safeties,defense_tackle_assists,defense_tackles,field_goal_attempts,...,draft_round,draft_team,draft_year,height,high_school,name,position,weight,current_age,win
0,23.328767,1990-09-09,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,False
1,23.347945,1990-09-16,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,False
2,23.367123,1990-09-23,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,False
3,23.389041,1990-10-01,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,True
4,23.405479,1990-10-07,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,True
5,23.424658,1990-10-14,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,False
6,23.482192,1990-11-04,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,False
7,23.501370,1990-11-11,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,True
8,23.520548,1990-11-18,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,False
9,23.539726,1990-11-25,0,0,0,0.0,0,0,0,0,...,2.0,Seattle Seahawks,1990.0,72.0,"Van Vleck, TX",Robert Blackmon,DB,208.0,51,True
