In [2]:
import requests
import urllib.request
import time
import pandas as pd

In [5]:
# Lift pandas' display limits so frames are rendered with every column and row
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [6]:
# Schedule page for the 2017 season
url = (
    'https://www.pro-football-reference.com'
    '/years/2017/games.htm'
)

In [7]:
# Parse every HTML table at `url` and keep the first one (the schedule shown below)
df_test = pd.read_html(url)[0]

In [8]:
# Preview the first three rows of the scraped table
df_test.head(3)

Unnamed: 0,Week,Day,Date,Time,Winner/tie,Unnamed: 5,Loser/tie,Unnamed: 7,PtsW,PtsL,YdsW,TOW,YdsL,TOL
0,1,Thu,September 7,8:30PM,Kansas City Chiefs,@,New England Patriots,boxscore,42,27,537,1,371,0
1,1,Sun,September 10,1:00PM,Oakland Raiders,@,Tennessee Titans,boxscore,26,16,359,0,350,0
2,1,Sun,September 10,1:00PM,Atlanta Falcons,@,Chicago Bears,boxscore,23,17,372,0,301,0


In [9]:
# Inspect the column dtypes (every scraped column comes back as object)
df_test.dtypes

Week          object
Day           object
Date          object
Time          object
Winner/tie    object
Unnamed: 5    object
Loser/tie     object
Unnamed: 7    object
PtsW          object
PtsL          object
YdsW          object
TOW           object
YdsL          object
TOL           object
dtype: object

In [10]:
# Rename the unnamed column that marks the game location
# ('@' in this column means the winner was the visiting team)
df_test = df_test.rename(columns={"Unnamed: 5": "location"})

In [11]:
# Tag every row with the season; note the value is stored as a string
df_test['year'] = '2017'

In [12]:
# Collect every distinct value from both team columns; the combined list can
# still contain duplicates and header text, which are removed in the next cell
start_list = [
    *df_test['Winner/tie'].unique(),
    *df_test['Loser/tie'].unique(),
]

In [13]:
# Build the de-duplicated team list (exported to Excel to create the mapping dict)
final_list = []
for team in start_list:
    # Skip missing values and repeated header text
    if str(team) == 'nan' or team in (' nan', 'Loser/tie', 'Winner/tie'):
        continue
    if team not in final_list:
        final_list.append(team)
final_list, len(final_list)  # expect all 32 teams

(['Kansas City Chiefs',
  'Oakland Raiders',
  'Atlanta Falcons',
  'Buffalo Bills',
  'Pittsburgh Steelers',
  'Baltimore Ravens',
  'Philadelphia Eagles',
  'Detroit Lions',
  'Jacksonville Jaguars',
  'Los Angeles Rams',
  'Carolina Panthers',
  'Green Bay Packers',
  'Dallas Cowboys',
  'Minnesota Vikings',
  'Denver Broncos',
  'Houston Texans',
  'Arizona Cardinals',
  'New England Patriots',
  'Tampa Bay Buccaneers',
  'Tennessee Titans',
  'Miami Dolphins',
  'Seattle Seahawks',
  'Washington Redskins',
  'New York Jets',
  'Indianapolis Colts',
  'Chicago Bears',
  'New Orleans Saints',
  'Cincinnati Bengals',
  'Los Angeles Chargers',
  'New York Giants',
  'San Francisco 49ers',
  'Cleveland Browns'],
 32)

In [14]:
# '@' in `location` means the winner was the visitor, so the loser hosted;
# for any other value the winner is treated as the home team
df_test['home_team'] = df_test['Loser/tie'].where(df_test['location'] == '@', df_test['Winner/tie'])
df_test['away_team'] = df_test['Winner/tie'].where(df_test['location'] == '@', df_test['Loser/tie'])

In [15]:
# Check the derived home_team / away_team columns
df_test.head()

Unnamed: 0,Week,Day,Date,Time,Winner/tie,location,Loser/tie,Unnamed: 7,PtsW,PtsL,YdsW,TOW,YdsL,TOL,year,home_team,away_team
0,1,Thu,September 7,8:30PM,Kansas City Chiefs,@,New England Patriots,boxscore,42,27,537,1,371,0,2017,New England Patriots,Kansas City Chiefs
1,1,Sun,September 10,1:00PM,Oakland Raiders,@,Tennessee Titans,boxscore,26,16,359,0,350,0,2017,Tennessee Titans,Oakland Raiders
2,1,Sun,September 10,1:00PM,Atlanta Falcons,@,Chicago Bears,boxscore,23,17,372,0,301,0,2017,Chicago Bears,Atlanta Falcons
3,1,Sun,September 10,1:00PM,Buffalo Bills,,New York Jets,boxscore,21,12,408,1,214,2,2017,Buffalo Bills,New York Jets
4,1,Sun,September 10,1:00PM,Pittsburgh Steelers,@,Cleveland Browns,boxscore,21,18,290,1,237,1,2017,Cleveland Browns,Pittsburgh Steelers


In [16]:
# Full team name (as it appears on the schedule page) -> abbreviation
team_mapping = {
    'Arizona Cardinals': 'ARI',
    'Atlanta Falcons': 'ATL',
    'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF',
    'Carolina Panthers': 'CAR',
    'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN',
    'Cleveland Browns': 'CLE',
    'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN',
    'Detroit Lions': 'DET',
    'Green Bay Packers': 'GB',
    'Houston Texans': 'HOU',
    'Indianapolis Colts': 'IND',
    'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC',
    'Los Angeles Rams': 'LA',
    'Los Angeles Chargers': 'LAC',
    'Miami Dolphins': 'MIA',
    'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE',
    'New Orleans Saints': 'NO',
    'New York Giants': 'NYG',
    'New York Jets': 'NYJ',
    'Oakland Raiders': 'OAK',
    'Philadelphia Eagles': 'PHI',
    'Pittsburgh Steelers': 'PIT',
    'Seattle Seahawks': 'SEA',
    'San Francisco 49ers': 'SF',
    'Tampa Bay Buccaneers': 'TB',
    'Tennessee Titans': 'TEN',
    'Washington Redskins': 'WAS',
}

In [17]:
# Swap full team names for their abbreviations, in the two derived columns only
df_test = df_test.replace({
    'home_team': team_mapping,
    'away_team': team_mapping,
})
df_test.head()

Unnamed: 0,Week,Day,Date,Time,Winner/tie,location,Loser/tie,Unnamed: 7,PtsW,PtsL,YdsW,TOW,YdsL,TOL,year,home_team,away_team
0,1,Thu,September 7,8:30PM,Kansas City Chiefs,@,New England Patriots,boxscore,42,27,537,1,371,0,2017,NE,KC
1,1,Sun,September 10,1:00PM,Oakland Raiders,@,Tennessee Titans,boxscore,26,16,359,0,350,0,2017,TEN,OAK
2,1,Sun,September 10,1:00PM,Atlanta Falcons,@,Chicago Bears,boxscore,23,17,372,0,301,0,2017,CHI,ATL
3,1,Sun,September 10,1:00PM,Buffalo Bills,,New York Jets,boxscore,21,12,408,1,214,2,2017,BUF,NYJ
4,1,Sun,September 10,1:00PM,Pittsburgh Steelers,@,Cleveland Browns,boxscore,21,18,290,1,237,1,2017,CLE,PIT


In [18]:
# Exclude the repeated header rows and the 'Playoffs' separator row.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice (which raises SettingWithCopyWarning).
df_test = df_test[~df_test['Date'].isin(['Date','Playoffs'])].copy()

In [19]:
# The need to remove header rows was determined by inspecting the unique Date values:
# list(df_test['Date'].unique())

In [20]:
# The page lists dates without a year. The 2017 season runs into early 2018
# (the table still contains the playoff games), so dates that fall before July
# are given the following calendar year instead of 2017.
df_test['Date'] = pd.to_datetime(df_test['Date'] + ', 2017').where(
    lambda parsed: parsed.dt.month >= 7,
    pd.to_datetime(df_test['Date'] + ', 2018'),
)

In [21]:
# Confirm that Date now displays as a full calendar date
df_test.head()

Unnamed: 0,Week,Day,Date,Time,Winner/tie,location,Loser/tie,Unnamed: 7,PtsW,PtsL,YdsW,TOW,YdsL,TOL,year,home_team,away_team
0,1,Thu,2017-09-07,8:30PM,Kansas City Chiefs,@,New England Patriots,boxscore,42,27,537,1,371,0,2017,NE,KC
1,1,Sun,2017-09-10,1:00PM,Oakland Raiders,@,Tennessee Titans,boxscore,26,16,359,0,350,0,2017,TEN,OAK
2,1,Sun,2017-09-10,1:00PM,Atlanta Falcons,@,Chicago Bears,boxscore,23,17,372,0,301,0,2017,CHI,ATL
3,1,Sun,2017-09-10,1:00PM,Buffalo Bills,,New York Jets,boxscore,21,12,408,1,214,2,2017,BUF,NYJ
4,1,Sun,2017-09-10,1:00PM,Pittsburgh Steelers,@,Cleveland Browns,boxscore,21,18,290,1,237,1,2017,CLE,PIT


In [22]:
# Parse the kickoff strings (e.g. '8:30PM') and keep only the time-of-day part
df_test['Time'] = pd.to_datetime(df_test['Time']).dt.time

In [23]:
# Keep just the columns needed to identify a game
df_final = df_test.loc[:, ['Date', 'Time', 'home_team', 'away_team']]
df_final

Unnamed: 0,Date,Time,home_team,away_team
0,2017-09-07,20:30:00,NE,KC
1,2017-09-10,13:00:00,TEN,OAK
2,2017-09-10,13:00:00,CHI,ATL
3,2017-09-10,13:00:00,BUF,NYJ
4,2017-09-10,13:00:00,CLE,PIT
5,2017-09-10,13:00:00,CIN,BAL
6,2017-09-10,13:00:00,WAS,PHI
7,2017-09-10,13:00:00,DET,ARI
8,2017-09-10,13:00:00,HOU,JAX
9,2017-09-10,16:05:00,LA,IND


In [35]:
def game_schedule(year):
    """Scrape one NFL season's game schedule from pro-football-reference.com.

    Parameters
    ----------
    year : str or int
        Season to fetch, i.e. the calendar year in which the season starts
        (``'2017'`` and ``2017`` are both accepted).

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns ``Date`` (datetime), ``Time`` (the
        kickoff string exactly as scraped, e.g. ``'8:30PM'``), ``home_team``
        and ``away_team`` (team abbreviations; names missing from the mapping
        are passed through unchanged). Rows keep their index from the scraped
        table, so the index has gaps where header rows were dropped.

    Notes
    -----
    The page lists dates without a year. Games dated before July are assigned
    to ``year + 1``, because a season that starts in ``year`` finishes early in
    the following calendar year.
    """
    season = str(year)
    url = 'https://www.pro-football-reference.com/years/' + season + '/games.htm'

    # The first table on the page is the game-by-game schedule
    raw = pd.read_html(url)[0]

    # Renaming the column that determines the location of the game
    raw = raw.rename(columns={"Unnamed: 5": "location"})

    # Remove repeated header rows and the 'Playoffs' separator row. The copy
    # keeps the assignments below from writing into a slice of `raw`.
    games = raw[~raw['Date'].isin(['Date', 'Playoffs'])].copy()

    # '@' means the winner was the visitor, so the loser hosted. Any other
    # value (including a blank) is treated as "winner was at home".
    winner_was_away = games['location'] == '@'
    games['home_team'] = games['Loser/tie'].where(winner_was_away, games['Winner/tie'])
    games['away_team'] = games['Winner/tie'].where(winner_was_away, games['Loser/tie'])

    team_mapping = {
        'Arizona Cardinals': 'ARI',
        'Atlanta Falcons': 'ATL',
        'Baltimore Ravens': 'BAL',
        'Buffalo Bills': 'BUF',
        'Carolina Panthers': 'CAR',
        'Chicago Bears': 'CHI',
        'Cincinnati Bengals': 'CIN',
        'Cleveland Browns': 'CLE',
        'Dallas Cowboys': 'DAL',
        'Denver Broncos': 'DEN',
        'Detroit Lions': 'DET',
        'Green Bay Packers': 'GB',
        'Houston Texans': 'HOU',
        'Indianapolis Colts': 'IND',
        'Jacksonville Jaguars': 'JAX',
        'Kansas City Chiefs': 'KC',
        'Los Angeles Rams': 'LA',
        'Los Angeles Chargers': 'LAC',
        'Miami Dolphins': 'MIA',
        'Minnesota Vikings': 'MIN',
        'New England Patriots': 'NE',
        'New Orleans Saints': 'NO',
        'New York Giants': 'NYG',
        'New York Jets': 'NYJ',
        'Oakland Raiders': 'OAK',
        'Philadelphia Eagles': 'PHI',
        'Pittsburgh Steelers': 'PIT',
        'Seattle Seahawks': 'SEA',
        'San Francisco 49ers': 'SF',
        'Tampa Bay Buccaneers': 'TB',
        'Tennessee Titans': 'TEN',
        'Washington Redskins': 'WAS',
        # Earlier names of relocated franchises share the current abbreviation
        'San Diego Chargers': 'LAC',
        'St. Louis Rams': 'LA',
    }

    # Replacing team names to be consistent with play-by-play data
    games = games.replace({'home_team': team_mapping, 'away_team': team_mapping})

    # Casting the date strings to dates. Parse once with the season's starting
    # year and once with the year after, then keep the latter for games dated
    # before July (late regular season and playoffs).
    in_start_year = pd.to_datetime(games['Date'] + ', ' + season)
    in_next_year = pd.to_datetime(games['Date'] + ', ' + str(int(season) + 1))
    games['Date'] = in_start_year.where(in_start_year.dt.month >= 7, in_next_year)

    return games[['Date', 'Time', 'home_team', 'away_team']]

In [36]:
# Scrape four consecutive seasons
sch_15 = game_schedule('2015')
sch_16 = game_schedule('2016')
sch_17 = game_schedule('2017')
sch_18 = game_schedule('2018')

# Stack the seasons into one frame. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported replacement and, like append did, keeps each
# season's own row index.
sched = pd.concat([sch_15, sch_16, sch_17, sch_18])
sched

Unnamed: 0,Date,Time,home_team,away_team
0,2015-09-10,8:40PM,NE,PIT
1,2015-09-13,1:00PM,LA,SEA
2,2015-09-13,1:00PM,CHI,GB
3,2015-09-13,1:02PM,WAS,MIA
4,2015-09-13,1:02PM,HOU,KC
5,2015-09-13,1:03PM,JAX,CAR
6,2015-09-13,1:03PM,BUF,IND
7,2015-09-13,1:04PM,NYJ,CLE
8,2015-09-13,4:05PM,LAC,DET
9,2015-09-13,4:05PM,ARI,NO


In [None]:
# Re-scrape the untouched 2017 schedule table into `df`
url = 'https://www.pro-football-reference.com/years/2017/games.htm'
    
df = pd.read_html(url)[0]

# Display the scraped table
df

Unnamed: 0,Week,Day,Date,Time,Winner/tie,Unnamed: 5,Loser/tie,Unnamed: 7,PtsW,PtsL,YdsW,TOW,YdsL,TOL
0,1,Thu,September 7,8:30PM,Kansas City Chiefs,@,New England Patriots,boxscore,42,27,537,1,371,0
1,1,Sun,September 10,1:00PM,Oakland Raiders,@,Tennessee Titans,boxscore,26,16,359,0,350,0
2,1,Sun,September 10,1:00PM,Atlanta Falcons,@,Chicago Bears,boxscore,23,17,372,0,301,0
3,1,Sun,September 10,1:00PM,Buffalo Bills,,New York Jets,boxscore,21,12,408,1,214,2
4,1,Sun,September 10,1:00PM,Pittsburgh Steelers,@,Cleveland Browns,boxscore,21,18,290,1,237,1
5,1,Sun,September 10,1:00PM,Baltimore Ravens,@,Cincinnati Bengals,boxscore,20,0,268,1,221,5
6,1,Sun,September 10,1:00PM,Philadelphia Eagles,@,Washington Redskins,boxscore,30,17,356,2,264,4
7,1,Sun,September 10,1:00PM,Detroit Lions,,Arizona Cardinals,boxscore,35,23,367,1,308,4
8,1,Sun,September 10,1:00PM,Jacksonville Jaguars,@,Houston Texans,boxscore,29,7,280,0,203,4
9,1,Sun,September 10,4:05PM,Los Angeles Rams,,Indianapolis Colts,boxscore,46,9,373,1,225,3


In [32]:
# Keep only the regular season: every row above the 'Playoffs' separator.
# The cut point is found as a row position (which is what .iloc expects)
# rather than an index label. The slice is skipped when there is no separator
# row, e.g. when this cell is re-run, where indexing ur_row[0] would raise
# IndexError.
ur_row = [pos for pos, value in enumerate(df['Date']) if value == 'Playoffs']
if ur_row:
    df = df.iloc[:ur_row[0]]
df

Unnamed: 0,Week,Day,Date,Time,Winner/tie,Unnamed: 5,Loser/tie,Unnamed: 7,PtsW,PtsL,YdsW,TOW,YdsL,TOL
0,1,Thu,September 7,8:30PM,Kansas City Chiefs,@,New England Patriots,boxscore,42,27,537,1,371,0
1,1,Sun,September 10,1:00PM,Oakland Raiders,@,Tennessee Titans,boxscore,26,16,359,0,350,0
2,1,Sun,September 10,1:00PM,Atlanta Falcons,@,Chicago Bears,boxscore,23,17,372,0,301,0
3,1,Sun,September 10,1:00PM,Buffalo Bills,,New York Jets,boxscore,21,12,408,1,214,2
4,1,Sun,September 10,1:00PM,Pittsburgh Steelers,@,Cleveland Browns,boxscore,21,18,290,1,237,1
5,1,Sun,September 10,1:00PM,Baltimore Ravens,@,Cincinnati Bengals,boxscore,20,0,268,1,221,5
6,1,Sun,September 10,1:00PM,Philadelphia Eagles,@,Washington Redskins,boxscore,30,17,356,2,264,4
7,1,Sun,September 10,1:00PM,Detroit Lions,,Arizona Cardinals,boxscore,35,23,367,1,308,4
8,1,Sun,September 10,1:00PM,Jacksonville Jaguars,@,Houston Texans,boxscore,29,7,280,0,203,4
9,1,Sun,September 10,4:05PM,Los Angeles Rams,,Indianapolis Colts,boxscore,46,9,373,1,225,3
