In [1]:
## for data
import pandas as pd
import numpy as np
from numpy import cov

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for statistical tests
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm
import statistics
from scipy.stats import pearsonr
import math

## for machine learning
import sklearn
from sklearn import preprocessing, svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import string


Here is my data imported via pandas:

In [2]:
df = pd.read_csv('fdata.csv', encoding = "ISO-8859-1")


#This is the complete list of games from the 2020 NFL season. I am going to clean up the data here in this notebook before 
#reassembling it into a df.

In [3]:
def PointTotals(lst):
    point_totals = []
    for i in lst:
        total_score = int(i[0:2]) + int(i[3:5])
        point_totals.append(total_score)
    return point_totals
    
point_total = PointTotals(df['score'])

#This function adds the point totals together for all of the games, giving us the total score. One flaw with this is that I had
#to go through the excel file and make sure every score had two values i.e. "09" instead of "9"

In [4]:

def CleanUp_TeamNames(lst):
    letters = string.ascii_lowercase
    new_list = []
    
    for word in lst:
        new_word = ''
        for character in word:
            if character.lower() in letters:
                new_word += character
            else:
                break 
        new_list.append(new_word)
   
    return new_list
                        
        
Team1New = CleanUp_TeamNames(df['team 1'])
Team2New = CleanUp_TeamNames(df['team 2'])
             
#This function cleans up the team names so that the data from the website gets trimmed down to just the actual team name


       
        
    

In [5]:
week_list = df['week number']

In [6]:
#in the block below I am going to try and make a function that takes a file name as an input and returns a df with the averages
#for the stats I want by week

In [7]:
def AveragedDataFrame(file_name):
    
    working_df = pd.read_csv(file_name, encoding = "ISO-8859-1")
    
    new_df = pd.DataFrame(index = working_df.loc[1:, 'Week'] )
    
    #this is the function to get the averages for the columns so that each week is matched up with the averages of all the stats
    #from the previous weeks 
    def GetTheAverages(column_name):
        averaged_list = []
        total = 0
        count = 0
        for item in column_name[1:]:
            total += column_name[count]
            count += 1
            average = total/count
            averaged_list.append(average)
        return averaged_list
  
    
    TmList = GetTheAverages(working_df['Tm'])
    OpList = GetTheAverages(working_df['Opp'])
    D1List = GetTheAverages(working_df['D1'])
    TYList = GetTheAverages(working_df['TY'])
    PYList = GetTheAverages(working_df['PY'])
    RYList = GetTheAverages(working_df['RY'])
    D1AList = GetTheAverages(working_df['D1A'])
    TYAList = GetTheAverages(working_df['TYA'])
    PYAList = GetTheAverages(working_df['PYA'])
    RYAList = GetTheAverages(working_df['RYA'])
    OXPList = GetTheAverages(working_df['OXP'])
    DXPList = GetTheAverages(working_df['DXP'])
    STXPList = GetTheAverages(working_df['STXP'])
    NameList = [file_name[0:-8] for i in range(0,len(TmList))] #this creats a series filled with the name of the team, 
                                                                #so that I can reference it in a for loop later on
    
    
    #new_df['Week'] = working_df.loc[1:, 'Week']
    new_df['AveragedTm'] = TmList
    new_df['AveragedOp'] = OpList      
    new_df['AveragedD1'] = D1List       
    new_df['AveragedTY'] = TYList       
    new_df['AveragedPY'] = PYList   
    new_df['AveragedRY'] = RYList
    new_df['AveragedD1A'] = D1AList
    new_df['AveragedTYA'] = TYAList
    new_df['AveragedPYA'] = PYAList
    new_df['AveragedRYA'] = RYAList
    new_df['AveragedOXP'] = OXPList        
    new_df['AveragedDXP'] = DXPList        
    new_df['AveragedSTXP'] = STXPList   
    new_df['Team Name'] = NameList



    return new_df
   
    
    
    
    
    
    

In [21]:
Bills = AveragedDataFrame('bills2020.csv')
Broncos = AveragedDataFrame('broncos2020.csv')
Chargers = AveragedDataFrame('chargers2020.csv')
Raiders = AveragedDataFrame('raiders2020.csv')
Chiefs = AveragedDataFrame('chiefs2020.csv')
Jaguars = AveragedDataFrame('jaguars2020.csv')
Texans = AveragedDataFrame('texans2020.csv')
Colts = AveragedDataFrame('colts2020.csv')
Titans = AveragedDataFrame('titans2020.csv')
Bengals = AveragedDataFrame('bengals2020.csv')
Ravens = AveragedDataFrame('ravens2020.csv')
Browns = AveragedDataFrame('browns2020.csv')
Steelers = AveragedDataFrame('steelers2020.csv')
Patriots = AveragedDataFrame('patriots2020.csv')
Dolphins = AveragedDataFrame('dolphins2020.csv')
Panthers = AveragedDataFrame('panthers2020.csv')
Saints = AveragedDataFrame('saints2020.csv')
Vikings = AveragedDataFrame('vikings2020.csv')
Fortyniners = AveragedDataFrame('fortyniners2020.csv')
Cardinals = AveragedDataFrame('cardinals2020.csv')
Rams = AveragedDataFrame('rams2020.csv')
Seahawks = AveragedDataFrame('seahawks2020.csv')
Falcons = AveragedDataFrame('falcons2020.csv')
Buccaneers = AveragedDataFrame('buccaneers2020.csv')
Lions = AveragedDataFrame('lions2020.csv')
Bears = AveragedDataFrame('bears2020.csv')
Packers = AveragedDataFrame('packers2020.csv')
Eagles = AveragedDataFrame('eagles2020.csv')
Giants = AveragedDataFrame('giants2020.csv')
Redskins = AveragedDataFrame('redskins2020.csv')
Cowboys = AveragedDataFrame('cowboys2020.csv')
Jets = AveragedDataFrame('jets2020.csv')

    


list_of_dataframes = [Bills, Broncos, Chargers, Raiders, Chiefs, Jaguars, Texans, 
                      Colts, Titans, Bengals, Ravens, Browns, Steelers, Patriots, Dolphins, 
                      Panthers, Saints, Vikings, Fortyniners, Cardinals, Rams, Seahawks,
                     Falcons, Buccaneers, Lions, Bears, Packers, Eagles, Giants, Redskins,
                     Cowboys, Jets]

In [22]:
def BuildOut(column_name, team_list):
    lst = [] #the list getting built out that will become the column in the new df
    
    name = column_name[:-3] #This is so the column name from the master_df matches the column names for the individual dfs
    week_row_counter = 0 #this is what I use to ensure that I move on to a consecutive row in the week_list
    
    
    for week_number in week_list: #going through each week first (there are multiples of the same week to account for each game
                                    # that was held during that week, i.e. 2 appears however many times there were games in week 2)
        
        for i in team_list[week_row_counter:]: #this just starts the search through the team list at the corresponding row to the
                                               #that played in that week
            for df in list_of_dataframes:
                
                value = df.at[2, 'Team Name'] #this is just creating a name to match for in the team_list for loop with something
                                              #in the dataframe 
                
                
                if i.lower() == value:
                    
                    adjusted_wvalue = week_number + 2 #had to do this to 
                    
                    number2 = df.index.get_loc(adjusted_wvalue) #supposed to return the index location of the week that we are on
                                                                # in the week_number for loop
                    
                    value2 = df.at[number2, name] #this is using the index location pulled by the the previous line to find the
                                                  #correct week and column for the value I want to add to the list
                    lst.append(value2)
                    
                    week_row_counter += 1 #this adds to the week counter so that the for loop through the team_list knows to skip
                                          #the teams info we just added. If I didnt add this, the whole loop would just add the 
                                          #first team's info over and over
                    
                    
                else:
                    continue
               
                    
                    
               
                break
            break
            
            
                
            
                
            
           
    return lst
          


In [23]:
master_df = pd.DataFrame()

#master_df['Total Scores'] = point_total
#master_df['Team 1'] = Team1New
#master_df['Team 2'] = Team2New
#master_df['Week'] = week_list
master_df['AveragedTm T1'] = BuildOut('AveragedTm T1', Team1New)
master_df['AveragedOp T1'] = BuildOut('AveragedOp T1', Team1New)   
master_df['AveragedD1 T1'] =  BuildOut('AveragedD1 T1', Team1New)     
master_df['AveragedTY T1'] =  BuildOut('AveragedTY T1', Team1New) 
master_df['AveragedPY T1'] =  BuildOut('AveragedPY T1', Team1New)
master_df['AveragedRY T1'] = BuildOut('AveragedRY T1', Team1New)
master_df['AveragedD1A T1'] = BuildOut('AveragedD1A T1', Team1New)
master_df['AveragedTYA T1'] = BuildOut('AveragedTYA T1', Team1New)
master_df['AveragedPYA T1'] = BuildOut('AveragedPYA T1', Team1New)
master_df['AveragedRYA T1'] = BuildOut('AveragedRYA T1', Team1New)
master_df['AveragedOXP T1'] = BuildOut('AveragedOXP T1', Team1New) 
master_df['AveragedDXP T1'] = BuildOut('AveragedDXP T1', Team1New) 
master_df['AveragedSTXP T1'] = BuildOut('AveragedSTXP T1', Team1New) 
#master_df['AveragedTm T2'] = BuildOut('AveragedTm T2', Team2New)
#master_df['AveragedOp T2'] = BuildOut('AveragedOp T2', Team2New)   
#master_df['AveragedD1 T2'] = BuildOut('AveragedD1 T2', Team2New)      
#master_df['AveragedTY T2'] = BuildOut('AveragedTY T2', Team2New) 
#master_df['AveragedPY T2'] = BuildOut('AveragedPY T2', Team2New)
#master_df['AveragedRY T2'] = BuildOut('AveragedRY T2', Team2New)
#master_df['AveragedD1A T2'] = BuildOut('AveragedD1A T2', Team2New)
#master_df['AveragedTYA T2'] = BuildOut('AveragedTYA T2', Team2New)
#master_df['AveragedPYA T2'] = BuildOut('AveragedPYA T2', Team2New)
#master_df['AveragedRYA T2'] = BuildOut('AveragedRYA T2', Team2New)
#master_df['AveragedOXP T2'] = BuildOut('AveragedOXP T2', Team2New)     
#master_df['AveragedDXP T2'] = BuildOut('AveragedDXP T2', Team2New)    
#master_df['AveragedSTXP T2'] = BuildOut('AveragedSTXP T2', Team2New)



KeyError: 4

In [None]:
print(master_df)

4
