In [1]:
#Importando librerías
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import time
from pandas.api.types import is_numeric_dtype

In [2]:
"""
Definiendo variables con las que se va a trabajar en el dataset a evaluar:
- Data de los últimos 10 años
- Para ofensiva y defensiva
- Para pases y acarreos
"""

years = sorted([i for i in range(2011,2020)], reverse = True)
teams = ['offense', 'defense']
categories = ['passing','rushing']

In [3]:
# Build a DataFrame from the flat list of cells scraped out of one table

def create_df(data, col_names, col_number):
    """Reshape a flat sequence of cell values into a table indexed by 'Team'.

    Parameters
    ----------
    data : sequence
        Cell values in row-major order.
    col_names : list
        Column labels for the resulting frame; must include 'Team'.
    col_number : int
        Number of cells that make up one row (varies per scraped table).

    Returns
    -------
    pandas.DataFrame
        One row per group of `col_number` cells, with 'Team' as the index.
    """
    # Cut the flat list into consecutive rows of `col_number` cells each
    rows = []
    for start in range(0, len(data), col_number):
        rows.append(data[start:start + col_number])

    # Use the 'Team' column as the index
    return pd.DataFrame(rows, columns=col_names).set_index(['Team'])

In [4]:
# File stems of every .csv that pandas writes, formatted team-category-year.
# Built in the same order in which the loops below visit the combinations.
names = [f"{team}-{category}-{year}" for year in years for category in categories for team in teams]

# Scrape every team/category/year page and store each table as a CSV file
for year in years:
    for category in categories:
        for team in teams:

            # Build the page URL for this team side, category and season
            link = f"https://www.nfl.com/stats/team-stats/{team}/{category}/{year}/reg/all"

            # A timeout keeps a stalled connection from hanging the notebook, and
            # raise_for_status() stops an HTTP error page from being parsed as data.
            response = requests.get(link, timeout=30)
            response.raise_for_status()
            html = response.content

            # Random pause of 0-9 seconds between consecutive requests
            wait = np.random.randint(10)
            time.sleep(wait)

            soup = BeautifulSoup(html, 'lxml')
            table = soup.select('td')
            columns = soup.select('thead')

            # Strip surrounding whitespace from the text of each element
            table_strip = [element.text.strip() for element in table]
            columns_strip = [element.text.strip() for element in columns]

            # Fail with a clear message instead of an IndexError when no header exists
            if not columns_strip:
                raise ValueError(f"No table header (<thead>) found at {link}")

            # Second pass with regex: remove line breaks and repeated information
            clean_table = [re.sub(r'\w.*\n+\s*,??', '', cell) for cell in table_strip]
            # Only the first <thead> is used; its line breaks become column separators
            clean_columns = re.sub(r'\n+\s*,??', ',', columns_strip[0]).split(",")

            # Number of columns per row, which depends on the page being scraped
            number = 11 if category == 'rushing' else 16 if team == 'offense' else 15

            # Build the DataFrame, tag it, and save it under its team-category-year name
            df = create_df(clean_table, clean_columns, number)
            df['Year'] = year
            df['Category'] = category
            # NOTE(review): create_df already names the index 'Team', so this adds a
            # second 'Team' field (holding offense/defense) to the CSV header.
            df['Team'] = team
            # The name is derived from the loop variables directly, so it cannot drift
            # out of step with a separately maintained counter.
            df.to_csv(f"{team}-{category}-{year}.csv")


In [6]:
# Fetch the list of the ten most recent Super Bowl champions
url = 'https://www.topendsports.com/events/super-bowl/winners-list.htm'

# A timeout avoids hanging on a stalled connection; raise_for_status() stops an
# HTTP error page from being parsed as if it were the winners table.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'lxml')

table_tag = soup.find('table', class_='list')
if table_tag is None:
    raise ValueError(f"No <table class='list'> found at {url}")

# Flatten the table text: runs of line breaks become commas, then split into cells
table = table_tag.get_text().strip()
clean = re.sub(r'\n+\s*,??', ',', table).split(",")

# The first six cells are the header; the rest are rows of six cells each
columns = clean[0:6]
parts = [clean[x:x+6] for x in range(6, len(clean), 6)]

# .copy() makes the ten-row slice an independent frame that is safe to modify
winners = pd.DataFrame(parts, columns=columns).head(10).copy()

# Keep only the last word of each winner's name (the team nickname). This is a
# plain assignment; the original chained assignment also overwrote the
# Series.replace attribute on a temporary object.
winners['Winner'] = [name.rsplit(None, 1)[-1] for name in winners['Winner']]

winners = winners[['Winner', 'Year']].rename(columns={'Winner': 'Team'})

# NOTE(review): the scraped 'Year' values are replaced by position with 2019..2010,
# which assumes the first ten rows of the page are those seasons in descending
# order. Confirm against the live page before relying on it.
winners['Year'] = list(range(2019, 2009, -1))
winners

Unnamed: 0,Team,Year
0,Chiefs,2019
1,Patriots,2018
2,Eagles,2017
3,Patriots,2016
4,Broncos,2015
5,Patriots,2014
6,Seahawks,2013
7,Ravens,2012
8,Giants,2011
9,Packers,2010


In [7]:
"""
Obteniendo las estadísticas para la ofensiva y defensiva (pase y carrera) de los campeones en los últimos 10 años
"""

offense_passing, offense_rushing, defense_passing, defense_rushing = ([] for i in range(4))

for team in teams:
    for category in categories:
        for year in years:
            data = pd.read_csv(f"./{team}-{category}-{year}.csv")
            table = pd.merge(data, winners)
            if team == 'offense':
                if category == 'passing':
                    offense_passing.append(table)
                else:
                    offense_rushing.append(table)
            else:
                if category == 'passing':
                    defense_passing.append(table)
                else:
                    defense_rushing.append(table)

final_offense_passing = pd.concat(offense_passing)
final_offense_passing = final_offense_passing.set_index(['Team'])

final_offense_rushing = pd.concat(offense_rushing)
final_offense_rushing = final_offense_rushing.set_index(['Team'])                    

final_defense_passing = pd.concat(defense_passing)
final_defense_passing = final_defense_passing.set_index(['Team'])

final_defense_rushing = pd.concat(defense_rushing)
final_defense_rushing = final_defense_rushing.set_index(['Team'])

names = [f"final-{team}-{category}.csv" for category in categories for team in teams]    

final_offense_passing.to_csv(names[0])
final_defense_passing.to_csv(names[1])
final_offense_rushing.to_csv(names[2])
final_defense_rushing.to_csv(names[3])

In [8]:
"""
Obteniendo el promedio de los demás equipos que no ganaron el SB por año
"""

def mean_str(col):
    if is_numeric_dtype(col):
        return col.mean()
    else:
        return col.unique() if col.nunique() == 1 else 'AVG others'

rest_offense_passing, rest_offense_rushing, rest_defense_passing, rest_defense_rushing = [], [], [], []

for team in teams:
    for category in categories:
        for year in years:

            data = pd.read_csv(f"./{team}-{category}-{year}.csv")

            # Remove any 'T' characters from 'Lng' and make it an integer. The cast
            # to str keeps the .str accessor working when read_csv already parsed
            # the column as numeric.
            data['Lng'] = data['Lng'].astype(str).str.replace('T', '', regex=False).astype('int')

            # Champion row(s) for this file: same team name and same season
            table = pd.merge(data, winners, on=['Team', 'Year'])

            # Exclude the champion with a boolean mask. This works for zero, one or
            # several matches; if no champion matches, no row is removed.
            data = data[~data['Team'].isin(table['Team'])]

            # Collapse the remaining teams into a single row per season
            data = data.groupby('Year').agg(mean_str).reset_index()

            # Pick the list that belongs to this side/category combination
            if category == 'passing':
                bucket = rest_offense_passing if team == 'offense' else rest_defense_passing
            else:
                bucket = rest_offense_rushing if team == 'offense' else rest_defense_rushing
            bucket.append(data)

# Stack the per-season averages of each combination and index them by 'Team'
others_offense_passing = pd.concat(rest_offense_passing).set_index(['Team'])
others_offense_rushing = pd.concat(rest_offense_rushing).set_index(['Team'])
others_defense_passing = pd.concat(rest_defense_passing).set_index(['Team'])
others_defense_rushing = pd.concat(rest_defense_rushing).set_index(['Team'])

# Output file names; category is the outer loop, so teams alternate within each category
names = [f"others-{team}-{category}.csv" for category in categories for team in teams]

others_offense_passing.to_csv(names[0])
others_defense_passing.to_csv(names[1])
others_offense_rushing.to_csv(names[2])
others_defense_rushing.to_csv(names[3])

In [9]:
"""
Obteniendo el promedio de todos los equipos
"""

def mean_str(col):
    if is_numeric_dtype(col):
        return col.mean()
    else:
        return col.unique() if col.nunique() == 1 else 'AVG others'

all_offense_passing, all_offense_rushing, all_defense_passing, all_defense_rushing = [], [], [], []

for team in teams:
    for category in categories:
        for year in years:

            data = pd.read_csv(f"./{team}-{category}-{year}.csv")

            # Remove any 'T' characters from 'Lng' and make it an integer. The cast
            # to str keeps the .str accessor working when read_csv already parsed
            # the column as numeric.
            data['Lng'] = data['Lng'].astype(str).str.replace('T', '', regex=False).astype('int')

            # One aggregated row per team for this season's file
            data = data.groupby('Team').agg(mean_str).reset_index()

            # Pick the list that belongs to this side/category combination
            if category == 'passing':
                bucket = all_offense_passing if team == 'offense' else all_defense_passing
            else:
                bucket = all_offense_rushing if team == 'offense' else all_defense_rushing
            bucket.append(data)


def _team_average(frames):
    """Stack per-season frames and return each team's mean, rounded to 1 decimal."""
    stacked = pd.concat(frames).set_index(['Team'])
    # numeric_only=True skips the text columns, which make mean() raise a
    # TypeError on pandas 2.x where they are no longer dropped automatically.
    return stacked.groupby('Team').mean(numeric_only=True).round(1)


offense_passing = _team_average(all_offense_passing)
offense_rushing = _team_average(all_offense_rushing)
defense_passing = _team_average(all_defense_passing)
defense_rushing = _team_average(all_defense_rushing)

# Output file names; category is the outer loop, so teams alternate within each category
names = [f"all-{team}-{category}.csv" for category in categories for team in teams]

offense_passing.to_csv(names[0])
defense_passing.to_csv(names[1])
offense_rushing.to_csv(names[2])
defense_rushing.to_csv(names[3])