In [2]:
import random
import time
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter(action='ignore', category=FutureWarning)

def _extract_playoff_rounds(table: pd.DataFrame) -> dict:
    """Split one season's playoff results table into per-round team arrays.

    Args:
        table: frame with 'Week', 'Winner/tie' and 'Loser/tie' columns.

    Returns:
        dict whose 'WildCard', 'Division', 'ConfChamp' and 'SuperBowl' entries
        hold the 'Loser/tie' values of the rows with that 'Week' value, and
        whose 'SuperBowlWinner' entry holds the 'Winner/tie' value(s) of the
        'SuperBowl' row(s). Each value is a NumPy array.
    """
    rounds = {
        level: table.loc[table['Week'] == level, 'Loser/tie'].values
        for level in ('WildCard', 'Division', 'ConfChamp', 'SuperBowl')
    }
    rounds['SuperBowlWinner'] = table.loc[table['Week'] == 'SuperBowl', 'Winner/tie'].values
    return rounds


def get_playoff_data(years=None) -> dict:
    """Scrape per-season playoff results from pro-football-reference.com.

    For every requested season the season page is downloaded, the tables that
    are embedded inside HTML comments are parsed, and the first one exposing
    the 'Week', 'Winner/tie' and 'Loser/tie' columns is used.

    Args:
        years: iterable of season years to fetch. Defaults to 2008 through
            2023 inclusive.

    Returns:
        dict mapping year -> dict with keys 'WildCard', 'Division',
        'ConfChamp', 'SuperBowl' (teams that lost in that round) and
        'SuperBowlWinner' (team that won the 'SuperBowl' row).

    Raises:
        requests.HTTPError: if a season page answers with an error status.
        requests.Timeout: if a request takes longer than 30 seconds.
        LookupError: if no commented table on a page has the expected columns.
    """
    if years is None:
        years = range(2008, 2024)
    required_columns = {'Week', 'Winner/tie', 'Loser/tie'}
    playoff_winners = {}
    for position, year in enumerate(years):
        if position:
            # Random pause between consecutive requests to avoid hammering the site.
            time.sleep(random.randint(1, 3))
        url = f'https://www.pro-football-reference.com/years/{year}/'
        response = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        tables = []
        for each in comments:
            if 'table' not in each:
                continue
            try:
                # read_html no longer accepts literal HTML strings without a
                # deprecation warning; wrap the markup in a file-like object.
                tables.extend(pd.read_html(StringIO(str(each))))
            except Exception:
                # Best effort: a comment that merely mentions "table" but holds
                # no parseable table is skipped. KeyboardInterrupt/SystemExit
                # are intentionally not swallowed.
                continue
        # Select by schema rather than by position so a reordering of the
        # page's commented tables does not silently pick the wrong one.
        table = next(
            (t for t in tables if required_columns.issubset(set(t.columns))),
            None,
        )
        if table is None:
            raise LookupError(f'No playoff results table found at {url}')
        playoff_winners[year] = _extract_playoff_rounds(table)
    return playoff_winners


# Scrape every season once and keep the result in a notebook-level name;
# later cells pass this dict to prep_data().
playoff_winners_dict = get_playoff_data()
# Bare expression so the notebook displays the scraped mapping.
playoff_winners_dict


{2008: {'WildCard': array(['Atlanta Falcons', 'Indianapolis Colts', 'Minnesota Vikings',
         'Miami Dolphins'], dtype=object),
  'Division': array(['Carolina Panthers', 'Tennessee Titans', 'San Diego Chargers',
         'New York Giants'], dtype=object),
  'ConfChamp': array(['Baltimore Ravens', 'Philadelphia Eagles'], dtype=object),
  'SuperBowl': array(['Arizona Cardinals'], dtype=object),
  'SuperBowlWinner': array(['Pittsburgh Steelers'], dtype=object)},
 2009: {'WildCard': array(['Philadelphia Eagles', 'Cincinnati Bengals', 'Green Bay Packers',
         'New England Patriots'], dtype=object),
  'Division': array(['Baltimore Ravens', 'Arizona Cardinals', 'Dallas Cowboys',
         'San Diego Chargers'], dtype=object),
  'ConfChamp': array(['New York Jets', 'Minnesota Vikings'], dtype=object),
  'SuperBowl': array(['Indianapolis Colts'], dtype=object),
  'SuperBowlWinner': array(['New Orleans Saints'], dtype=object)},
 2010: {'WildCard': array(['Indianapolis Colts', 'New Orlean

In [11]:
def get_data()->pd.DataFrame:
    """Load the team-season statistics from 'data.csv'.

    The 'wins' and 'losses' columns are dropped, a 'comp_pct' column
    (pass_cmp / pass_att) is added, and only rows with year > 2007 are kept.

    Returns:
        An independent DataFrame (not a view of the full table), so callers
        can add or overwrite columns without SettingWithCopyWarning.
    """
    df = pd.read_csv('data.csv')
    df = df.drop(columns=['wins', 'losses'])
    df['comp_pct'] = df['pass_cmp'] / df['pass_att']
    # Boolean-mask selection yields a slice of the original frame; copy it
    # because downstream code assigns new columns to the result.
    return df[df['year'] > 2007].copy()


def put_playoff_column(playoff_winner_dict:dict, df:pd.DataFrame)->pd.DataFrame:
    """Attach a categorical 'playoff' column describing each row's playoff outcome.

    Args:
        playoff_winner_dict: mapping year -> dict with the keys 'WildCard',
            'Division', 'ConfChamp', 'SuperBowl' and 'SuperBowlWinner', each
            holding a collection of team names.
        df: frame with 'year' and 'team' columns. It is modified in place.

    Returns:
        The same ``df`` object, now carrying the 'playoff' column.

    Raises:
        KeyError: if a row's year is not a key of ``playoff_winner_dict``.
    """
    # (lookup key, emitted label) pairs, checked in this order; first hit wins.
    round_labels = (
        ('WildCard', 'WildCard'),
        ('Division', 'Division'),
        ('ConfChamp', 'ConfChamp'),
        ('SuperBowl', 'SuperBowlLoser'),
        ('SuperBowlWinner', 'SuperBowlWinner'),
    )

    def outcome_for(season, club):
        for key, label in round_labels:
            if club in playoff_winner_dict[season][key]:
                return label
        # Team appears in none of the season's playoff lists.
        return "NoPlayoffs"

    outcomes = [
        outcome_for(record['year'], record['team'])
        for _, record in df.iterrows()
    ]

    df['playoff'] = outcomes
    df['playoff'] = df['playoff'].astype('category')
    return df

def prep_data(playoff_winner_dict:dict )->pd.DataFrame:
    """Build the modelling frame: per-game statistics plus a playoff label.

    Loads the data via get_data(), fills missing 'ties' with 0, divides every
    column not listed in the exclusion set by games played ('g'), recomputes
    'mov' as points_diff / g, labels each row through put_playoff_column(),
    and finally drops the 'g', 'year' and 'team' columns.

    Args:
        playoff_winner_dict: per-year playoff mapping forwarded to
            put_playoff_column().

    Returns:
        The prepared DataFrame, including the categorical 'playoff' column.
    """
    stats = get_data()
    stats['ties'] = stats['ties'].fillna(0)

    # Columns that are left untouched by the per-game scaling below.
    unscaled = (
        'year', 'team', 'ties', 'win_loss_perc', 'yds_per_play_offense',
        'pass_net_yds_per_att', 'rush_yds_per_att', 'score_pct',
        'turnover_pct', 'g', "comp_pct", "points_diff", "mov",
    )
    per_game_cols = [name for name in stats.columns if name not in unscaled]

    # 'g' is itself excluded from scaling, so it stays constant through the loop.
    games = stats['g']
    for name in per_game_cols:
        stats[name] = stats[name] / games

    stats['mov'] = stats['points_diff'] / games

    # Adds the 'playoff' column to `stats` in place; needs 'year' and 'team',
    # so it must run before those columns are dropped.
    put_playoff_column(playoff_winner_dict, stats)
    return stats.drop(columns=['g', 'year', 'team'])

# Build the per-game feature frame with the categorical 'playoff' label.
df = prep_data(playoff_winners_dict)
# One-hot encode the label as int64 indicator columns; drop_first=True omits
# the first category so the indicators are not perfectly collinear.
df_dummies = pd.get_dummies(df['playoff'], drop_first=True, dtype='int64')
# Append the indicators column-wise; the original 'playoff' column is kept.
df = pd.concat([df, df_dummies], axis=1)
# Bare expression so the notebook displays the resulting frame.
df

Unnamed: 0,win_loss_perc,points,points_opp,points_diff,mov,total_yards,plays_offense,yds_per_play_offense,turnovers,fumbles_lost,...,turnover_pct,exp_pts_tot,ties,comp_pct,playoff,Division,NoPlayoffs,SuperBowlLoser,SuperBowlWinner,WildCard
160,0.688,21.562500,19.812500,28,1.750000,345.562500,60.312500,5.7,0.812500,0.375000,...,6.9,-5.876250,0.0,0.672098,WildCard,0,0,0,0,1
161,0.688,25.625000,19.312500,101,6.312500,365.437500,68.437500,5.3,1.312500,0.625000,...,11.5,-0.818750,0.0,0.634831,NoPlayoffs,0,1,0,0,0
162,0.563,25.312500,22.250000,49,3.062500,331.687500,61.312500,5.4,1.937500,0.500000,...,16.9,-4.489375,0.0,0.655955,NoPlayoffs,0,1,0,0,0
163,0.438,21.000000,21.375000,-6,-0.375000,305.125000,59.750000,5.1,1.875000,0.937500,...,16.5,-11.103750,0.0,0.645094,NoPlayoffs,0,1,0,0,0
164,0.750,21.687500,13.937500,124,7.750000,311.937500,63.437500,4.9,1.562500,0.625000,...,12.6,-12.491250,0.0,0.598814,SuperBowlWinner,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.118,13.882353,24.470588,-180,-10.588235,265.294118,64.470588,4.1,1.176471,0.588235,...,11.1,-7.942941,0.0,0.597270,NoPlayoffs,0,1,0,0,0
668,0.706,28.882353,17.529412,193,11.352941,398.411765,60.235294,6.6,1.058824,0.352941,...,10.1,13.616471,0.0,0.684318,SuperBowlLoser,0,0,1,0,0
669,0.588,23.764706,22.176471,27,1.588235,359.294118,64.352941,5.6,1.058824,0.294118,...,9.2,7.842353,0.0,0.619211,WildCard,0,0,0,0,1
670,0.529,21.411765,23.647059,-38,-2.235294,322.941176,58.529412,5.5,1.000000,0.294118,...,8.4,3.666471,0.0,0.645217,NoPlayoffs,0,1,0,0,0
