
### NFL Player data Web Scrappy:
2018 Season: match to Madden 19: https://maddenratings.weebly.com/madden-nfl-19.html <br>
2019 Season: match to Madden 20: https://maddenratings.weebly.com/madden-nfl-20.html <br>
2020 Season: match to Madden 21: https://maddenratings.weebly.com/madden-nfl-21.html <br>
2021 Season: match to Madden 22: https://maddenratings.weebly.com/madden-nfl-22.html (final roster in one excel table)
<br>


In [1]:
"""
For 2018, 2019 and 2020 data, find the link to players'rating excel table and download them
"""

#import packages
import requests
import json
import re
from bs4 import BeautifulSoup
from lxml import html

import numpy as np
import pandas as pd
import wget
import os

In [2]:
#try 2018 data first
r = requests.get("https://maddenratings.weebly.com/madden-nfl-19.html")
sp = BeautifulSoup(r.text, 'html.parser')

print(sp.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   Madden NFL 19
  </title>
  <meta content="Madden Ratings" property="og:site_name"/>
  <meta content="Madden NFL 19" property="og:title"/>
  <meta content="Release date: August 10th, 2018 ( Based on the 2018 NFL season.) Cover athlete: Antonio Brow WR Pittsburgh Steelers, Terrell Owens WR Dallas Cowboys (PS4, Xbox One, Windows PC) This game marks the..." property="og:description"/>
  <meta content="https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/editor/__438203341.gif" property="og:image"/>
  <meta content="https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/editor/__526649334.gif" property="og:image"/>
  <meta content="https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/editor/__926164749.gif" property="og:image"/>
  <meta content="https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/published/403.gif?1619197917" property="og:image"/>
  <meta content="https://maddenratings.weebly.com/uploads/1/4/0/9/140972

In [3]:
list_teams = sp.find_all('tr', attrs={'class': "wsite-multicol-tr"})

all_excel_data = []
prefix = "https://maddenratings.weebly.com"
for record in list_teams:
    list_href = record.find_all('a', href=True)
    
    for lh in list_href:
        lh_href = lh['href']
        
        if ".xlsx" in lh_href and "pro_bowl" not in lh_href and "elites" not in lh_href and "legends" not in lh_href and "full_player" not in lh_href:
            all_excel_data.append(f"{prefix}{lh_href}")
            
print(f"The total number of xlsx urls are {len(all_excel_data)}, listed below:\n{all_excel_data}")

#make sure the lenght of array equal to total number of NFL teams (32)
assert len(all_excel_data) == 32


The total number of xlsx urls are 32, listed below:
['https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/pittsburgh_steelers__madden_nfl_19_.xlsx', 'https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/baltimore_ravens__madden_nfl_19_.xlsx', 'https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/cleveland_browns__madden_nfl_19_.xlsx', 'https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/cincinnati_bengals__madden_nfl_19_.xlsx', 'https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/new_england_patriots__madden_nfl_19_.xlsx', 'https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/buffalo_bills__madden_nfl_19_.xlsx', 'https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/miami_dolphins__madden_nfl_19_.xlsx', 'https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/new_york_jets__madden_nfl_19_.xlsx', 'https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/houston_texans__madden_nfl_19_.xlsx', 'https://maddenratings.weebly.com/uploads/1/4/0/9/1409729

In [39]:

for url in all_excel_data:
    print(f"Downloanding {url}")
    wget.download(url, out='/home/wenrui/Downloads')
    print("Done")

In [39]:
list_of_all_32_teams = ['pittsburgh_steelers',
 'baltimore_ravens',
 'cleveland_browns',
 'cincinnati_bengals',
 'new_england_patriots',
 'buffalo_bills',
 'miami_dolphins',
 'new_york_jets',
 'houston_texans',
 'jacksonville_jaguars',
 'tennessee_titans',
 'indianapolis_colts',
 'los_angeles_chargers',
 'kansas_city_chiefs',
 'denver_broncos',
 'oakland_raiders',
 'minnesota_vikings',
 'green_bay_packers',
 'chicago_bears',
 'detroit_lions',
 'philadelphia_eagles',
 'dallas_cowboys',
 'washington_redskins',
 'new_york_giants',
 'new_orleans_saints',
 'atlanta_falcons',
 'carolina_panthers',
 'tampa_bay_buccaneers',
 'los_angeles_rams',
 'seattle_seahawks',
 'san_francisco_49ers',
 'arizona_cardinals',
 'washington_football_team',
 'las_vegas_raiders']

#### Function to extract 18, 19, 20 data

In [45]:
def download_18_20_data(year, prefix = "https://maddenratings.weebly.com", all_teams=list_of_all_32_teams):
    
    """
    extract 18, 19, 20 data from maddenrating.weebly.com
    list of all 32 NFL teams
    """
    
    #create the download data folder
    directory = f"./20{year}_season_all_32_teams_data"
    print(directory)
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    
    r = requests.get(f"https://maddenratings.weebly.com/madden-nfl-{year+1}.html")
    sp = BeautifulSoup(r.text, 'html.parser')
    
    
    
    list_teams = sp.find_all('tr', attrs={'class': "wsite-multicol-tr"})

    all_excel_data = []
    
    for record in list_teams:
        list_href = record.find_all('a', href=True)

        for lh in list_href:
            lh_href = lh['href']
            
            
            if lh_href.split("/")[-1].split("__")[0] in all_teams: 

            
                all_excel_data.append(f"{prefix}{lh_href}")

    #print(f"The total number of xlsx urls are {len(all_excel_data)}, listed below:\n{all_excel_data}")

    #make sure the lenght of array equal to total number of NFL teams (32)
    print(len(all_excel_data))
    assert len(all_excel_data) == 32
    
    
    
    for url in all_excel_data:
        print(f"Downloanding {url}\n")
        wget.download(url, out=directory)
    print(f"\nFinished downloading 20{year} Season data!")



#### Downloading 18-20 data

In [46]:
download_18_20_data(18)
download_18_20_data(19)
download_18_20_data(20)
#download_18_20_data(21)

./2018_season_all_32_teams_data
32
Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/pittsburgh_steelers__madden_nfl_19_.xlsx

100% [..........................................................] 38865 / 38865Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/baltimore_ravens__madden_nfl_19_.xlsx

100% [..........................................................] 38962 / 38962Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/cleveland_browns__madden_nfl_19_.xlsx

100% [..........................................................] 38887 / 38887Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/cincinnati_bengals__madden_nfl_19_.xlsx

100% [..........................................................] 38918 / 38918Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/new_england_patriots__madden_nfl_19_.xlsx

100% [..........................................................] 38874 / 38874Downloandin

100% [..........................................................] 41083 / 41083Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/los_angeles_chargers__madden_nfl_20_.xlsx

100% [..........................................................] 40690 / 40690Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/kansas_city_chiefs__madden_nfl_20_.xlsx

100% [..........................................................] 41024 / 41024Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/denver_broncos__madden_nfl_20_.xlsx

100% [..........................................................] 41014 / 41014Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/oakland_raiders__madden_nfl_20_.xlsx

100% [..........................................................] 41782 / 41782Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/green_bay_packers__madden_nfl_20_.xlsx

100% [.............................................

100% [..........................................................] 39510 / 39510Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/washington_football_team__madden_nfl_21_.xlsx

100% [..........................................................] 40559 / 40559Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/new_orleans_saints__madden_nfl_21_.xlsx

100% [..........................................................] 40330 / 40330Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/tampa_bay_buccaneers__madden_nfl_21_.xlsx

100% [..........................................................] 39215 / 39215Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/atlanta_falcons__madden_nfl_21_.xlsx

100% [..........................................................] 39526 / 39526Downloanding https://maddenratings.weebly.com/uploads/1/4/0/9/14097292/carolina_panthers__madden_nfl_21_.xlsx

100% [...................................

In [55]:
r2 = requests.get("https://www.ea.com/games/madden-nfl/player-ratings/player-filter/player-iteration/launch-ratings?isLocalized=true")

sp2 = BeautifulSoup(r2.text)

print(sp2.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en" wf-loading="">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <link href="/assets/images/favicon.png" rel="shortcut icon" type="image/png"/>
  <script>
   window.newrelicAttributes = {
            accountId: "1796917",
            trustKey: "1796917",
            agentId: "149134891",
            licenseKey: "f701a6c63c",
            applicationId: "149134891"
        };
  </script>
  <script src="/newrelic.js">
  </script>
  <!-- Compliant Google Optimize -->
  <script>
   (() => {
            const getPrefCookie = () => {
                const prefCookie = document.cookie.match('(^|;)\\s*notice_preferences\\s*=\\s*([^;]+)');

                return prefCookie ? prefCookie.pop() : null;
            };

            const hasOptedIn = (prefCookie) => {
                return prefCookie.match(/1:|2:|100/);
            }

            const appendOptimize = () => {
                const he

In [61]:
sp2.find_all("div", attrs={'class': "madden22_tickerContainer"})

[<div class="madden22_tickerContainer">
 <a href="/games/madden-nfl/madden-nfl-22/buy/" target="_blank">
 <div class="tickerContent">
 </div>
 </a>
 </div>]

In [51]:
sp2

<!DOCTYPE html>
<html dir="ltr" lang="en" wf-loading="">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="/assets/images/favicon.png" rel="shortcut icon" type="image/png"/>
<script>
        window.newrelicAttributes = {
            accountId: "1796917",
            trustKey: "1796917",
            agentId: "149134891",
            licenseKey: "f701a6c63c",
            applicationId: "149134891"
        };
    </script>
<script src="/newrelic.js"></script>
<!-- Compliant Google Optimize -->
<script>
        (() => {
            const getPrefCookie = () => {
                const prefCookie = document.cookie.match('(^|;)\\s*notice_preferences\\s*=\\s*([^;]+)');

                return prefCookie ? prefCookie.pop() : null;
            };

            const hasOptedIn = (prefCookie) => {
                return prefCookie.match(/1:|2:|100/);
            }

            const appendOptimize = () => {
                const head = d

### Join player ratings with play by play data

In [35]:
#
pd.set_option('display.max_rows', None)
import pickle
with open("20220327_final_result_NFL_data.pkl", "rb") as f:
    final_res = pickle.load(f)
    
    
print(f"The preprocessed data shape is {final_res.shape}, header is \n{final_res.head().T}")

The preprocessed data shape is (52331, 101), header is 
GameId                       2018090600                                      \
                                      0           1           2           3   
GameDate                     2018-09-06  2018-09-06  2018-09-06  2018-09-06   
Quarter                               1           1           1           1   
YardLine                             20          41          61          94   
SeriesFirstDown                       1           1           1           0   
Yards                                21          20          33           4   
OffenseTeam_ARI                     0.0         0.0         0.0         0.0   
OffenseTeam_ATL                     2.0         1.0         3.0         4.0   
OffenseTeam_BAL                     0.0         0.0         0.0         0.0   
OffenseTeam_BUF                     0.0         0.0         0.0         0.0   
OffenseTeam_CAR                     0.0         0.0         0.0         0.0

### Read players' rating

In [36]:
combined_data_21  = pd.read_excel("madden_nfl_22_final_roster.xlsx")

print(f"2021 players rating data shape is: {combined_data_21 .shape}, the header is:\n{combined_data_21 .head().T}")

assert len(set(combined_data_21.Team)) == 32

2021 players rating data shape is: (2006, 70), the header is:
                                               0                       1  \
Team                                       49ers                   49ers   
FirstName                                   Alex                    Jake   
LastName                                    Mack                 Brendel   
Position                                       C                       C   
JerseyNum                                     50                      64   
OverallRating                                 85                      56   
Archetype                                C_Power                 C_Agile   
SpeedRating                                   62                      71   
AccelerationRating                            78                      82   
StrengthRating                                92                      80   
AgilityRating                                 68                      70   
AwarenessRating           

In [37]:
#combined all teams data for the year 2018, 2019 and 2020 respectively
import glob


def combined_player_ratings(year):

    list_of_xlsx = glob.glob(f"20{year}_season_all_32_teams_data/*xlsx")

    list_of_pd = [pd.read_excel(x) for x in list_of_xlsx]


    combined_data = pd.concat(list_of_pd, axis=0, ignore_index=True)


    #print(f"Combined data shape is: {combined_data.shape}, the header is:\n{combined_data.head().T}")

    assert len(set(combined_data.Team))==32
    
    print(f"Finished combining 20{year} data!")
    
    return(combined_data)

In [38]:
combined_data_18 = combined_player_ratings(18)
combined_data_19 = combined_player_ratings(19)
combined_data_20 = combined_player_ratings(20)

Finished combining 2018 data!
Finished combining 2019 data!
Finished combining 2020 data!


In [39]:
combined_data_18.head()

Unnamed: 0,Team,Jersey #,Name,Age,Position,Overall,Awareness,Agility,Speed,Acceleration,...,Kick Power,Kick Accuracy,Height,Weight,Birthdate,Years Pro,Handedness,College,Total Salary,Signing Bonus
0,Chargers,53,Mike Pouncey,29,C,75,81,72,66,78,...,15,10,77,299,1989-07-24 00:00:00,7,Right,Florida,12500000,2500000
1,Chargers,73,Spencer Pulley,25,C,66,78,61,69,69,...,20,20,76,308,1993-04-04 00:00:00,2,Right,Vanderbilt,1620000,0
2,Chargers,61,Scott Quessenberry,23,C,65,57,66,69,82,...,17,12,76,310,1995-03-23 00:00:00,0,Right,UCLA,2699259,280000
3,Chargers,26,Casey Hayward Jr,28,CB,91,97,89,90,90,...,22,24,71,192,1989-09-09 00:00:00,7,Right,Vanderbilt,29500000,8000000
4,Chargers,22,Jason Verrett,27,CB,86,85,94,92,92,...,30,25,70,188,1991-06-18 00:00:00,4,Right,TCU,9386000,1010000


In [40]:
#get shared columns from all four years of data

#rename columns for 2021 data
import re

combined_data_21.columns = [re.sub( r"([A-Z])", r" \1", re.sub('Rating', '', x))[1:] for x in combined_data_21.columns]

combined_data_21.columns

Index(['Team', 'First Name', 'Last Name', 'Position', 'Jersey Num', 'Overall',
       'Archetype', 'Speed', 'Acceleration', 'Strength', 'Agility',
       'Awareness', 'Catching', 'Carrying', 'Throw Power', 'Kick Power',
       'Kick Accuracy', 'Run Block', 'Pass Block', 'Tackle', 'Break Tackle',
       'Jumping', 'Kick Return', 'Injury', 'Stamina', 'Toughness', 'Trucking',
       'Change Of Direction', 'B C Vision', 'Stiff Arm', 'Spin Move',
       'Juke Move', 'Impact Blocking', 'Run Block Power', 'Run Block Finesse',
       'Pass Block Power', 'Pass Block Finesse', 'Lead Block', 'Break Sack',
       'Throw Under Pressure', 'Power Moves', 'Finesse Moves',
       'Block Shedding', 'Pursuit', 'Play Recognition', 'Man Coverage',
       'Zone Coverage', 'Running Style', 'Spectacular Catch',
       'Catch In Traffic', 'Short Route Running', 'Medium Route Running',
       'Deep Route Running', 'Hit Power', 'Press', 'Release',
       'Throw Accuracy Short', 'Throw Accuracy Mid', 'Throw Accur

In [10]:
from functools import reduce
#from set import intersection
all_shared_columns = reduce(set.intersection(), list(set(combined_data_21.columns), set(combined_data_20.columns), 
                                                 set(combined_data_19.columns), set(combined_data_18.columns) 
                                                    ))

TypeError: descriptor 'intersection' of 'set' object needs an argument

In [64]:
shared_features = combined_data_21.columns & combined_data_20.columns & combined_data_19.columns & combined_data_18.columns
shared_features

  shared_features = combined_data_21.columns & combined_data_20.columns & combined_data_19.columns & combined_data_18.columns


Index(['Team', 'Position', 'Overall', 'Speed', 'Acceleration', 'Strength',
       'Agility', 'Awareness', 'Catching', 'Carrying', 'Throw Power',
       'Kick Power', 'Kick Accuracy', 'Run Block', 'Pass Block', 'Tackle',
       'Break Tackle', 'Jumping', 'Kick Return', 'Injury', 'Stamina',
       'Toughness', 'Trucking', 'Elusiveness', 'Ball Carrier Vision',
       'Stiff Arm', 'Spin Move', 'Juke Move', 'Impact Blocking',
       'Run Block Power', 'Run Block Finesse', 'Pass Block Power',
       'Pass Block Finesse', 'Lead Block', 'Break Sack',
       'Throw Under Pressure', 'Power Moves', 'Finesse Moves',
       'Block Shedding', 'Pursuit', 'Play Recognition', 'Man Coverage',
       'Zone Coverage', 'Spectacular Catch', 'Catch In Traffic',
       'Short Route Running', 'Medium Route Running', 'Deep Route Running',
       'Hit Power', 'Press', 'Release', 'Throw Accuracy Short',
       'Throw Accuracy Mid', 'Throw Accuracy Deep', 'Play Action',
       'Throw On The Run', 'Height', 'Weight

### Based on the results above, do the following:
<b>1. set all BC Vison as Ball Carrier Vision </b><br>
<b>2. set all Catch In Traffic as Catch In Traffic </b><br>
<b>3. set all Catch as Catching</b><br>
<b>4. set all Catch In Traffic as Catch In Traffic </b><br>
<b>5. set all Overall Rating as Overall </b><br>
<b>6. set Pass Block as Pass Block </b><br>
<b>7. set four throw accuracy as Throw Accuracy Deep, Mid, Short and Run </b><br>
<b>8. To be consistent with Elusiveness, change of Direction score applies to offense player only (see: https://www.sportsgamersonline.com/games/football/madden-21-adds-new-rating-that-affects-gameplay/#:~:text=Madden%2021%20decided%20to%20get,the%20difference%20between%20the%20two.) </b>(will be dealt with at the very end)<br> 
<b>9. Mis-spelling in short route runNing </b><br>
<b>10. Experience as Years-Pro </b><br>
<b>11. set Lead Blocking as Lead Block </b><br>
<b>12. set all Player handedness as Handedness </b><br>
<b>13. Throw away signing Bonus, Total Salary, Archetype </b><br>
<b>14. Combine first and last name into full name </b><br>







### Reorganize combine 21 data first

In [42]:
#change file names
combined_data_21 = combined_data_21.rename(columns={"B C Vision": "Ball Carrier Vision", \
                                                    "P L Y R_ H A N D E D N E S S": "Handedness",
                                                   "Change Of Direction": "Elusiveness"})
#combine first and last name into Full Name
combined_data_21['Full Name'] = combined_data_21[['First Name', 'Last Name']].agg(' '.join, axis=1)



In [65]:
set(combined_data_21.columns) - set(shared_features)



{' Signing Bonus',
 ' Total Salary',
 'Archetype',
 'First Name',
 'Jersey Num',
 'Last Name',
 'P L Y R_ B I R T H D A T E',
 'Running Style'}

### Reformat on combined 20 data

In [48]:
#change file names
combined_data_20 = combined_data_20.rename(columns={"Lead Blocking": "Lead Block", \
                                                    "Pass Blocking": "Pass Block",\
                                                    "Run Blocking": "Run Block", \
                                                    "Player Handedness": "Handedness", \
                                                    "Overall Rating": "Overall",
                                                   "Change Of Direction": "Elusiveness"})


In [66]:
set(combined_data_20.columns) - set(shared_features)

{'Archetype',
 'Birthdate',
 'Jersey Number',
 'Running Style',
 'Signing Bonus',
 'Total Salary'}

### Reformat on combined 19 data

In [58]:
combined_data_19 = combined_data_19.rename(columns={"BC Vision": "Ball Carrier Vision", \
                                                    "Catch in Traffic": "Catch In Traffic",\
                                                    "Deep Throw Accruacy": "Throw Accuracy Deep", \
                                                    "Medium Throw Accuracy": "Throw Accuracy Mid", \
                                                    "Short Throw Accuracy": "Throw Accuracy Short",
                                                   "Handed": "Handedness",
                                                   "Name": "Full Name",
                                                   "Experience": "Years Pro"})


In [67]:
set(combined_data_19.columns) - set(shared_features)

{'Birthdate',
 'Conference',
 'Division',
 'Jersey',
 'Signing Bonus',
 'Total Salary'}

In [70]:
combined_data_19.head().T

Unnamed: 0,0,1,2,3,4
Full Name,Mike Pouncey,Cole Toner,Casey Hayward Jr,Desmond King II,Trevor Williams
Team,Los Angeles Chargers,Los Angeles Chargers,Los Angeles Chargers,Los Angeles Chargers,Los Angeles Chargers
Conference,AFC,AFC,AFC,AFC,AFC
Division,West,West,West,West,West
Position,C,C,CB,CB,CB
Overall,82,54,89,86,78
Awareness,86,76,90,89,84
Stamina,82,76,95,97,90
Speed,66,62,89,88,91
Strength,88,78,69,72,62


### Reformat on combined 18 data

In [63]:
combined_data_18 = combined_data_18.rename(columns={"Catch": "Catching", \
                                                    "Catch in Traffic": "Catch In Traffic",\
                                                   "Name": "Full Name",\
                                                   "Short Route Runing": "Short Route Running",
                                                   "Throw on the Run": "Throw On The Run"})

In [68]:
set(combined_data_18.columns) - set(shared_features)

{'Birthdate', 'Jersey #', 'Signing Bonus', 'Total Salary'}

### get shared columns only for each set
### Will do nothing on Elusiveness column, as for 18 and 19 data, defense players also have an "Elusiveness" score, but lower, and this score will be only used for a given year's data
### using one hot encoding on categorical data, ignoring player name, team, birthdate, College and position 
### In other words, only do one-hot encoding on Handedness

In [111]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


from sklearn import base

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

#self define onehot convert that returns a dataframe with proper column names
class onehot_with_cname(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        # This transformer doesn't need to learn anything about the data,
        # so it can just return self without any further processing
        return self
    
    def transform(self, X):
        # Return a pandas data frame from X
        oe = OneHotEncoder(sparse=False)

        return_df = oe.fit_transform(X)
        
        return_df = pd.DataFrame(return_df, columns=oe.get_feature_names_out(X.columns))
        
        #drop columns that has nan
        return_df = return_df.loc[:,['_nan' not in x for x in return_df.columns]]
        
        return return_df


#get one hot vector first
oe = onehot_with_cname()

oe.fit_transform(combined_data_18[['Handedness']])

Unnamed: 0,Handedness_Left,Handedness_Right
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
5,0.0,1.0
6,0.0,1.0
7,0.0,1.0
8,0.0,1.0
9,0.0,1.0


In [120]:
#self define onehot convert that returns a dataframe with proper column names
class reformatted_data(BaseEstimator, TransformerMixin):

    def fit(self, X, shared_cols):
        # This transformer doesn't need to learn anything about the data,
        # so it can just return self without any further processing
        self.X = X.loc[:, shared_cols]
        return self
    
    def transform(self, X):
        # Return a pandas data frame from X
        oe = onehot_with_cname()

        one_hot_df = oe.fit_transform(self.X[['Handedness']])
        
        return_df = pd.concat([self.X.drop(['Handedness'], axis=1), one_hot_df], axis=1)
        
        #drop columns that has nan
        #return_df = return_df.loc[:,['_nan' not in x for x in return_df.columns]]
        
        return return_df

In [122]:
rd = reformatted_data()

rd.fit_transform(combined_data_18, shared_features).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2358,2359,2360,2361,2362,2363,2364,2365,2366,2367
Team,Chargers,Chargers,Chargers,Chargers,Chargers,Chargers,Chargers,Chargers,Chargers,Chargers,...,Bears,Bears,Bears,Bears,Bears,Bears,Bears,Bears,Bears,Bears
Position,C,C,C,CB,CB,CB,CB,CB,CB,CB,...,WR,WR,WR,WR,WR,WR,WR,WR,WR,WR
Overall,75,66,65,91,86,83,81,73,67,66,...,87,81,74,73,73,72,70,68,61,59
Speed,66,69,69,90,92,91,87,91,92,87,...,90,94,89,89,93,90,87,88,87,85
Acceleration,78,69,82,90,92,90,90,92,89,89,...,93,95,90,89,91,89,88,89,87,89
Strength,88,83,82,69,63,62,72,72,63,66,...,67,60,68,63,74,70,57,66,71,54
Agility,72,61,66,89,94,89,93,90,88,86,...,89,94,93,86,88,86,87,84,84,91
Awareness,81,78,57,97,85,87,83,59,57,54,...,90,85,63,80,68,76,66,57,58,53
Catching,35,32,27,80,77,67,76,70,62,63,...,89,85,80,82,81,81,80,83,79,82
Carrying,32,27,24,55,59,53,74,61,58,59,...,84,73,74,75,69,69,71,75,69,69


### Convert final data sets

In [126]:
final_18_data = rd.fit_transform(combined_data_18, shared_features)
final_19_data = rd.fit_transform(combined_data_19, shared_features)
final_20_data = rd.fit_transform(combined_data_19, shared_features)
final_21_data = rd.fit_transform(combined_data_19, shared_features)

### Make sure converted data is correct

In [131]:
assert sum([final_18_data.columns[i] != final_19_data.columns[i] for i in range(final_19_data.shape[1])])==0
assert sum([final_19_data.columns[i] != final_20_data.columns[i] for i in range(final_20_data.shape[1])])==0
assert sum([final_20_data.columns[i] != final_21_data.columns[i] for i in range(final_21_data.shape[1])])==0
assert final_18_data.shape[1] == final_19_data.shape[1]
assert final_19_data.shape[1] == final_20_data.shape[1]
assert final_20_data.shape[1] == final_21_data.shape[1]

In [133]:
final_20_data.head().T

Unnamed: 0,0,1,2,3,4
Team,Los Angeles Chargers,Los Angeles Chargers,Los Angeles Chargers,Los Angeles Chargers,Los Angeles Chargers
Position,C,C,CB,CB,CB
Overall,82,54,89,86,78
Speed,66,62,89,88,91
Acceleration,78,78,91,90,90
Strength,88,78,69,72,62
Agility,69,55,92,94,89
Awareness,86,76,90,89,84
Catching,35,42,78,76,67
Carrying,32,64,55,74,53


### Remap all the team names to keep team code consistent across all datasets

In [None]:
team_name_mapping = {'ARI': ['Cardinals', 'Arizona Cardinals'], 'ATL': ['Falcons', 'Atlanta Falcons'],\
                     'BAL': ['Ravens', 'Baltimore Ravens'], 'BUF': ['Bills', 'Buffalo Bills'],\
                     'CAR': ['Panthers', 'Carolina Panthers'], 'CHI': ['Bears', 'Chicago Bears'],\
                     'CIN': ['Bengals', 'Cincinnati Bengals'], 'CLE': ['Browns', 'Cleveland Browns'],\
                     'DAL': ['Cowboys', 'Dallas Cowboys'], 'DEN': ['Broncos', 'Denver Broncos'],\
                     'DET': ['Lions', 'Detroit Lions'], 'GB': ['Packers', 'Green Bay Packers'],\
                     'HOU': ['Texans', 'Houston Texans'], 'IND': ['Colts', 'Indianapolis Colts'],\
                     'JAX': ['Jaguars', 'Jacksonville Jaguars'], 'KC': ['Chiefs', 'Kansas City Chiefs'],\
                     'LA': ['Rams', 'Los Angeles Rams'], 'LAC': ['Chargers', 'Los Angeles Chargers'],\
                     'LV': ['Raiders', 'Oakland Raiders'], 'MIA': ['Dolphins', 'Miami Dolphins'],\
                     'MIN': ['Vikings', 'Minnesota Vikings'], 'NE': ['Patriots', 'New England Patriots'],\
                     'NO': ['Saints', 'New Orleans Saints'], 'NYG': ['Giants', 'New York Giants'],\
                     'NYJ': ['Jets', 'New York Jets'], 'PHI': ['Eagles', 'Philadelphia Eagles'],\
                     'PIT': ['Steelers', 'Pittsburgh Steelers'], 'SEA': ['Seahawks', 'Seattle Seahawks'],\
                     'SF': ['49ers', 'San Francisco 49ers'], 'TB': ['Buccaneers', 'Tampa Bay Buccaneers'],\
                     'TEN': ['Titans', 'Tennessee Titans'], 'WAS': ['Redskins', 'Washington Redskins']
                    }

print(len(team_name_mapping))

In [None]:
#rename team to three letter codes
def team_code_convertor(data, team_name_mapping=team_name_mapping):
    
    list_team_codes = []

    for x in data.Team:
        for key, value in team_name_mapping.items():
            if x in value:
                list_team_codes.append(key)
                break
    return list_team_codes


final_18_data['Team'] = team_code_convertor(final_18_data)


team_names = final_18_data.Team.unique()

print(f"total {len(team_names)} team names:\n {team_names}")

In [None]:


final_19_data['Team'] = team_code_convertor(final_19_data)

team_names = final_19_data.Team.unique()

print(f"total {len(team_names)} team names:\n {team_names}")

### Save all the final data

In [134]:
with open ("20220423_players_data_finals.pkl", 'wb') as f:
    pickle.dump([final_18_data, final_19_data, final_20_data, final_21_data], f)
print("Finished!")

Finished!
