In [2]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup



In [3]:
def get_data(pos,week,year, url):
    response = requests.get(url)

    if response.status_code == 200:
        return response.text
    else:
        print("Failed to retrieve the page.")

def create_advanced_stats_report(pos, week, year):
    adv_column_names = ['rank', 'player', 'games', 'att', 'yds', 
                        'y/att', 'ybcon', 'ybon/att', 'rush_yacon', 
                        'yacon/att', 'brktkl', 'tk_loss', 'tk_loss_yds', 'lng_td', 
                        '10_plus_yds', '20_plus_yds', '30_plus_yds', '40_plus_yds', 
                        '50_plus_yds', 'lng', 'rec', 'tgt', 'rz_tgt', 'rec_yacon']
    
    advanced_stats_url = 'https://www.fantasypros.com/nfl/advanced-stats-{}.php?week={}&range=week&year={}'.format(pos,week,year)

    page_content = get_data(pos,week,year, advanced_stats_url)
    soup = BeautifulSoup(page_content, 'html.parser')
    
    table = soup.find('table')
    
    table_data = []
    for row in table.find_all('tr'):
        row_data = [cell.text.strip() for cell in row.find_all('td')]
        table_data.append(row_data)
    
    report_df = pd.DataFrame(table_data)
    report_df.columns = adv_column_names
    report_df = report_df.drop([0, 1], axis=0)
    report_df.columns = [column.lower() for column in report_df.columns]
    report_df['week'] = week
    report_df['year'] = year
   
    return report_df

def create_basic_stats_report(pos, year, week):
    fts_columns = ['rank', 'player','att', 'yds_rush', 'yds_rush/a', 'lg', '20_plus', 'td_rush', 'rec', 'tgt', 'yds_rec', 'yds/rec', 'td_rec', 'rl', 'g', 'fpts', 'fpts/g', 'rost']
    basic_stats_url = 'https://www.fantasypros.com/nfl/stats/{}.php?year={}&week={}&range=week'.format(pos,year,week)

    page_content = get_data(pos, year, week, basic_stats_url)
    soup = BeautifulSoup(page_content, 'html.parser')
    
    table = soup.find('table')
    
    table_data = []
    for row in table.find_all('tr'):
        row_data = [cell.text.strip() for cell in row.find_all('td')]
        table_data.append(row_data)
    
    report_df = pd.DataFrame(table_data)
    report_df.columns = fts_columns
    report_df = report_df.drop([0, 1], axis=0)
    report_df.columns = [column.lower() for column in report_df.columns]
    report_df['week'] = week
    report_df['year'] = year
   
    return report_df

def build_data_set(pos, week, year):
    adv_stats = create_advanced_stats_report(pos,week,year)
    fts_pts = create_basic_stats_report(pos,week,year)
    
    return pd.merge(adv_stats, fts_pts[['rank','fpts', 'fpts/g']], left_on='rank', right_on='rank') 

In [4]:
build_data_set('rb',1,2022)

Unnamed: 0,rank,player,games,att,yds,y/att,ybcon,ybon/att,rush_yacon,yacon/att,...,50_plus_yds,lng,rec,tgt,rz_tgt,rec_yacon,week,year,fpts,fpts/g
0,1,Saquon Barkley (NYG),1,18,164,9.1,46,2.6,118,6.6,...,1,68,6,7,1,0,1,2022,20.1,20.1
1,2,Jonathan Taylor (IND),1,31,161,5.2,117,3.8,44,1.4,...,0,17,4,7,0,3,1,2022,20.0,20.0
2,3,D'Andre Swift (PHI),1,15,144,9.6,72,4.8,72,4.8,...,1,50,3,3,0,25,1,2022,19.5,19.5
3,4,Cordarrelle Patterson (ATL),1,22,120,5.5,75,3.4,45,2.0,...,0,15,3,5,0,4,1,2022,18.6,18.6
4,5,Clyde Edwards-Helaire (KC),1,7,42,6.0,15,2.1,27,3.9,...,0,18,3,3,2,19,1,2022,18.5,18.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,146,Tony Jones Jr. (ARI),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2022,0,0
146,147,Craig Reynolds (DET),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2022,0,0
147,148,Ty Johnson (BUF),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2022,0,0
148,149,Dare Ogunbowale (HOU),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2022,0,0


In [5]:
combined_df = pd.DataFrame()

for i in range(2020, 2023):
    for j in range(1, 19):
        df = build_data_set('rb',j,i)
        combined_df = pd.concat([combined_df, df], ignore_index=True)

In [7]:
combined_df

Unnamed: 0,rank,player,games,att,yds,y/att,ybcon,ybon/att,rush_yacon,yacon/att,...,50_plus_yds,lng,rec,tgt,rz_tgt,rec_yacon,week,year,fpts,fpts/g
0,1,Josh Jacobs (LV),1,25,93,3.7,35,1.4,58,2.3,...,0,14,4,6,0,21,1,2020,20.1,20.1
1,2,Christian McCaffrey (SF),1,23,97,4.2,46,2.0,51,2.2,...,0,15,3,4,0,14,1,2020,20.0,20.0
2,3,Ezekiel Elliott (NE),1,22,96,4.4,38,1.7,58,2.6,...,0,14,3,4,1,9,1,2020,19.5,19.5
3,4,Malcolm Brown (FA),1,18,79,4.4,46,2.6,33,1.8,...,0,12,3,4,1,19,1,2020,18.6,18.6
4,5,Raheem Mostert (MIA),1,15,56,3.7,39,2.6,17,1.1,...,0,14,4,5,0,4,1,2020,18.5,18.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7488,142,Tony Jones Jr. (ARI),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,18,2022,0,0
7489,143,Craig Reynolds (DET),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,18,2022,0,0
7490,144,Marlon Mack (ARI),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,18,2022,0,0
7491,145,Dwayne Washington (DEN),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,18,2022,0,0


In [8]:
player_data = combined_df[(combined_df['player'] == 'Josh Jacobs (LV)') & (combined_df['year'] == 2020)]

In [9]:
player_data.dtypes

rank           object
player         object
games          object
att            object
yds            object
y/att          object
ybcon          object
ybon/att       object
rush_yacon     object
yacon/att      object
brktkl         object
tk_loss        object
tk_loss_yds    object
lng_td         object
10_plus_yds    object
20_plus_yds    object
30_plus_yds    object
40_plus_yds    object
50_plus_yds    object
lng            object
rec            object
tgt            object
rz_tgt         object
rec_yacon      object
week            int64
year            int64
fpts           object
fpts/g         object
dtype: object