### This file (pro_fb_ref_sourcing.ipynb) will:
 + take in the list of all NFL Quarterbacks Drafted from draft_history.csv
 + build links for each player at pro-football_reference
 + load the NFL football data for each player
 + Export:
       + ../sourcing_artifacts/pfb_ref_sourcing_output.pkl, ../sourcing_artifacts/pfb_ref_sourcing_output.csv
       + ../sourcing_artifacts/pfb_ref_sourcing_output_fails.pkl", ../sourcing_artifacts/pfb_ref_sourcing_output_fails.csv"

In [109]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [110]:
# import csv
csv_path = "../sourcing_artifacts/draft_history.csv"
df = pd.read_csv(csv_path)
display(df.head())

Unnamed: 0,Year,No.,Round,Pick,Player,Name,Team,College
0,2024,1,1,1,1,Caleb Williams,Bears,USC
1,2024,2,1,2,2,Jayden Daniels,Commanders,Louisiana State
2,2024,3,1,3,3,Drake Maye,Patriots,North Carolina
3,2024,4,1,8,8,Michael Penix,Falcons,Washington
4,2024,5,1,10,10,J.J. McCarthy,Vikings,Michigan


![image.png](attachment:image.png)




In [111]:
# Convert 'Year' column to numeric, forcing errors to NaN, then drop rows with NaN in 'Year'
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df = df.dropna(subset=['Year'])

# Now filter with the cleaned 'Year' column and select the 'Year' and 'Name' columns
qb_names_to_1969 = df.loc[(df["Year"] > 1900) & (df["Year"] < 1970), ['Year', 'Name']].reset_index(drop=True)
display(len(qb_names_to_1969))
display(qb_names_to_1969.head())

qb_names_1970_9= df.loc[(df["Year"] >= 1970) & (df["Year"] <= 1979), ['Year', 'Name']].reset_index(drop=True)
display(len(qb_names_1970_9))
display(qb_names_1970_9.head())

qb_names_1980_9= df.loc[(df["Year"] >= 1980) & (df["Year"] <= 1989), ['Year', 'Name']].reset_index(drop=True)
display(len(qb_names_1980_9))
display(qb_names_1980_9)

qb_names_1990_9= df.loc[(df["Year"] >= 1990) & (df["Year"] <= 1999), ['Year', 'Name']].reset_index(drop=True)
display(len(qb_names_1990_9))
display(qb_names_1990_9)

qb_names_2000_9= df.loc[(df["Year"] >= 2000) & (df["Year"] <= 2009), ['Year', 'Name']].reset_index(drop=True)
display(len(qb_names_2000_9))
display(qb_names_2000_9)

qb_names_2010_9= df.loc[(df["Year"] >= 2010) & (df["Year"] <= 2019), ['Year', 'Name']].reset_index(drop=True)
display(len(qb_names_2010_9))
display(qb_names_2010_9)

qb_names_2020_= df.loc[(df["Year"] >= 2020), ['Year', 'Name']].reset_index(drop=True)
display(len(qb_names_2020_))
display(qb_names_2020_)


289

Unnamed: 0,Year,Name
0,1969.0,Greg Cook
1,1969.0,Marty Domres
2,1969.0,Terry Hanratty
3,1969.0,Bobby Douglass
4,1969.0,Al Woodall


188

Unnamed: 0,Year,Name
0,1979.0,Jack Thompson
1,1979.0,Phil Simms
2,1979.0,Steve Fuller
3,1979.0,Joe Montana
4,1979.0,Steve Dils


156

Unnamed: 0,Year,Name
0,1989.0,Troy Aikman
1,1989.0,Mike Elkins
2,1989.0,Billy Joe Tolliver
3,1989.0,Anthony Dilweg
4,1989.0,Erik Wilhelm
...,...,...
151,1980.0,Dan Hartwig
152,1980.0,Turk Schonert
153,1980.0,Mike Wright
154,1980.0,Jimmy Jordan


124

Unnamed: 0,Year,Name
0,1999.0,Tim Couch
1,1999.0,Donovan McNabb
2,1999.0,Akili Smith
3,1999.0,Daunte Culpepper
4,1999.0,Cade McNown
...,...,...
119,1990.0,Todd Hammel
120,1990.0,Gene Benhart
121,1990.0,John Gromos
122,1990.0,Major Harris


130

Unnamed: 0,Year,Name
0,2009.0,Matthew Stafford
1,2009.0,Mark Sanchez
2,2009.0,Josh Freeman
3,2009.0,Pat White
4,2009.0,Stephen McGee
...,...,...
125,2000.0,Todd Husak
126,2000.0,JaJuan Seider
127,2000.0,Tim Rattay
128,2000.0,Jarious Jackson


118

Unnamed: 0,Year,Name
0,2019.0,Kyler Murray
1,2019.0,Daniel Jones
2,2019.0,Dwayne Haskins
3,2019.0,Drew Lock
4,2019.0,Will Grier
...,...,...
113,2010.0,Joe Webb
114,2010.0,Tony Pike
115,2010.0,Levi Brown
116,2010.0,Sean Canfield


57

Unnamed: 0,Year,Name
0,2024.0,Caleb Williams
1,2024.0,Jayden Daniels
2,2024.0,Drake Maye
3,2024.0,Michael Penix
4,2024.0,J.J. McCarthy
5,2024.0,Bo Nix
6,2024.0,Spencer Rattler
7,2024.0,Jordan Travis
8,2024.0,Joe Milton
9,2024.0,Devin Leary


In [112]:
import re

def get_weighted_career_av(soup):
    # Locate the div with id 'meta'
    meta_div = soup.find('div', id='meta')
    
    # Initialize variables for storing the AV value and rank
    weighted_av = None
    overall_rank = None
    
    if meta_div:
        # Find text with the "Weighted Career AV" pattern
        av_match = re.search(r'(\d+)\s\((\d+)(?:st|nd|rd|th) overall since 1960\)', meta_div.text)
        
        if av_match:
            weighted_av = av_match.group(1)  # First capture group for the AV number
            overall_rank = av_match.group(2)  # Second capture group for the rank number
    
    if not weighted_av:
        print("\t xx No 'Weighted Career AV' found.")
    else:
        print(f"\t ** Weighted Career AV: {weighted_av}, Overall Rank: {overall_rank}")
    
    return weighted_av, overall_rank


In [113]:
import re
from bs4 import BeautifulSoup

def get_qb_header_data(soup):
    qb_header_data = {
        'throws': '', 'hall_of_fame': '', 'college': '', 'high_school': '',
        'weighted_career_av': '', 'overall_rank': '', 'college_stats': '',
        'draft_team': '', 'draft_round': '', 'draft_pick': '', 'draft_year': ''
    }
    try:
        # Extract Throws
        throws_tag = soup.find('strong', string='Throws:')
        if throws_tag and throws_tag.next_sibling:
            qb_header_data['throws'] = throws_tag.next_sibling.strip()

        # Extract College
        college_tag = soup.find('strong', text='College')
        if college_tag:
            college = college_tag.find_next('a')
            if college:
                qb_header_data['college'] = college.text.strip()

        # Extract College Stats URL
        college_stats_tag = soup.find('a', href=re.compile(r'https://www.sports-reference.com/cfb/players/'))
        if college_stats_tag:
            qb_header_data['college_stats'] = college_stats_tag['href']

        # Extract Weighted Career AV (wAV) and Overall Rank
        # Call to get_weighted_career_av
        weighted_av, overall_rank = get_weighted_career_av(soup)
        if weighted_av and overall_rank:
            qb_header_data['weighted_career_av'] = weighted_av
            qb_header_data['overall_rank'] = overall_rank

        # Extract High School
        high_school_tag = soup.find('strong', text='High School')
        if high_school_tag:
            high_school = high_school_tag.find_next('a')
            if high_school:
                qb_header_data['high_school'] = high_school.text.strip()

        # Extract Hall of Fame year (hof)
        hof_tag = re.search(r'Hall of Fame.*?(\d{4})', str(soup))
        if hof_tag:
            qb_header_data['hall_of_fame'] = hof_tag.group(1)

        # Extract Draft Team, Round, Pick, and Year
        draft_team_tag = re.search(r'Draft.*?<a.*?>(.*?)</a>', str(soup))
        if draft_team_tag:
            qb_header_data['draft_team'] = draft_team_tag.group(1)
        draft_round_tag = re.search(r'in the (\d+)(?:st|nd|rd|th) round', str(soup))
        if draft_round_tag:
            qb_header_data['draft_round'] = draft_round_tag.group(1)
        draft_pick_tag = re.search(r'\((\d+)(?:st|nd|rd|th) overall\)', str(soup))
        if draft_pick_tag:
            qb_header_data['draft_pick'] = draft_pick_tag.group(1)
        draft_year_tag = re.search(r'(\d{4}) NFL Draft', str(soup))
        if draft_year_tag:
            qb_header_data['draft_year'] = draft_year_tag.group(1)

    except AttributeError as e:
        print(f"An error occurred: {e}")
    return qb_header_data


In [114]:
def get_summary_stats(soup):
    summary_stats = {}
    try:
        summary = soup.find('div', {'class': 'stats_pullout'})
        summary_stats['games_played'] = summary.find('span', string='G').find_next('p').text.strip()
        summary_stats['approximate_value'] = summary.find('span', string='AV').find_next('p').text.strip()
        summary_stats['qb_record'] = summary.find('span', string='QBrec').find_next('p').text.strip()
        summary_stats['completion_percentage'] = summary.find('span', string='Cmp%').find_next('p').text.strip()
        summary_stats['passing_yards'] = summary.find('span', string='Yds').find_next('p').text.strip()
        summary_stats['yards_per_attempt'] = summary.find('span', string='Y/A').find_next('p').text.strip()
        summary_stats['touchdowns'] = summary.find('span', string='TD').find_next('p').text.strip()
        summary_stats['interceptions'] = summary.find('span', string='Int').find_next('p').text.strip()
        summary_stats['fantasy_points'] = summary.find('span', string='FantPt').find_next('p').text.strip()
    except AttributeError:
        summary_stats = {key: "N/A" for key in ['games_played', 'approximate_value', 'qb_record', 'completion_percentage', 'passing_yards', 'yards_per_attempt', 'touchdowns', 'interceptions', 'fantasy_points']}
    return summary_stats


def get_recognition(soup):
    recognition_data = {}
    try:
        bling = soup.find('ul', id='bling')
        recognition_data['all_star'] = [li.get_text(strip=True) for li in bling.find_all('li', class_='all_star')]
        recognition_data['uniforms'] = [a['data-tip'] for a in soup.find('div', class_='uni_holder').find_all('a')]
    except AttributeError:
        recognition_data = {key: "N/A" for key in ['all_star', 'uniforms']}
    return recognition_data

In [115]:
import re

def get_faq(soup):
    faq_data = {}
    keyword_patterns = {
        'when_born': re.compile(r'when.*born', re.IGNORECASE),
        'where_born': re.compile(r'where.*born', re.IGNORECASE),
        'height': re.compile(r'tall', re.IGNORECASE),
        'weight': re.compile(r'weigh', re.IGNORECASE),
        'games_played': re.compile(r'games.*play', re.IGNORECASE),
        'pass_yards': re.compile(r'passing.*yards', re.IGNORECASE),
        'TD': re.compile(r'touchdowns', re.IGNORECASE),
        'superbowls': re.compile(r'Super Bowls', re.IGNORECASE),
        'retire_year': re.compile(r'retire', re.IGNORECASE),
        'nickname': re.compile(r'nicknames', re.IGNORECASE)
    }

    try:
        faq = soup.find('div', id='div_faq')
        for key, pattern in keyword_patterns.items():
            question = faq.find('h3', string=pattern)
            if question:
                answer = question.find_next('p').text
                if key in ['when_born', 'where_born']:
                    faq_data[key] = re.split(r'on | in ', answer)[-1].strip('.')
                elif key == 'height':
                    faq_data[key] = re.search(r'\d+-\d+', answer).group()
                elif key == 'weight':
                    faq_data[key] = re.search(r'\d+ lbs', answer).group().split()[0]
                elif key == 'superbowls':
                    faq_data[key] = re.search(r'\d+', answer) and re.search(r'\d+', answer).group() or '0'
                elif key in ['games_played', 'pass_yards', 'TD']:
                    faq_data[key] = re.search(r'\d+', answer.replace(',', '')).group()
                elif key == 'retire_year':
                    faq_data[key] = re.search(r'\d+', answer).group()
                elif key == 'nickname':
                    faq_data[key] = re.split(r' is a nickname| are nicknames', answer)[0]
                else:
                    faq_data[key] = answer
    except AttributeError as e:
        print(f"An error occurred in get_faq: {e}")
    return faq_data

In [116]:
def get_qb_summary_data(soup):
    qb_summary_data = {
        'years': '', 'games': '', 'games_started': '', 'qb_rec': '', 
        'pass_cmp': '', 'pass_att': '', 'pass_cmp_pct': '', 'pass_yds': '',
        'pass_td': '', 'pass_td_pct': '', 'pass_int': '', 'pass_int_pct': '',
        'pass_long': '', 'pass_yds_per_att': '', 'pass_adj_yds_per_att': '',
        'pass_yds_per_cmp': '', 'pass_yds_per_g': '', 'pass_rating': '',
        'pass_sacked': '', 'pass_sacked_yds': '', 'pass_sacked_pct': '',
        'pass_net_yds_per_att': '', 'pass_adj_net_yds_per_att': '',
        'comebacks': '', 'gwd': '', 'av': ''
    }
    
    try:
        # Locate the first row in the table footer
        summary_row = soup.find('tfoot').find('tr')

        # Extract the year information (first cell)
        qb_summary_data['years'] = summary_row.find('th', {'scope': 'row'}).text.strip()

        # Extract statistics from the row
        stats_cells = summary_row.find_all('td')
        qb_summary_data['games'] = stats_cells[1].text.strip()  # G
        qb_summary_data['games_started'] = stats_cells[2].text.strip()  # GS
        qb_summary_data['qb_rec'] = stats_cells[3].text.strip()  # QBrec
        qb_summary_data['pass_cmp'] = stats_cells[4].text.strip()  # Cmp
        qb_summary_data['pass_att'] = stats_cells[5].text.strip()  # Att
        qb_summary_data['pass_cmp_pct'] = stats_cells[6].text.strip()  # Cmp%
        qb_summary_data['pass_yds'] = stats_cells[7].text.strip()  # Yds
        qb_summary_data['pass_td'] = stats_cells[8].text.strip()  # TD
        qb_summary_data['pass_td_pct'] = stats_cells[9].text.strip()  # TD%
        qb_summary_data['pass_int'] = stats_cells[10].text.strip()  # Int
        qb_summary_data['pass_int_pct'] = stats_cells[11].text.strip()  # Int%
        qb_summary_data['pass_long'] = stats_cells[12].text.strip()  # Lng
        qb_summary_data['pass_yds_per_att'] = stats_cells[13].text.strip()  # Y/A
        qb_summary_data['pass_adj_yds_per_att'] = stats_cells[14].text.strip()  # AY/A
        qb_summary_data['pass_yds_per_cmp'] = stats_cells[15].text.strip()  # Y/C
        qb_summary_data['pass_yds_per_g'] = stats_cells[16].text.strip()  # Y/G
        qb_summary_data['pass_rating'] = stats_cells[17].text.strip()  # Rate
        qb_summary_data['pass_sacked'] = stats_cells[18].text.strip()  # Sk
        qb_summary_data['pass_sacked_yds'] = stats_cells[19].text.strip()  # Yds (sacked)
        qb_summary_data['pass_sacked_pct'] = stats_cells[20].text.strip()  # Sk%
        qb_summary_data['pass_net_yds_per_att'] = stats_cells[21].text.strip()  # NY/A
        qb_summary_data['pass_adj_net_yds_per_att'] = stats_cells[22].text.strip()  # ANY/A
        qb_summary_data['comebacks'] = stats_cells[23].text.strip()  # 4QC
        qb_summary_data['gwd'] = stats_cells[24].text.strip()  # GWD
        qb_summary_data['av'] = stats_cells[25].text.strip()  # AV

    except Exception as e:
        print(f"An error occurred while extracting from get_qb_summary_data: {e}")
    
    return qb_summary_data


In [117]:
import re

def create_url(name, page_num=0):
    '''Function to build the following URL from the name passed in:
    name_link = <first letter of last name> / <first four letters of last name><first two letters of first name>
    e.g., Kurt Warner --> W/WarnKu
    https://www.pro-football-reference.com/players/W/WarnKu00.htm
    '''
    
    try:
        # Clean up the name by removing apostrophes and periods
        name = name.replace("'", "").replace(".", "")
        pg_num = str(page_num).zfill(2)  # Format page number as two digits

        # Split name and assign first and last names
        name_parts = name.split()
        if len(name_parts) < 2:
            raise ValueError("Name must contain at least a first and last name")

        first_name = name_parts[0]
        last_name = ''.join(name_parts[1:])
        directory_letter = last_name[0].upper()  # First letter of last name as directory

        if len(last_name) < 4:
            raise ValueError("Last name must have at least 4 characters for URL creation")
        
        # Construct the name link
        name_link = f"{last_name[:4]}{first_name[:2]}"

        # Build the URL
        url = f"https://www.pro-football-reference.com/players/{directory_letter}/{name_link}{pg_num}.htm"
        return url

    except (IndexError, ValueError) as e:
        print(f"Error generating URL for '{name}': {e}")
        return "https://www.pro-football-reference.com/players/Unknown/Unknown00.htm"


In [118]:
# List to store each player's data as a dictionary
all_player_data = []

# Function to merge player data into a dictionary
def merge_player_data_dicts(name, qb_header_data, summary_stats, recognition_data, faq_data):
    player_data = {
        'name': name  # Add the player's name as a column for reference
    }
    player_data.update(qb_header_data)
    player_data.update(summary_stats)
    player_data.update(recognition_data)
    player_data.update(faq_data)
    player_data.update(qb_summary)

    all_player_data.append(player_data)

In [119]:
#  merge Values from 4 dictionaries (qb_header_data, summary_stats, recognition_data, faq_data) into single nested dictionary, 
#  with values flattened under one key (player's name).  (Not as sub-dictionaries).
                                      
# loop through each name, get the values from the sub-dictionaries, and populate 
# an entry in the quarterback_dict with their entry 

import requests
from bs4 import BeautifulSoup
import json

# function fetches a webpage, then checks the HTTP status code. If successful (200), it returns a BeautifulSoup object for parsing. 
#   Otherwise, it returns None and prints an error message. This will allow you to detect and handle failed page requests in your main loop.
def get_soup(url):
    print(f'\tFetching response for: {url}')
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.content, 'html.parser')
    elif response.status_code == 404:
        print(f"\tPage not found: {url}")
        return "not found"
    else:
        print(f"\tFailed to retrieve {url}. Status code: {response.status_code}")
        return None
    
# function attempts to verify if the page content includes the player's name. 
#   It first checks for a <script> tag containing JSON-LD data, which may contain a "name" field. 
#   If not found, it falls back to the <title> tag. 
#   If the player’s name is found in either, it returns True; otherwise, it returns False.
def validate_player_name(soup, player_name):
    json_ld_script = soup.find('script', type='application/ld+json')
    if json_ld_script:
        json_ld = json.loads(json_ld_script.string)
        if 'name' in json_ld and player_name.lower() in json_ld['name'].lower():
            return True
    title_tag = soup.find('title')
    if title_tag and player_name.lower() in title_tag.text.lower():
        return True
    return False

In [120]:
# Sample list of players
quarterback_dict = {}
quarterback_fails = []

qb_names = (qb_names_to_1969['Name'].tolist() 
            + qb_names_1970_9['Name'].tolist()
            + qb_names_1980_9['Name'].tolist()
            + qb_names_1990_9['Name'].tolist()
            + qb_names_2000_9['Name'].tolist()
            + qb_names_2010_9['Name'].tolist()
            + qb_names_2020_['Name'].tolist() )

print(qb_names)
print(len(qb_names))

['Greg Cook', 'Marty Domres', 'Terry Hanratty', 'Bobby Douglass', 'Al Woodall', 'Onree Jackson', 'James Harris', 'Sam Havrilak', 'Larry Good', 'Sonny Wade', 'Bob Naponic', 'Alan Pastrana', 'Brian Dowling', 'Tim Carr', 'Bob Belden', 'Tom Boutwell', 'Ed Roseborough', 'Ed Hargett', "Buster O'Brien", 'Greg Landry', 'Eldridge Dickey', 'Gary Beban', 'Mike Livingston', 'Ken Stabler', 'Gary Davis', 'Billy Stevens', 'Ronnie South', 'Kim Hammond', 'Dewey Warren', 'John Schneider', 'Danny Holman', 'Greg Barton', 'Kim King', 'Jim Alcorn', 'Henry Johnson', 'Dan Darragh', 'Marlin Briscoe', 'Jim Haynie', 'Jeff Beaver', 'Jimmy Raye', 'Bob Lee', 'Steve Spurrier', 'Bob Griese', 'Don Horn', 'Bob David', 'Vidal Carlin', 'Terry Southall', 'Tim Jones', 'Virgil Carter', 'Rick Egloff', 'Corey Colehour', 'Bruce Matte', 'John Foruria', 'Steve Laub', 'Paul Krause', 'Dave Bennett', 'Ben Monroe', 'Jack Lentz', 'Paul Brothers', 'Bill Buckner', 'Bob Biletnikoff', 'Dan Talbott', 'Randy Johnson', 'Rick Norton', 'Gary 

In [121]:
import time

# Iterate over the list of player names
for idx, name in enumerate(qb_names):
    # if idx > 3:  # Limiting the number of players for testing
    #     break
    print(f'\nPlayer ({idx}): {name}')
    valid_player = False

    # Attempt to find a valid page for the player
    for page_num in range(5):  # Check pages 00, 01, and 02
        time.sleep(6 + (page_num * 2))  # Incremental delay to avoid rate-limiting
        url = create_url(name, page_num)
        soup = get_soup(url)

        if soup == "not found":
            # Break out of the loop since we don't want to try other page numbers
            print(f"\tPage not found for {name} at {url}. Skipping further page attempts.")
            quarterback_fails.append(name)
            break

# Validate_player_name checks if page is the correct one. If it matches, data is gathered and added to main list.
        elif soup and validate_player_name(soup, name):
            print(f"** Successfully retrieved data for {name} at URL {page_num}: {url} **")
# Retrieve data sections
            qb_header_data = get_qb_header_data(soup)
            summary_stats = get_summary_stats(soup)
            recognition_data = get_recognition(soup)
            faq_data = get_faq(soup)
            qb_summary = get_qb_summary_data(soup)
# Merge player data and add to list
            merge_player_data_dicts(name, qb_header_data, summary_stats, recognition_data, faq_data)
            valid_player = True
            break
        else:
            print(f"\txx Failed to retrieve valid data for {name} with page number {str(page_num).zfill(2)} xx")
    
    if not valid_player:
        quarterback_fails.append(name)
        print(f"\tNo valid data found for {name} after checking all pages")

# After all players have been processed, pd.DataFrame(all_player_data) creates a single large DataFrame with all player data.
combined_df = pd.DataFrame(all_player_data)

# Display the combined DataFrame and list of failures
# print(combined_df)
print("Pages not found:", quarterback_fails)



Player (0): Greg Cook
Fetching response for: https://www.pro-football-reference.com/players/C/CookGr00.htm
	Successfully retrieved data for Greg Cook at URL 0: https://www.pro-football-reference.com/players/C/CookGr00.htm
Weighted Career AV: 10, Overall Rank: 8386


  college_tag = soup.find('strong', text='College')
  high_school_tag = soup.find('strong', text='High School')


Player (1): Marty Domres
Fetching response for: https://www.pro-football-reference.com/players/D/DomrMa00.htm
	Successfully retrieved data for Marty Domres at URL 0: https://www.pro-football-reference.com/players/D/DomrMa00.htm
Weighted Career AV: 19, Overall Rank: 5811
Player (2): Terry Hanratty
Fetching response for: https://www.pro-football-reference.com/players/H/HanrTe00.htm
	Successfully retrieved data for Terry Hanratty at URL 0: https://www.pro-football-reference.com/players/H/HanrTe00.htm
Weighted Career AV: 8, Overall Rank: 9197
Player (3): Bobby Douglass
Fetching response for: https://www.pro-football-reference.com/players/D/DougBo00.htm
	Successfully retrieved data for Bobby Douglass at URL 0: https://www.pro-football-reference.com/players/D/DougBo00.htm
Weighted Career AV: 37, Overall Rank: 2916
Player (4): Al Woodall
Fetching response for: https://www.pro-football-reference.com/players/W/WoodAl00.htm
	Successfully retrieved data for Al Woodall at URL 0: https://www.pro-fo

In [None]:
display(f'rows : {len(combined_df)}; cols: {len(combined_df.columns)}')
display(combined_df.iloc[:,:15].tail(5))
display(combined_df.iloc[:,15:30].tail(5))
display(combined_df.iloc[:,30:45].tail(5))
display(combined_df.iloc[:,45:].tail(5))


## PAUSE POINT: save dataframe and list to CSV and pickle

In [138]:
#Export csv

import csv

# combined_df     (DataFrame)
csv_path =  "../sourcing_artifacts/pro_fb_ref_output_combinedxx.csv"
combined_df.to_csv(csv_path, index=False)

# quarterback_fails  (list)
list_path = "../sourcing_artifacts/pro_fb_ref_output_failsxx.csv"
# Write the list to the CSV file as a single comma-separated row
with open(list_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(quarterback_fails)  # writerow instead of writerows to save as a single row


In [139]:
# Save as Pickle; pickle.dump to store data

import pickle

# combined_df     (DataFrame)
pkl_path1 = "../sourcing_artifacts/pfb_ref_sourcing_outputxx.pkl"
with open(pkl_path1, 'wb') as file:
    pickle.dump(combined_df, file)


# quarterback_fails  (list)
pkl_path2 = "../sourcing_artifacts/pfb_ref_sourcing_outputxx.pkl"
with open(pkl_path2, 'wb') as file:
    pickle.dump(quarterback_fails, file)

### ** ALL REMAINING CELLS: NOT USED **


## RESTORE POINT: load/restore dataframe and list from CSV and pickle

In [140]:
# Read the CSVs

# combined_df     (DataFrame)
csv_path = "../sourcing_artifacts/pfb_ref_sourcing_output_combined.csv"
combined_df_restore = pd.read_csv(csv_path)

display(f'rows : {len(combined_df_restore)}; cols: {len(combined_df_restore.columns)}')
display(combined_df_restore.head())


# quarterback_fails  (list)
list_path = "../sourcing_artifacts/pfb_ref_sourcing_output_fails.csv"
quarterback_fails_restore = []
# use next(reader) to retrieve the single row as a list, which simplifies the code and keeps quarterback_fails_restore as a flat list of items. 
#   Each item will correspond to an element from the original quarterback_fails list.
with open(list_path, mode='r') as file:
    reader = csv.reader(file)
    quarterback_fails_restore = next(reader)  # Using next() to get the first row as a single list

print(f'length: {len(quarterback_fails_restore)}')
print(f'restored {quarterback_fails_restore}')


'rows : 656; cols: 58'

Unnamed: 0,name,throws,hall_of_fame,college,high_school,weighted_career_av,overall_rank,college_stats,draft_team,draft_round,...,pass_rating,pass_sacked,pass_sacked_yds,pass_sacked_pct,pass_net_yds_per_att,pass_adj_net_yds_per_att,comebacks,gwd,av,nickname
0,Greg Cook,Right,,Cincinnati,Chillicothe,10.0,8386.0,https://www.sports-reference.com/cfb/players/g...,Cincinnati Bengals,1.0,...,87.6,29.0,195.0,12.66,7.29,6.44,1.0,0.0,10.0,
1,Marty Domres,Right,8842.0,Columbia,Christian Brothers Academy,19.0,5811.0,https://www.sports-reference.com/cfb/players/m...,San Diego Chargers,1.0,...,53.8,79.0,588.0,8.9,4.86,2.93,3.0,2.0,21.0,
2,Terry Hanratty,Right,8842.0,Notre Dame,Butler,8.0,9197.0,https://www.sports-reference.com/cfb/players/t...,Pittsburgh Steelers,2.0,...,43.0,34.0,299.0,7.31,4.75,2.4,3.0,2.0,8.0,
3,Bobby Douglass,Left,8842.0,Kansas,El Dorado,37.0,2916.0,https://www.sports-reference.com/cfb/players/b...,Chicago Bears,2.0,...,48.5,180.0,1335.0,13.25,3.8,2.21,3.0,4.0,41.0,
4,Al Woodall,Right,,Duke,Erwin,13.0,7362.0,https://www.sports-reference.com/cfb/players/a...,New York Jets,2.0,...,60.3,60.0,455.0,10.66,4.47,3.27,1.0,1.0,13.0,


length: 806
restored ['Onree Jackson', 'Onree Jackson', 'Larry Good', 'Larry Good', 'Sonny Wade', 'Sonny Wade', 'Alan Pastrana', 'Alan Pastrana', 'Tim Carr', 'Tim Carr', 'Bob Belden', 'Bob Belden', 'Tom Boutwell', 'Tom Boutwell', 'Ed Roseborough', 'Ed Roseborough', 'Ed Hargett', 'Ed Hargett', "Buster O'Brien", "Buster O'Brien", 'Ronnie South', 'Ronnie South', 'John Schneider', 'John Schneider', 'Danny Holman', 'Danny Holman', 'Greg Barton', 'Greg Barton', 'Kim King', 'Kim King', 'Jim Alcorn', 'Jim Alcorn', 'Henry Johnson', 'Henry Johnson', 'Jim Haynie', 'Jim Haynie', 'Jeff Beaver', 'Jeff Beaver', 'Jimmy Raye', 'Jimmy Raye', 'Bob Lee', 'Bob Lee', 'Bob David', 'Bob David', 'Vidal Carlin', 'Vidal Carlin', 'Terry Southall', 'Terry Southall', 'Rick Egloff', 'Rick Egloff', 'Corey Colehour', 'Corey Colehour', 'Bruce Matte', 'Bruce Matte', 'John Foruria', 'John Foruria', 'Steve Laub', 'Steve Laub', 'Dave Bennett', 'Dave Bennett', 'Ben Monroe', 'Ben Monroe', 'Jack Lentz', 'Jack Lentz', 'Paul Br

In [141]:
# Read the PKls

# combined_df     (DataFrame)
pkl_path1 = "../sourcing_artifacts/pfb_ref_sourcing_output_combined.pkl"
with open(pkl_path1, 'rb') as f:
    combined_df_restore = pickle.load(f)

display(f'rows : {len(combined_df_restore)}; cols: {len(combined_df_restore.columns)}')
display(combined_df_restore.head())


# quarterback_fails  (list)
pkl_path2 = "../sourcing_artifacts/pfb_ref_sourcingoutput_fails.pkl"
with open(pkl_path2, 'rb') as f:
    quarterback_fails_restore = pickle.load(f)

display(f'length: {len(quarterback_fails_restore)}')
display(f'restored {quarterback_fails_restore}')

'rows : 656; cols: 58'

Unnamed: 0,name,throws,hall_of_fame,college,high_school,weighted_career_av,overall_rank,college_stats,draft_team,draft_round,...,pass_rating,pass_sacked,pass_sacked_yds,pass_sacked_pct,pass_net_yds_per_att,pass_adj_net_yds_per_att,comebacks,gwd,av,nickname
0,Greg Cook,Right,,Cincinnati,Chillicothe,10,8386,https://www.sports-reference.com/cfb/players/g...,Cincinnati Bengals,1,...,87.6,29,195,12.66,7.29,6.44,1,0,10,
1,Marty Domres,Right,8842.0,Columbia,Christian Brothers Academy,19,5811,https://www.sports-reference.com/cfb/players/m...,San Diego Chargers,1,...,53.8,79,588,8.9,4.86,2.93,3,2,21,
2,Terry Hanratty,Right,8842.0,Notre Dame,Butler,8,9197,https://www.sports-reference.com/cfb/players/t...,Pittsburgh Steelers,2,...,43.0,34,299,7.31,4.75,2.4,3,2,8,
3,Bobby Douglass,Left,8842.0,Kansas,El Dorado,37,2916,https://www.sports-reference.com/cfb/players/b...,Chicago Bears,2,...,48.5,180,1335,13.25,3.8,2.21,3,4,41,
4,Al Woodall,Right,,Duke,Erwin,13,7362,https://www.sports-reference.com/cfb/players/a...,New York Jets,2,...,60.3,60,455,10.66,4.47,3.27,1,1,13,


'length: 806'

'restored [\'Onree Jackson\', \'Onree Jackson\', \'Larry Good\', \'Larry Good\', \'Sonny Wade\', \'Sonny Wade\', \'Alan Pastrana\', \'Alan Pastrana\', \'Tim Carr\', \'Tim Carr\', \'Bob Belden\', \'Bob Belden\', \'Tom Boutwell\', \'Tom Boutwell\', \'Ed Roseborough\', \'Ed Roseborough\', \'Ed Hargett\', \'Ed Hargett\', "Buster O\'Brien", "Buster O\'Brien", \'Ronnie South\', \'Ronnie South\', \'John Schneider\', \'John Schneider\', \'Danny Holman\', \'Danny Holman\', \'Greg Barton\', \'Greg Barton\', \'Kim King\', \'Kim King\', \'Jim Alcorn\', \'Jim Alcorn\', \'Henry Johnson\', \'Henry Johnson\', \'Jim Haynie\', \'Jim Haynie\', \'Jeff Beaver\', \'Jeff Beaver\', \'Jimmy Raye\', \'Jimmy Raye\', \'Bob Lee\', \'Bob Lee\', \'Bob David\', \'Bob David\', \'Vidal Carlin\', \'Vidal Carlin\', \'Terry Southall\', \'Terry Southall\', \'Rick Egloff\', \'Rick Egloff\', \'Corey Colehour\', \'Corey Colehour\', \'Bruce Matte\', \'Bruce Matte\', \'John Foruria\', \'John Foruria\', \'Steve Laub\', \'Steve L

In [123]:
display(combined_df.iloc[:,:15].tail(5))
display(combined_df.iloc[:,15:30].tail(5))
display(combined_df.iloc[:,30:45].tail(5))
display(combined_df.iloc[:,45:].tail(5))
display(len(quarterback_fails))

Unnamed: 0,name,throws,hall_of_fame,college,high_school,weighted_career_av,overall_rank,college_stats,draft_team,draft_round,draft_pick,draft_year,games_played,approximate_value,qb_record
651,Jake Luton,Right,,Idaho,,2.0,13593.0,https://www.sports-reference.com/cfb/players/j...,Jacksonville Jaguars,6,189,2020,3.0,2.0,0-3-0
652,Cole McDonald,,,Hawaii,,,,https://www.sports-reference.com/cfb/players/c...,Tennessee Titans,7,224,2020,,,
653,Ben DiNucci,Right,,Pittsburgh,Pine-Richland,1.0,15158.0,https://www.sports-reference.com/cfb/players/b...,Dallas Cowboys,7,231,2020,3.0,1.0,0-1-0
654,Tommy Stevens,Right,,Penn St.,,0.0,17659.0,https://www.sports-reference.com/cfb/players/t...,New Orleans Saints,7,240,2020,1.0,,
655,Nate Stanley,,,Iowa,,,,https://www.sports-reference.com/cfb/players/n...,Minnesota Vikings,7,244,2020,,,


Unnamed: 0,completion_percentage,passing_yards,yards_per_attempt,touchdowns,interceptions,fantasy_points,all_star,uniforms,when_born,where_born,height,weight,pass_yards,TD,superbowls
651,54.5,624.0,5.7,2.0,6.0,28.3,,,"April 11, 1996","Marysville, WA",6-6,224,624.0,2.0,
652,,,,,,,,,"May 20, 1998",,6-3,215,,,
653,53.5,219.0,5.1,0.0,0.0,7.0,,,"November 24, 1996","Atlanta, GA",6-2,215,219.0,,
654,,,,,,,,,"December 15, 1996","Indianapolis, IN",6-5,235,,,
655,,,,,,,,,"August 26, 1997",,6-4,235,,,


Unnamed: 0,retire_year,years,games,games_started,qb_rec,pass_cmp,pass_att,pass_cmp_pct,pass_yds,pass_td,pass_td_pct,pass_int,pass_int_pct,pass_long,pass_yds_per_att
651,2020.0,1 Yr,3.0,3.0,0-3-0,60.0,110.0,54.5,624.0,2.0,1.8,6.0,5.5,31.0,41.0
652,,,,,,,,,,,,,,,
653,,1 Yr,3.0,1.0,0-1-0,23.0,43.0,53.5,219.0,0.0,0.0,0.0,0.0,11.0,34.0
654,2020.0,1 Yr,1.0,0.0,4,24.0,0.0,2.0,100.0,10.0,6.0,24.0,4.0,0.0,0.0
655,,,,,,,,,,,,,,,


Unnamed: 0,pass_adj_yds_per_att,pass_yds_per_cmp,pass_yds_per_g,pass_rating,pass_sacked,pass_sacked_yds,pass_sacked_pct,pass_net_yds_per_att,pass_adj_net_yds_per_att,comebacks,gwd,av,nickname
651,73.0,5.7,3.58,10.4,208.0,54.5,,7.0,52.0,5.98,4.89,2.92,
652,,,,,,,,,,,,,
653,32.0,5.1,5.09,9.5,73.0,67.9,,7.0,73.0,14.0,2.92,2.92,
654,0.0,0.0,0.0,0.0,0.0,4.0,6.0,24.0,0.0,0.0,0.0,,
655,,,,,,,,,,,,,


806

### Bring in nfl_career_statistics.csv  to get QB names, etc.

In [162]:
# get lists of QBs to get data for:
rel_path = "../Data_Artifacts/nfl_career_statistics.csv"
qb_name_df = pd.read_csv(rel_path)
print(len(qb_name_df))

qb_name_df.head()

# import os
# absolute_path = os.path.abspath(rel_path)
# print(f"Absolute path: {absolute_path}")

# # Check if the file exists and open it with the default application
# if os.path.exists(absolute_path):
#     os.startfile(absolute_path)
#     print('Opened with default application.')
# else:
#     print(f'File not found at: {absolute_path}')

637


Unnamed: 0,player,drafted,yrs_played,teams,hof,wAV,earnings_mils,comp_%,pass_rating,comp,att,pass_yds,TD,pass_yds_game,int,int_%,pick_6,sacks,4QC,GWD
0,Tom Brady,2000.0,23.0,"['Patriots', 'Buccaneers']",0.0,184.0,317.62,64.3,97.2,7753.0,12050.0,89214.0,649.0,266.0,212.0,1.8,18.0,565.0,46.0,58.0
1,Drew Brees,2001.0,20.0,"['Saints', 'Chargers']",0.0,167.0,273.933,67.7,98.7,7142.0,10551.0,80358.0,571.0,280.0,243.0,2.3,27.0,420.0,36.0,53.0
2,Peyton Manning,1998.0,18.0,"['Colts', 'Broncos']",1.0,176.0,247.714,65.3,96.5,6125.0,9380.0,71940.0,539.0,270.0,251.0,2.7,27.0,303.0,43.0,54.0
3,Brett Favre,1991.0,20.0,"['Packers', 'Vikings', 'Jets', 'Falcons']",1.0,158.0,141.407,62.0,86.0,6300.0,10169.0,71838.0,508.0,237.0,336.0,3.3,32.0,525.0,28.0,43.0
4,Ben Roethlisberger,2004.0,18.0,['Steelers'],0.0,131.0,266.724,64.4,93.5,5440.0,8443.0,64088.0,418.0,257.0,211.0,2.5,18.0,554.0,41.0,53.0
