In [1]:
# Scraping data with Selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from datetime import datetime
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time

In [2]:
# Website domain names (the part between "www." and ".com") of the team sites
# that collect_nfl_team_stats requests, one entry per team.
NFL_TEAMS = [
    "azcardinals", "atlantafalcons", "baltimoreravens", "buffalobills",
    "panthers", "chicagobears", "bengals", "clevelandbrowns",
    "dallascowboys", "denverbroncos", "detroitlions", "packers",
    "houstontexans", "colts", "jaguars", "chiefs",
    "raiders", "chargers", "therams", "miamidolphins",
    "vikings", "patriots", "neworleanssaints", "giants",
    "newyorkjets", "philadelphiaeagles", "steelers", "49ers",
    "seahawks", "buccaneers", "tennesseetitans", "commanders",
]


In [3]:
def new_driver():
    '''
    Builds a Chrome webdriver that is started with the "--headless" argument.

    Returns:
        driver (selenium)
    '''
    headless_options = Options()
    headless_options.add_argument("--headless")
    return webdriver.Chrome(options=headless_options)

In [4]:
def scrape_team(soup):
    '''
    Builds a {stat name: value} mapping from the first
    'nfl-o-team-h2h-stats__list' element of a parsed team stats page.

    Stat names come from the <span> labels inside that list. A label outside
    a descriptor group yields two keys, '<label>' and '<label> Opponent'. The
    children of a descriptor (see `descriptors` below) yield
    '<descriptor> <child>' keys followed by the matching
    '<descriptor> <child> Opponent' keys. Values are read from the
    'nfl-o-team-h2h-stats__value' <div>s and paired with the names in order.

    Args:
        soup (BeautifulSoup): parsed page source

    Returns:
        stats_dict (dict): stat name -> scraped text; empty when the page has
        no stats list
    '''
    stat_lists = soup.find_all('ul', class_='nfl-o-team-h2h-stats__list')
    if not stat_lists:
        return {}
    html = stat_lists[0]

    # scraping stat names
    labels = []
    rel_tags = ['nfl-o-team-h2h-stats__label--full',  'nfl-o-team-h2h-stats__label--first-child nfl-o-team-h2h-stats__label--child',
    'nfl-o-team-h2h-stats__label--child', 'nfl-o-team-h2h-stats__label--last-child nfl-o-team-h2h-stats__label--child']
    # dict describing how many children stats a descriptor has
    descriptors = {'FIRST DOWNS': 3, 'OFFENSE': 2, 'RUSHING': 2, 'PASSING': 4, 'TDs': 4}
    for tag in html.find_all('span'):
        tag_classes = tag.get('class')
        # keep spans with no class at all, or with one of the label classes
        if not tag_classes or any(cls in rel_tags for cls in tag_classes):
            text = tag.get_text(strip=True)
            # numeric text is a value, not a stat name
            if not is_number(text):
                labels.append(text)

    # NOTE(review): this position is overwritten with 'TDs' so that it matches
    # the 'TDs' key in `descriptors`; presumably the page uses a different
    # header text there - confirm against the live markup.
    # Guarded so that a list with fewer labels no longer raises IndexError.
    tds_label_index = 28
    if len(labels) > tds_label_index:
        labels[tds_label_index] = 'TDs'

    # organizing stat names to be keys
    new_labels = []
    descriptor_counter = 0  # children still expected for the current descriptor
    descriptor = ''
    sub_list = []
    for label in labels:
        if label in ['Completions', 'Attempts', 'Interceptions', 'Average']:
            # these labels never become keys
            continue
        elif label in descriptors and descriptor_counter == 0:
            # start of a descriptor group: remember it and expect its children
            descriptor_counter += descriptors[label]
            descriptor = label
        elif descriptor_counter == 0:
            new_labels.extend(with_opponent(label))
        else:
            new_labels.append(descriptor + ' ' + label)
            sub_list.append(label)
            descriptor_counter -= 1
            if descriptor_counter == 0:
                # group complete: the opponent keys follow all the team keys
                for sub in sub_list:
                    new_labels.append(descriptor + ' ' + sub + ' Opponent')
                sub_list = []

    # getting the values of the stats
    value_elements = html.find_all('div', class_='nfl-o-team-h2h-stats__value')
    values = []
    for value_element in value_elements:
        if value_element.span:
            # one value per <span> inside the element
            span_values = [span.get_text(strip=True) for span in value_element.find_all('span')]
            values.extend(span_values)
        else:
            # no <span> tags: the element's own text is the value
            value_text = value_element.get_text(strip=True)
            values.append(value_text)

    # zip stops at the shorter sequence, so unmatched names or values are dropped
    stats_dict = dict(zip(new_labels, values))
    return stats_dict

In [5]:
def collect_team(driver):
    '''
    Parses the page currently loaded in the driver and returns its team stats.

    Args:
        driver (selenium): driver whose page_source is parsed

    Returns:
        team_stats (dict): result of scrape_team for that page
    '''
    parsed_page = BeautifulSoup(driver.page_source, 'html.parser')
    return scrape_team(parsed_page)

In [61]:
def collect_nfl_team_stats(start_year):
    '''
    Collects yearly NFL team stats from each team's
    '/team/stats/{year}/REG' page.

    One page is requested per team in NFL_TEAMS and per year, for every year
    from start_year up to (but not including) the current year.

    Args:
        start_year (int): first year to collect

    Returns:
        team_stats (pd.DataFrame): one row per (team, year) holding 'team',
        'year' and the stats returned by collect_team; rows are numbered
        0..n-1
    '''
    driver = new_driver()
    # rows are gathered in a list and turned into a DataFrame once at the end
    records = []
    try:
        for team in NFL_TEAMS:
            for year in range(start_year, datetime.now().year):
                url = f'https://www.{team}.com/team/stats/{year}/REG'
                try:
                    driver.get(url)
                except Exception:
                    # retry once on a fresh driver; a second failure propagates
                    print("Error occurred. Creating a new driver instance...")
                    driver.quit()
                    driver = new_driver()
                    driver.get(url)
                stats_dict = {'team': team, 'year': year}
                stats_dict.update(collect_team(driver))
                records.append(stats_dict)
    finally:
        # always release the browser, including when a page load or parse fails
        driver.quit()

    return pd.DataFrame(records)

In [8]:
def is_number(string):
    '''
    Reports whether `string` can be converted with float().

    Args:
        string: value to test (scrape_team passes scraped text)

    Returns:
        bool: True when float(string) succeeds; False when it raises
        ValueError (non-numeric text) or TypeError (e.g. None)
    '''
    try:
        float(string)
    except (ValueError, TypeError):
        return False
    return True


In [9]:
def with_opponent(stat):
    '''
    Pairs a stat name with its opponent counterpart.

    Args:
        stat (str): stat name

    Returns:
        list: [stat, stat followed by ' Opponent']
    '''
    opponent_stat = stat + ' Opponent'
    return [stat, opponent_stat]

In [50]:
def scrape_fp(pos, year):
    '''
    Scrapes a single year of fantasy stats for a position from FantasyPros.

    Args:
        pos (str): position code used in the URL ('qb', 'rb', 'wr', 'te', 'k', 'dst')
        year (int): year to request

    Returns:
        df (pd.DataFrame): first table found on the page, with 'year' and
        'position' columns added
    '''
    # '&' separates the two query parameters; without it the year value
    # becomes '<year>scoring=HALF' and the scoring parameter is never sent
    url = f'https://www.fantasypros.com/nfl/stats/{pos}.php?year={year}&scoring=HALF'
    df = pd.read_html(url)[0]

    # Flatten multi-level headers into single names, skipping the 'Unnamed'
    # placeholder levels. The MultiIndex check keeps flat string labels from
    # being iterated character by character.
    if pos in ['rb', 'wr', 'te', 'qb'] and isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            ' '.join(level for level in col if 'Unnamed' not in level)
            for col in df.columns
        ]

    df['year'] = year
    df['position'] = pos
    return df

In [51]:
def scrape_fantasy_pros(start_year=2002):
    '''
    Scrapes all relevant seasons for each position.

    Calls scrape_fp for every year from start_year up to (but not including)
    the current year, in the position order qb, rb, wr, te, k, dst.

    Args:
        start_year (int): first year to scrape

    Returns:
        tuple of six pd.DataFrame: (qb, rb, wr, te, k, dst), each the
        concatenation of that position's yearly frames; every frame is empty
        when the year range is empty
    '''
    positions = ('qb', 'rb', 'wr', 'te', 'k', 'dst')
    # yearly frames are collected per position and concatenated once at the end
    yearly_frames = {pos: [] for pos in positions}
    for year in range(start_year, datetime.now().year):
        for pos in positions:
            yearly_frames[pos].append(scrape_fp(pos, year))

    # pd.concat rejects an empty list, so fall back to an empty frame
    return tuple(
        pd.concat(yearly_frames[pos]) if yearly_frames[pos] else pd.DataFrame()
        for pos in positions
    )
            

In [73]:
def scrape_consensus_rankings():
    '''
    Scrapes the FantasyPros rankings pages for the three scoring formats
    listed in scoring_dict.

    A separate headless driver is used for each page and is always quit
    afterwards, including when scraping that page fails.

    Returns:
        final_data (pd.DataFrame): columns 'Scoring', 'Rank', 'Player', 'Team'
        and 'Position'; the row index restarts at 0 for each scoring format

    Raises:
        IndexError: when no '#ranking-table' element is found on a page
    '''
    # URL slug -> value stored in the 'Scoring' column
    scoring_dict = {'consensus-cheatsheets': 'Standard Scoring',
                    'half-point-ppr-cheatsheets': 'Half PPR',
                    'ppr-cheatsheets': 'Full PPR'}
    table_locator = (By.CSS_SELECTOR, "#ranking-table")
    frames = []
    for scoring_url, scoring_name in scoring_dict.items():
        driver = new_driver()
        try:
            driver.get(f'https://www.fantasypros.com/nfl/rankings/{scoring_url}.php')
            # Wait for up to 10 seconds for the rankings table to appear
            driver.implicitly_wait(10)
            table = driver.find_elements(*table_locator)
            if not table:
                # same exception type as the former table[0] lookup, with a clear message
                raise IndexError(f'ranking table not found for {scoring_url}')
            table_element = table[0]

            # Scroll down until the table height stops growing, to load all the records
            last_height = driver.execute_script("return arguments[0].scrollHeight", table_element)
            while True:
                driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight)", table_element)
                time.sleep(2)  # Adjust sleep time if needed
                new_height = driver.execute_script("return arguments[0].scrollHeight", table_element)
                if new_height == last_height:
                    break
                last_height = new_height

            # Scrape the rankings
            rankings = []
            for row in table_element.find_elements(By.CSS_SELECTOR, "tbody tr.player-row"):
                rank = row.find_element(By.CSS_SELECTOR, "td.sticky-cell-one").text
                player = row.find_element(By.CSS_SELECTOR, "td:nth-child(3) div.player-cell a").text
                team = row.find_element(By.CSS_SELECTOR, "td:nth-child(3) div.player-cell span.player-cell-team").text.strip("()")
                pos = row.find_element(By.CSS_SELECTOR, "td:nth-child(4)").text
                rankings.append({"Scoring": scoring_name, "Rank": rank, "Player": player, "Team": team, "Position": pos})
            frames.append(pd.DataFrame(rankings))
        finally:
            # Close this page's browser before moving on
            driver.quit()

    return pd.concat(frames)

In [77]:
# Display the scraped rankings. `consensus_rankings` is assigned by the
# scrape_consensus_rankings() call that appears in a later cell, so that cell
# has to be run before this one.
consensus_rankings

Unnamed: 0,Scoring,Rank,Player,Team,Position
0,Standard Scoring,1,Christian McCaffrey,SF,RB1
1,Standard Scoring,2,Bijan Robinson,ATL,RB2
2,Standard Scoring,3,Austin Ekeler,LAC,RB3
3,Standard Scoring,4,Jonathan Taylor,IND,RB4
4,Standard Scoring,5,Nick Chubb,CLE,RB5
...,...,...,...,...,...
498,Full PPR,499,Andrei Iosivas,CIN,WR163
499,Full PPR,500,C.J. Ham,MIN,RB142
500,Full PPR,501,Tyler Badie,DEN,RB143
501,Full PPR,502,Brandon Powell,MIN,WR164


In [74]:
# Scrape the rankings for all three scoring formats (opens a headless Chrome browser).
consensus_rankings = scrape_consensus_rankings()

In [None]:
# Scrape team stats for every team in NFL_TEAMS, from 2017 up to (not including) the current year.
nfl_team_stats = collect_nfl_team_stats(2017)

In [58]:
# Scrape the per-position fantasy stats, starting from the default start_year (2002).
qb, rb, wr, te, k, dst = scrape_fantasy_pros()

In [76]:
# consensus_rankings.to_csv('consensus_rankings.csv', index=False)

In [71]:
# qb.to_csv('qb_stats.csv', index=False)
# rb.to_csv('rb_stats.csv', index=False)
# wr.to_csv('wr_stats.csv', index=False)
# te.to_csv('te_stats.csv', index=False)
# k.to_csv('k_stats.csv', index=False)
# dst.to_csv('dst_stats.csv', index=False)

In [69]:
# nfl_team_stats.to_csv('nfl_team_stats.csv')

In [18]:
import os
from dotenv import load_dotenv
import openai

# Read the variables defined in the .env file into the environment
load_dotenv()

# Look the key up in the environment, then register it with the OpenAI library
api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = api_key

# Rest of your code that uses the OpenAI API


In [19]:
# Report only whether the key was found; printing the key itself would store
# the secret in the saved notebook output.
print("OPENAI_API_KEY loaded" if api_key else "OPENAI_API_KEY is not set")

[REDACTED: API key removed from the saved output. Revoke and rotate this key.]


## Using LangChain to build a chatbot

In [None]:
from langchain.agents import create_pandas_dataframe_agent
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType

In [None]:
from langchain.llms import OpenAI
import pandas as pd

# Reload each dataset from a CSV file in the working directory. The file names
# match the targets of the commented-out to_csv calls earlier in the notebook,
# so presumably those exports were run once to create the files.
nfl_team_stats = pd.read_csv('nfl_team_stats.csv')
consensus_rankings = pd.read_csv('consensus_rankings.csv')
qb_stats = pd.read_csv('qb_stats.csv')
rb_stats = pd.read_csv('rb_stats.csv')
wr_stats = pd.read_csv('wr_stats.csv')
te_stats = pd.read_csv('te_stats.csv')
k_stats = pd.read_csv('k_stats.csv')
dst_stats = pd.read_csv('dst_stats.csv')


In [None]:
# A single agent over all eight DataFrames, using an OpenAI LLM at temperature 0
agent = create_pandas_dataframe_agent(
    OpenAI(temperature=0),
    [
        nfl_team_stats,
        consensus_rankings,
        qb_stats,
        rb_stats,
        wr_stats,
        te_stats,
        k_stats,
        dst_stats,
    ],
    verbose=True,
)

In [None]:
# Ask the agent a free-text question about the loaded DataFrames.
agent.run('who is the best fantasy qb this year?')