In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm
  device: Optional[torch.device] = torch.device("cuda"),


In [3]:
# Scraping data with Selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from datetime import datetime
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time

In [None]:
# def generate_response(user_input, fantasy_data):
#     # Tokenize user input
#     input_ids = tokenizer.encode(user_input, return_tensors='pt')

#     # Generate response using the model, incorporating fantasy data
#     # Pass the fantasy data as input to the model along with the user input
#     output = model.generate(input_ids, max_length=100, num_return_sequences=1,
#                             no_repeat_ngram_size=2, fantasy_data=fantasy_data)

#     # Decode and return the response
#     response = tokenizer.decode(output[0], skip_special_tokens=True)
#     return response

In [None]:
# from transformers import GPT2LMHeadModel

# class FantasyGPT2Model(GPT2LMHeadModel):
#     def forward(self, input_ids, fantasy_data=None, **kwargs):
#         # Embed the fantasy data and concatenate it with the input embeddings
#         fantasy_embeddings = self.fantasy_embedding_layer(fantasy_data)
#         input_embeddings = self.transformer.wte(input_ids)
#         combined_embeddings = torch.cat([input_embeddings, fantasy_embeddings], dim=1)

#         # Pass the combined embeddings through the transformer
#         transformer_outputs = self.transformer(inputs_embeds=combined_embeddings, **kwargs)
#         lm_logits = self.lm_head(transformer_outputs[0])

#         return lm_logits


In [45]:
NFL_TEAMS = [
    "azcardinals",
    "atlantafalcons",
    "baltimoreravens",
    "buffalobills",
    "panthers",
    "chicagobears",
    "bengals",
    "clevelandbrowns",
    "dallascowboys",
    "denverbroncos",
    "detroitlions",
    "packers",
    "houstontexans",
    "colts",
    "jaguars",
    "chiefs",
    "raiders",
    "chargers",
    "therams",
    "miamidolphins",
    "vikings",
    "patriots",
    "neworleanssaints",
    "giants",
    "newyorkjets",
    "philadelphiaeagles",
    "steelers",
    "49ers",
    "seahawks",
    "buccaneers",
    "tennesseetitans",
    "commanders"
]


In [4]:
def new_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

In [41]:
def collect_nfl_team_stats(start_year):
    '''
    Collecting yearly NFL team stats, individual payer totals

    Args:
        driver (selenium)
        start_year (int)
    
    Returns:
        team_stats (dict)
    '''
    driver = new_driver()
    team_stats = {}
    for team in NFL_TEAMS:
        for year in range(start_year, datetime.now().year):
            try:
                driver.get(f'https://www.{team}.com/team/stats/{year}/REG')
            except Exception:
                print("Error occurred. Creating a new driver instance...")
                driver.quit()
                driver = new_driver()
                driver.get(f'https://www.{team}.com/team/stats/{year}/REG')
            team_stats[team + ' ' + str(year)] = collect_team(driver)

    return team_stats

In [29]:
def scrape_team(soup):
    # scraping stat names
    html = soup.find_all('ul', class_='nfl-o-team-h2h-stats__list')
    if len(html) > 0:
        html = html[0]
    else:
        return {}
    labels = []
    rel_tags = ['nfl-o-team-h2h-stats__label--full',  'nfl-o-team-h2h-stats__label--first-child nfl-o-team-h2h-stats__label--child',
    'nfl-o-team-h2h-stats__label--child', 'nfl-o-team-h2h-stats__label--last-child nfl-o-team-h2h-stats__label--child']
    # dict describing how many children stats a descriptor has
    descriptors = {'FIRST DOWNS': 3, 'OFFENSE': 2, 'RUSHING': 2, 'PASSING': 4, 'TDs': 4}
    for tag in html.find_all('span'):
        tag_classes = tag.get('class')
        if not tag_classes or any(cls in rel_tags for cls in tag_classes):
            text = tag.get_text(strip=True)
            if not is_number(text):
                labels.append(text)
    # organizing stat names to be keys
    labels[28] = 'TDs'
    new_labels = []
    descriptor_counter = 0
    descriptor = ''
    sub_list = []
    for label in labels:
        if label in ['Completions', 'Attempts', 'Interceptions', 'Average']:
            continue
        elif label in descriptors.keys() and descriptor_counter == 0:
            descriptor_counter += descriptors[label]
            descriptor = label
        elif descriptor_counter == 0:
            descriptor_list = []
            new_labels.extend(with_opponent(label))
        else:
            new_labels.append(descriptor + ' ' + label)
            sub_list.append(label)
            descriptor_counter -= 1
            if descriptor_counter == 0:
                for sub in sub_list:
                    new_labels.append(descriptor + ' ' + sub + ' Opponent')
                sub_list = []

    # getting the values of the stats   
    # Find the value and label elements within each element
    value_elements = html.find_all('div', class_='nfl-o-team-h2h-stats__value')
    # Extract and print the values and labels
    values = []
    for value_element in value_elements:
        if value_element.span:
            # Extract each individual value from the <span> tags
            span_values = [span.get_text(strip=True) for span in value_element.find_all('span')]
            values.extend(span_values)
        else:
            # If there are no <span> tags, extract the text directly
            value_text = value_element.get_text(strip=True)
            values.append(value_text)

    stats_dict = dict(zip(new_labels, values))
    return stats_dict

In [30]:
def table_scraper(html):
    data = {}
    headers = None

    for row in html.find_all('tr'):
        cells = row.find_all(['th', 'td'])
        if cells:
            if headers is None:
                headers = [cell.get_text(strip=True) for cell in cells[1:]]
            else:
                key = cells[0].get_text(strip=True)
                values = [cell.get_text(strip=True) for cell in cells[1:]]
                data[key] = dict(zip(headers, values))
    return data

In [31]:
def scrape_player(soup):
    player_stats = soup.find_all('div', class_='nfl-o-teamstats')
    passing = player_stats[0]
    rushing = player_stats[1]
    receiving = player_stats[2]
    player_data = {}
    player_data['passing'] = table_scraper(passing)
    player_data['rushing'] = table_scraper(rushing)
    player_data['receiving'] = table_scraper(receiving)
    return player_data
    

In [32]:
def collect_team(driver):
    '''
    parses html and produces json for the given year and team
    '''
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    team_stats = scrape_team(soup)
    player_stats = scrape_player(soup)
    team_stats.update(player_stats)
    return team_stats

    

In [33]:
def is_number(string):
    try:
        float(string)
        return True
    except ValueError:
        return False


In [34]:
def with_opponent(stat):
    return [stat, stat + ' Opponent']

In [35]:
def scrape_fp(pos, year):
    df = pd.read_html(f'https://www.fantasypros.com/nfl/stats/{pos}.php?year={year}scoring=HALF')[0]
    # Create a new list of column names with concatenated levels
    if pos in ['rb', 'wr', 'te', 'qb']:
        new_columns = []
        for col in df.columns:
            new_col = ''
            for level in col:
                if 'Unnamed' in level:
                    continue
                else:
                    new_col += level + ' '
            
            new_columns.append(new_col[:-1])

        # Assign the new column names to the DataFrame
        df.columns = new_columns
    data = []
    for _, row in df.iterrows():
        player_data = {}
        for column in df.columns:
            player_data[column] = row[column]
        data.append(player_data)
    return data

In [36]:
def scrape_fantasy_pros(start_year=2002):
    final_data = {}
    for year in range(start_year, datetime.now().year):
        for pos in ['rb', 'wr', 'te', 'k', 'dst']:
            final_data[pos + ' ' + str(year)] = (scrape_fp(pos, year))
    return final_data
            

In [25]:
def scrape_consensus_rankings():
    scoring_dict = {'consensus-cheatsheets': 'Standard Scoring',
                    'half-point-ppr-cheatsheets': 'Half PPR',
                    'ppr-cheatsheets': 'Full PPR'}
    final_data = {}
    for scoring_url in ['consensus-cheatsheets', 'half-point-ppr-cheatsheets', 'ppr-cheatsheets']:
        driver = new_driver()
        driver.get(f'https://www.fantasypros.com/nfl/rankings/{scoring_url}.php')
        # Wait for the rankings table to load (assuming it's loaded dynamically)
        table_locator = (By.CSS_SELECTOR, "#ranking-table")
        driver.implicitly_wait(10)  # Wait for up to 10 seconds for the element to appear
        table = driver.find_elements(*table_locator)
        
        # Scroll down to load all the records
        table_element = table[0]
        last_height = driver.execute_script("return arguments[0].scrollHeight", table_element)
        while True:
            driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight)", table_element)
            time.sleep(2)  # Adjust sleep time if needed
            new_height = driver.execute_script("return arguments[0].scrollHeight", table_element)
            if new_height == last_height:
                break
            last_height = new_height

        # Scrape the rankings
        rankings = []
        rows = table[0].find_elements(By.CSS_SELECTOR, "tbody tr.player-row")
        for row in rows:
            rank = row.find_element(By.CSS_SELECTOR, "td.sticky-cell-one").text
            player = row.find_element(By.CSS_SELECTOR, "td:nth-child(3) div.player-cell a").text
            team = row.find_element(By.CSS_SELECTOR, "td:nth-child(3) div.player-cell span.player-cell-team").text.strip("()")
            rankings.append({"Rank": rank, "Player": player, "Team": team})
        final_data[scoring_dict[scoring_url]] = rankings
    # Close the browser
    driver.quit()
    return final_data

In [26]:
consensus_rankings = scrape_consensus_rankings()

In [None]:
nfl_team_stats = collect_nfl_team_stats(2017)

In [48]:
fantasy_pros = scrape_fantasy_pros(2002)

In [None]:
consensus_rankings = scrape_consensus_rankings()

In [56]:
def generate_fp_text_from_dictionary(data):
    output = ""
    for position, players in data.items():
        output += f"{position.upper()}:\n"
        for player in players:
            output += f"Rank: {player['Rank']}\n"
            output += f"Player: {player['Player']}\n"
            for key, value in player.items():
                if key not in ['Rank', 'Player']:
                    output += f"{key}: {value}\n"
            output += "\n"
    return output

In [62]:
def generate_team_text_from_dictionary(data):
    output = ""
    for team, stats in data.items():
        output += f"{team.upper()}:\n"
        for key, value in stats.items():
            if isinstance(value, dict):
                output += f"{key.capitalize()}:\n"
                for player, player_stats in value.items():
                    output += f"{player}:\n"
                    for stat, stat_value in player_stats.items():
                        output += f"{stat}: {stat_value}\n"
                    output += "\n"
            else:
                output += f"{key}: {value}\n"
        output += "\n"
    return output

In [68]:
def process_consensus(data):
    output = ""
    for scoring_type, players in data.items():
        output += scoring_type + ":\n"
        output += "-" * 30 + "\n"
        output += "Rank  Player                 Team\n"
        output += "-" * 30 + "\n"
        for player in players:
            output += "{:<6} {:<22} {}\n".format(player['Rank'], player['Player'], player['Team'])
        output += "\n"
    return output

In [58]:
fp_text = generate_fp_text_from_dictionary(fantasy_pros)

In [63]:
team_text = generate_team_text_from_dictionary(nfl_team_stats)

In [69]:
consensus_text = process_consensus(consensus_rankings)

In [84]:
# # Save the string to a text file
# filename = "fp_text.txt"
# with open(filename, "w") as file:
#     file.write(fp_text)

# # Save the string to a text file
# filename = "team_text.txt"
# with open(filename, "w") as file:
#     file.write(team_text)

# # Save the string to a text file
# filename = "consensus_text.txt"
# with open(filename, "w") as file:
#     file.write(consensus_text)


In [86]:
# # Save the string to a text file
# filename = "fantasy_pros.json"
# data_string = json.dumps(fantasy_pros)
# with open(filename, "w") as file:
#     file.write(data_string)

# # Save the string to a text file
# data_string = json.dumps(nfl_team_stats)
# filename = "nfl_team_stats.json"
# with open(filename, "w") as file:
#     file.write(data_string)

# # Save the string to a text file
# data_string = json.dumps(consensus_rankings)
# filename = "consensus_rankings.json"
# with open(filename, "w") as file:
#     file.write(data_string)


In [2]:
import torch

# Load the pretrained model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [76]:
# Create a text dataset using the fantasy football data
fp = TextDataset(
    tokenizer=tokenizer,
    file_path=None,  # Pass None as we have the data in-memory
    text_string=fp_text,
    block_size=128  # Adjust the block size according to your data size
)

# Create a text dataset using the fantasy football data
consensus = TextDataset(
    tokenizer=tokenizer,
    file_path=None,  # Pass None as we have the data in-memory
    text_string=consensus_text,
    block_size=128  # Adjust the block size according to your data size
)

# Create a text dataset using the fantasy football data
team = TextDataset(
    tokenizer=tokenizer,
    file_path=None,  # Pass None as we have the data in-memory
    text_string=team_text,
    block_size=128  # Adjust the block size according to your data size
)

# combined dataset
combined_dataset = fp + consensus + team

# Define the data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Set to True if you want to apply masked language modeling
)

# Configure the training arguments
training_args = TrainingArguments(
    output_dir='./retrained_model',  # Specify the output directory
    overwrite_output_dir=True,  # Overwrite the content of the output directory
    num_train_epochs=5, 
    per_device_train_batch_size=4, 
    save_steps=500,  # Save the model checkpoints at every specified number of steps
    save_total_limit=2,  # Save a maximum of 2 checkpoints
)

# Initialize a new GPT-2 configuration
config = GPT2Config.from_pretrained(model_name)

# Instantiate a new GPT-2 model with the same configuration as the original model
model = GPT2LMHeadModel(config=config)

# Create a Trainer instance for training
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=combined_dataset
)

# Train the model
trainer.train()

# Save the retrained model
trainer.save_model('./retrained_model')


TypeError: TextDataset.__init__() got an unexpected keyword argument 'text_string'

In [83]:
from datasets import Dataset
Dataset.from_dict(nfl_team_stats)

ModuleNotFoundError: No module named 'datasets'