In [None]:
# Install dependencies (for Colab)
!pip install transformers
!pip install datasets

In [1]:
# Import dependencies
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from transformers import pipeline

2023-02-03 20:33:25.307423: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Check if a GPU is available: the cell displays True when torch can use CUDA
# and False otherwise
torch.cuda.is_available()

False

In [3]:
# Read the player bios from the CSV in the local data/ folder into a DataFrame
local_csv_path = 'data/player_bios.csv'
df = pd.read_csv(local_csv_path)

In [None]:
# Colab variant of the load step: read the same CSV straight from the GitHub raw URL
remote_csv_url = 'https://raw.githubusercontent.com/tbryan2/nfl-prospects-nlp/main/data/player_bios.csv'
df = pd.read_csv(remote_csv_url)

In [4]:
# Bucket each NFL.com grade into a named tier and store it in a new
# 'Player Category' column. pd.cut builds right-closed intervals by default, so a
# grade equal to an edge lands in the tier that ends at that edge
# (e.g. 7.10 -> 'Pro Bowl Talent').
grade_bin_edges = [0, 5.59, 5.69, 5.99, 6.09, 6.19,
                   6.29, 6.39, 6.49, 6.6, 6.9, 7.1, 7.5, 8]

# One label per interval, ordered from the lowest tier to the highest
tier_labels = [
    'Priority Undrafted Free Agent',
    'Candidate for Bottom of Roster or Practice Squad',
    'Average Backup or Special Teamer',
    'Traits or Talents to be Above-Average Backup',
    'Good Backup with Potential to Develop into Starter',
    'Will Eventually be Average Starter',
    'Will Eventually be Plus Starter',
    'Will become good starter within two years',
    'Boom or Bust Potential',
    'Year One Starter',
    'Pro Bowl Talent',
    'Perennial All-Pro',
    'The Perfect Prospect',
]

df['Player Category'] = pd.cut(
    df['Player Grades'],
    bins=grade_bin_edges,
    labels=tier_labels,
)


In [5]:
# Display the DataFrame, which now carries the 'Player Category' column
df

Unnamed: 0,Player,Player Bio,Player Grades,Link,Player Category
0,Jadeveon Clowney,A physical specimen with a rare size-speed com...,7.50,https://www.nfl.com/prospects/jadeveon-clowney...,Perennial All-Pro
1,Sammy Watkins,A legitimate No. 1-caliber receiver who steppe...,7.10,https://www.nfl.com/prospects/sammy-watkins/32...,Pro Bowl Talent
2,Anthony Barr,"A highly disruptive, athletic specimen with th...",7.00,https://www.nfl.com/prospects/anthony-barr/320...,Pro Bowl Talent
3,Khalil Mack,A havoc-wreaking rush linebacker with the burs...,7.00,https://www.nfl.com/prospects/khalil-mack/3200...,Pro Bowl Talent
4,Jake Matthews,"Smart, tough, versatile franchise left tackle ...",7.00,https://www.nfl.com/prospects/jake-matthews/32...,Pro Bowl Talent
...,...,...,...,...,...
3964,Jeremy Webb,Webb is a Florida product but helped Stevenson...,5.50,https://www.nfl.com/prospects/jeremy-webb/3200...,Priority Undrafted Free Agent
3965,Russ Yeast,"Russ is the son of Craig Yeast, who was the SE...",5.50,https://www.nfl.com/prospects/russ-yeast/32005...,Priority Undrafted Free Agent
3966,Ken Marks,"Marks was known as ""Grandpa"" on the field as a...",5.49,https://www.nfl.com/prospects/ken-marks/32004d...,Priority Undrafted Free Agent
3967,Devin Wynn,Wynn was coached by former Georgia and NFL run...,5.46,https://www.nfl.com/prospects/devin-wynn/32005...,Priority Undrafted Free Agent


In [6]:
# Keep only what fine-tuning needs: the text input ('Player Bio') and the
# target ('Player Category')
fine_tune_columns = ['Player Bio', 'Player Category']
df_fine_tune = df[fine_tune_columns]

In [7]:
# Load the tokenizer published with the facebook/bart-large-mnli checkpoint
from transformers import AutoTokenizer

tokenizer_checkpoint = "facebook/bart-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)


Downloading: 100%|██████████| 26.0/26.0 [00:00<00:00, 3.37kB/s]
Downloading: 100%|██████████| 1.15k/1.15k [00:00<00:00, 430kB/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 5.45MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 3.39MB/s]
Downloading: 100%|██████████| 1.36M/1.36M [00:00<00:00, 7.65MB/s]


In [8]:
# 80/20 split: sample 80% of the rows for training (seeded so the split is
# repeatable) and keep every row whose index label was not sampled as the test set
train = df_fine_tune.sample(frac=0.8, random_state=0)
is_train_row = df_fine_tune.index.isin(train.index)
test = df_fine_tune[~is_train_row]

In [9]:
# Tokenization helper for the player bios
def preprocess_function(examples):
    """Tokenize the "Player Bio" entry of `examples` with the notebook's tokenizer.

    Args:
        examples: any object indexable with the key "Player Bio"; in this
            notebook it is called with one DataFrame row at a time.

    Returns:
        Whatever the tokenizer returns for that text, called with
        truncation=True.
    """
    bio_text = examples["Player Bio"]
    return tokenizer(bio_text, truncation=True)


# Tokenize every row of the train and test splits (axis=1 hands the function one
# row at a time), then move the original row labels into a regular 'index' column
train_tokenized = train.apply(preprocess_function, axis=1)
train_encodings = train_tokenized.reset_index()

test_tokenized = test.apply(preprocess_function, axis=1)
test_encodings = test_tokenized.reset_index()

In [10]:
# Build the batch collator used during training. Per the transformers docs,
# DataCollatorWithPadding pads the tokenized samples of each batch dynamically
# (using the given tokenizer) instead of padding the whole dataset up front.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
# Import a base zero shot classification model to fine tune.
# facebook/bart-large-mnli ships with a 3-class NLI classification head, while the
# target here is the set of player categories, so the head is sized to those.
from transformers import AutoModelForSequenceClassification

# Category names in the order of the categorical dtype; a category's position in
# this list is the integer class id (it matches `.cat.codes`).
category_names = list(df_fine_tune['Player Category'].cat.categories)
id2label = dict(enumerate(category_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli",
    num_labels=len(category_names),
    id2label=id2label,
    label2id=label2id,
    # The checkpoint's 3-class output layer does not fit the new head; let
    # transformers re-initialise it instead of raising a size-mismatch error.
    ignore_mismatched_sizes=True,
)

Downloading: 100%|██████████| 1.63G/1.63G [00:34<00:00, 47.5MB/s]


: 

: 

In [None]:
import numpy as np
from datasets import load_metric


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a batch of evaluation predictions.

    Args:
        eval_pred: a pair ``(predictions, labels)``. ``predictions`` holds the
            class scores (logits) with classes on the last axis, or a tuple
            whose first element is those logits; ``labels`` holds the integer
            class ids.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    # Encoder-decoder classifiers such as BART can hand back a tuple of outputs
    # (logits first, then hidden states); only the logits are class scores.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    accuracy = load_accuracy.compute(
        predictions=predictions, references=labels)["accuracy"]
    # The f1 metric defaults to average="binary", which is rejected for
    # multi-class targets; "weighted" averages per-class F1 by class support.
    f1 = load_f1.compute(
        predictions=predictions, references=labels, average="weighted")["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [None]:
# Log into Hugging Face from inside the notebook.
# NOTE(review): presumably required because the training arguments in this
# notebook set push_to_hub=True - confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# See if there are any values missing in the index column:
# isnull().sum() reports the number of null entries for each column of train_encodings
train_encodings.isnull().sum()

In [None]:
from datasets import Dataset
from transformers import TrainingArguments, Trainer

repo_name = "nfl-prospects"


def _to_labelled_dataset(frame):
    """Turn a ('Player Bio', 'Player Category') DataFrame into a tokenized Dataset.

    Trainer needs an integer-indexable dataset whose items carry both the
    tokenizer outputs and an integer label; a pandas DataFrame of per-row
    tokenizer outputs provides neither.

    Args:
        frame: DataFrame with a text column 'Player Bio' and a categorical
            column 'Player Category'.

    Returns:
        datasets.Dataset with the tokenizer outputs plus a 'label' column
        holding the integer code of each row's category.
    """
    # Rows without a bio or without a category (e.g. a grade outside the bins)
    # cannot be used for supervised training.
    labelled = frame.dropna(subset=["Player Bio", "Player Category"])
    dataset = Dataset.from_dict({
        "Player Bio": labelled["Player Bio"].astype(str).tolist(),
        # Position of the category within the categorical's ordering
        "label": labelled["Player Category"].cat.codes.tolist(),
    })
    # Tokenize in batches and drop the raw text so that only model inputs and
    # labels reach the data collator.
    return dataset.map(
        preprocess_function, batched=True, remove_columns=["Player Bio"])


# Fail early with a clear message if the classifier head does not match the
# number of categories, rather than erroring somewhere inside the loss.
num_categories = len(train["Player Category"].cat.categories)
if model.config.num_labels != num_categories:
    raise ValueError(
        f"model has {model.config.num_labels} output labels but the data has "
        f"{num_categories} player categories; load the model with num_labels "
        "set to the number of categories and ignore_mismatched_sizes=True"
    )

train_dataset = _to_labelled_dataset(train)
eval_dataset = _to_labelled_dataset(test)

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured in this notebook
trainer.train()