In [None]:
# Install dependencies (for Colab)
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import dependencies
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from transformers import pipeline
import tensorflow as tf
import numpy as np
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [None]:
# Check if a GPU is available to TensorFlow.
# The model trained below is a TensorFlow/Keras model, so TensorFlow's own
# device list (not PyTorch's CUDA probe) decides whether training uses the GPU.
len(tf.config.list_physical_devices('GPU')) > 0

True

In [None]:
# Load the DataFrame from the local CSV. If the file is not present
# (e.g. on a fresh Colab runtime), fall back to the copy hosted on GitHub
# so this cell does not abort a Restart & Run All.
try:
    df = pd.read_csv('data/player_bios.csv')
except FileNotFoundError:
    df = pd.read_csv('https://raw.githubusercontent.com/tbryan2/nfl-prospects-nlp/main/data/player_bios.csv')

In [None]:
# Load data (for Colab): read player_bios.csv directly from the GitHub repository
player_bios_url = 'https://raw.githubusercontent.com/tbryan2/nfl-prospects-nlp/main/data/player_bios.csv'
df = pd.read_csv(player_bios_url)

In [None]:
# Display the loaded DataFrame (long frames are rendered truncated).
df

Unnamed: 0,Player,Player Bio,Player Grades,Link,Player Category
0,Jadeveon Clowney,A physical specimen with a rare size-speed com...,7.50,https://www.nfl.com/prospects/jadeveon-clowney...,Perennial All-Pro
1,Sammy Watkins,A legitimate No. 1-caliber receiver who steppe...,7.10,https://www.nfl.com/prospects/sammy-watkins/32...,Pro Bowl Talent
2,Anthony Barr,"A highly disruptive, athletic specimen with th...",7.00,https://www.nfl.com/prospects/anthony-barr/320...,Pro Bowl Talent
3,Khalil Mack,A havoc-wreaking rush linebacker with the burs...,7.00,https://www.nfl.com/prospects/khalil-mack/3200...,Pro Bowl Talent
4,Jake Matthews,"Smart, tough, versatile franchise left tackle ...",7.00,https://www.nfl.com/prospects/jake-matthews/32...,Pro Bowl Talent
...,...,...,...,...,...
3964,Jeremy Webb,Webb is a Florida product but helped Stevenson...,5.50,https://www.nfl.com/prospects/jeremy-webb/3200...,Priority Undrafted Free Agent
3965,Russ Yeast,"Russ is the son of Craig Yeast, who was the SE...",5.50,https://www.nfl.com/prospects/russ-yeast/32005...,Priority Undrafted Free Agent
3966,Ken Marks,"Marks was known as ""Grandpa"" on the field as a...",5.49,https://www.nfl.com/prospects/ken-marks/32004d...,Priority Undrafted Free Agent
3967,Devin Wynn,Wynn was coached by former Georgia and NFL run...,5.46,https://www.nfl.com/prospects/devin-wynn/32005...,Priority Undrafted Free Agent


In [None]:
# Create a new DataFrame to use for fine-tuning the model with just the
# Player Bio and Player Category columns.
# `.copy()` makes it an independent frame, so the column assignment below does
# not raise pandas' SettingWithCopyWarning (and is safe to re-run).
df_fine_tune = df[['Player Bio', 'Player Category']].copy()

# Convert the Player Category column to integer codes.
player_categories = df_fine_tune['Player Category'].astype('category')
# Keep the code -> name mapping: category_names[i] is the label for code i.
category_names = list(player_categories.cat.categories)
df_fine_tune['Player Category'] = player_categories.cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fine_tune['Player Category'] = df_fine_tune['Player Category'].astype('category').cat.codes


In [None]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it here and
# the same name is reused when the model is instantiated.
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
)

In [None]:
# Split the dataset into train and test sets: draw a reproducible 80% sample
# for training, then keep every row whose index label was not sampled.
train = df_fine_tune.sample(frac=0.8, random_state=0)
in_train = df_fine_tune.index.isin(train.index)
test = df_fine_tune[~in_train]

In [None]:
def encode_bios(bios):
    """Tokenize a Series of bios into padded, truncated TensorFlow tensors."""
    return tokenizer(
        bios.to_list(),
        truncation=True,
        padding=True,
        return_tensors='tf',
    )


# Preprocess the text in the train and test sets
train_encodings = encode_bios(train['Player Bio'])
test_encodings = encode_bios(test['Player Bio'])

# Define the training and testing labels
train_labels, test_labels = (
    split['Player Category'].to_list() for split in (train, test)
)

In [None]:
# Pair each split's tokenizer output (as a plain dict of tensors) with its
# labels and slice along the first axis, giving one (features, label) element
# per bio.
train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

test_features = dict(test_encodings)
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))

In [None]:
# Load the pretrained checkpoint with a newly initialised
# sequence-classification head to fine-tune.
# The head needs one output per category code. The codes run from 0 to the
# maximum code, so derive the count from the data instead of hard-coding it.
num_labels = int(df_fine_tune['Player Category'].max()) + 1
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
# Configure training: Adam with a small fine-tuning learning rate, and a
# sparse categorical cross-entropy loss applied to raw logits with integer
# labels.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
# Fine-tune for 10 epochs.
# - The datasets are batched here, so `batch_size` is not passed to fit()
#   (Keras expects it to be omitted for tf.data inputs).
# - The shuffle buffer spans the whole training set so each epoch sees a
#   uniformly shuffled order; a small buffer only shuffles locally.
# - The validation data is not shuffled: order does not affect the metrics.
# The History object is kept so the per-epoch metrics can be inspected later.
history = model.fit(
    train_dataset.shuffle(len(train_labels)).batch(16),
    epochs=10,
    validation_data=test_dataset.batch(16),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6b1438d970>

In [None]:
# Persist the fine-tuned model.
# NOTE(review): this is a hard-coded absolute path under /usr; confirm the
# runtime can write there before relying on it outside Colab.
model_dir = "/usr/prospect_category_model"
model.save_pretrained(model_dir)