In [14]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW

Freezing Parameters to deal with computation issue

In [15]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

# Load the configuration, model, and tokenizer
configuration = GPT2Config.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Freeze parameters in the first few layers of the transformer
for i, (name, param) in enumerate(model.transformer.h.named_parameters()):
    # Example: Freeze the first 6 layers (you can adjust the range as needed)
    if "h." + str(i) in name:
        param.requires_grad = False

# Verify which layers are frozen
for name, param in model.named_parameters():
    if not param.requires_grad:
        print(f"Layer frozen: {name}")  # This will print the names of parameters that are frozen

# Continue with your training setup here



In [16]:
from torch.utils.data import Dataset, DataLoader
import torch
class GPT2Dataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for text in texts:
            encodings_dict = tokenizer('<|startoftext|>'+ text + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


In [17]:
df = pd.read_csv("./Data/Actual_Quarter.csv")
df

Unnamed: 0,GameID,Quarter,GameText,Label,CleanGameText
0,2015-10-27_CLE_CHI,1,695 N. Miroti misses 2-pt layup from 2 ft:694 ...,720 Jump ball: P. Gasol vs. T. Mozgov (D. Rose...,N. Miroti misses 2pt layup from ft. Defensiv...
1,2015-10-27_CLE_CHI,2,700 Defensive rebound by T. Gibson:685 D. Rose...,701 J. Cunningham misses 2-pt jump shot from 2...,Defensive rebound by T. Gibson. D. Rose makes...
2,2015-10-27_CLE_CHI,3,675 Turnover by D. Rose (bad pass; steal by L....,698 L. James makes 2-pt layup from 2 ft,Turnover by D. Rose bad pass. steal by L. Jam...
3,2015-10-27_CLE_CHI,4,695 Defensive rebound by T. Thompson:690 M. Wi...,696 E. Moore misses 2-pt jump shot from 16 ft,Defensive rebound by T. Thompson. M. Williams...
4,2015-10-27_DET_ATL,1,701 A. Drummond misses 2-pt layup from 1 ft (b...,720 Jump ball: A. Drummond vs. A. Horford (E. ...,A. Drummond misses 2pt layup from ft block by...
...,...,...,...,...,...
5353,2016-06-16_GSW_CLE,4,703 L. James makes free throw 2 of 2:703 L. Ja...,703 Shooting foul by H. Barnes (drawn by L. Ja...,L. James makes free throw of . L. James make...
5354,2016-06-19_CLE_GSW,1,699 J. Smith misses 2-pt jump shot from 5 ft:6...,720 Jump ball: F. Ezeli vs. T. Thompson (L. Ja...,J. Smith misses 2pt jump shot from ft. Defen...
5355,2016-06-19_CLE_GSW,2,686 R. Jefferson misses 2-pt jump shot from 5 ...,702 S. Livingston makes 2-pt jump shot from 16...,R. Jefferson misses 2pt jump shot from ft blo...
5356,2016-06-19_CLE_GSW,3,709 Defensive rebound by K. Love:698 J. Smith ...,709 F. Ezeli misses 2-pt jump shot from 3 ft,Defensive rebound by K. Love. J. Smith makes ...


Could add in a feature that Has all the player names in this game, could help with seeding

Prepare Training Data

In [18]:
from torch.utils.data import DataLoader, TensorDataset
import torch

# Ensure the tokenizer's pad_token_id is set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token

# Initialize lists to store tokenized inputs
input_ids = []
attention_masks = []
labels = []

# Specify the number of instances you want to process
num_instances = min(len(df), 500) 

# Process instances
for idx in range(num_instances):
    single_row = df.iloc[idx]
    input_text = single_row['CleanGameText']
    target_text = single_row['Label']

    # Tokenize input and target texts. Note: No need for return_tensors='pt' here since we are appending to a list
    input_tokens = tokenizer.encode(input_text, truncation=True, max_length=1024)
    target_tokens = tokenizer.encode(target_text, truncation=True, max_length=1024)
    
    # Combine input and target tokens
    combined_tokens = input_tokens + [tokenizer.eos_token_id] + target_tokens
    
    # Append combined tokens to input_ids list
    input_ids.append(torch.tensor(input_tokens))
    
    # Create an attention mask for this sequence
    attn_mask = [1] * len(input_ids)  # All tokens should be attended to
    attention_masks.append(torch.tensor(attn_mask))
    
    # Use the same combined tokens as labels but with a shift
    labels.append(torch.tensor(target_tokens))

# Padding sequences and creating tensor datasets
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)
labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)  # -100 will be ignored by loss function

# Move tensors to the appropriate device
device = torch.device("cpu")
input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

# Create DataLoader
train_data = TensorDataset(input_ids, attention_masks, labels)
train_dataloader = DataLoader(train_data, batch_size=2)  # Adjust the batch size as needed


This will take awhile

In [19]:
# Assuming input_text is your text data and target_text is the expected output for language modeling

# Tokenize both input and output texts, ensuring the output is correctly aligned with the input
input_encodings = tokenizer(input_text, truncation=True, padding='max_length', max_length=98, return_tensors="pt")
output_encodings = tokenizer(target_text, truncation=True, padding='max_length', max_length=98, return_tensors="pt")

# Prepare data
input_ids = input_encodings.input_ids
attention_masks = input_encodings.attention_mask
labels = output_encodings.input_ids  # If the output tokens are expected as labels

# Check shapes
print("Input IDs shape:", input_ids.shape)
print("Attention Masks shape:", attention_masks.shape)
print("Labels shape:", labels.shape)

# Assuming you're using these for creating a DataLoader
dataset = TensorDataset(input_ids, attention_masks, labels)
train_dataloader = DataLoader(dataset, batch_size=2)


Input IDs shape: torch.Size([1, 98])
Attention Masks shape: torch.Size([1, 98])
Labels shape: torch.Size([1, 98])


training Loop

In [33]:
from tqdm.auto import tqdm
import torch
from transformers import AdamW, GPT2Model, GPT2Tokenizer

# Assuming 'model' and 'tokenizer' are already initialized and loaded

# Set the model to training mode and ensure it's on the correct device
model.train()
device = torch.device("cpu")  # Change to 'cuda' if GPU is available
model.to(device)

# Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Number of training epochs
epochs = 3  # Adjust the number of epochs as necessary

# Create a progress bar for the epochs
progress_bar = tqdm(range(epochs), desc="Epochs", leave=False)

for epoch in progress_bar:
    for batch in train_dataloader:
        # Unpack the batch data
        input_ids, attention_masks, labels = batch
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)

        # Zero the gradients on the optimizer
        optimizer.zero_grad()

        # Forward pass: compute the model output
        outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss

        # Backward pass: compute the gradient of the loss w.r.t. the model parameters
        loss.backward()

        # Update the model weights
        optimizer.step()

        # Update the progress bar with the latest loss
        progress_bar.set_postfix(loss=loss.item(), refresh=True)

    # Optionally save the model after each epoch
    

# Output the completion of training
print("Training complete and model saved.")


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Training complete and model saved.


The kernel crashes when trying to do more than 100 ish rows

In [None]:
output_dir = "Early_Predictor"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

Model saved to Early_Predictor


In [23]:
model.eval()  # Ensure the model is in evaluation mode

# Choose a number of instances to test, for example, 5
num_test_instances = min(len(df), 5)

for idx in range(num_test_instances):
    test_row = df.iloc[idx]
    test_input_text = test_row['CleanGameText']
    actual_label_text = test_row['Label']  # The actual label (text) you want to predict

    # Tokenize test input
    test_input_tokens = tokenizer.encode(test_input_text, return_tensors='pt').to(device)

    # Assuming your model and tokenizer are correctly set up to handle single instances
    attention_mask = torch.ones(test_input_tokens.shape, dtype=torch.long, device=test_input_tokens.device)

    # Generate text using the model
    with torch.no_grad():
        test_output_tokens = model.generate(
            input_ids=test_input_tokens,
            attention_mask=attention_mask,  # Now explicitly provided
            max_length=test_input_tokens.shape[1] + 20,  # Adjust as necessary
            temperature=1.0,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
            do_sample=True  # Enable sampling for more varied output
        )

        # Decode generated tokens back to text
        generated_text = tokenizer.decode(test_output_tokens[0], skip_special_tokens=True)

        # Display the input, actual, and generated texts
        print(f"Instance {idx+1}")
        print(f"Input text: {test_input_text}")
        print(f"Generated text: {generated_text[len(tokenizer.decode(test_input_tokens[0], skip_special_tokens=True)):]}")  # Show generated continuation
        print(f"Actual text: {actual_label_text}")
        print("-" * 50)  # Separator for readability


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


IndexError: index out of range in self

In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
#tokenizer = GPT2Tokenizer.from_pretrained("./Simple_Quarter")

# Load the model
#model = GPT2LMHeadModel.from_pretrained("./Simple_Quarter")
#model.eval()