In [35]:
from transformers import GPT2Model, GPT2Config, GPT2Tokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Assuming new_df_with_game_data is the DataFrame created using your consolidate_game_data function

class GameDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Preprocess text data


In [36]:
# Set the pad token to the eos_token if it's not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Now your tokenizer call should work without errors


In [37]:
new_df_with_game_data = pd.read_csv("./Data/Winners.csv")
texts = new_df_with_game_data['GameText'].tolist()
labels = new_df_with_game_data['WinningTeam']


In [38]:
df = new_df_with_game_data

In [None]:
encodings = tokenizer(texts, truncation=True, padding=True, max_length=1024)

In [25]:
new_df_with_game_data

Unnamed: 0,GameID,GameText,WinningTeam
0,2015-10-27_CLE_CHI,720 Jump ball: P. Gasol vs. T. Mozgov (D. Rose...,CHI
1,2015-10-27_DET_ATL,720 Jump ball: A. Drummond vs. A. Horford (E. ...,DET
2,2015-10-27_NOP_GSW,720 Jump ball: A. Bogut vs. A. Davis (N. Robin...,GSW
3,2015-10-28_CHI_BRK,720 Jump ball: P. Gasol vs. B. Lopez (T. Young...,CHI
4,2015-10-28_CHO_MIA,720 Jump ball: A. Jefferson vs. H. Whiteside (...,MIA
...,...,...,...
1311,2016-06-08_GSW_CLE,720 Jump ball: A. Bogut vs. T. Thompson (K. Th...,CLE
1312,2016-06-10_GSW_CLE,720 Jump ball: A. Bogut vs. T. Thompson (S. Cu...,GSW
1313,2016-06-13_CLE_GSW,720 Jump ball: A. Bogut vs. T. Thompson (S. Cu...,CLE
1314,2016-06-16_GSW_CLE,720 Jump ball: D. Green vs. T. Thompson (K. Lo...,CLE


In [34]:
len(new_df_with_game_data.iloc[0].GameText)


25801

In [19]:
labels

0       CHI
1       DET
2       GSW
3       CHI
4       MIA
       ... 
1311    CLE
1312    GSW
1313    CLE
1314    CLE
1315    CLE
Name: WinningTeam, Length: 1316, dtype: object

2

In [15]:
# Check the lengths of the input variables
print(f"Length of texts: {len(texts)}")
print(f"Length of labels: {len(labels)}")
print(f"Length of encodings: {len(encodings.input_ids)}")

# Ensure that the texts, labels, and encodings have the same number of samples
min_samples = min(len(texts), len(labels), len(encodings.input_ids))
texts = texts[:min_samples]
labels = labels[:min_samples]
encodings.input_ids = encodings.input_ids[:min_samples]
encodings.attention_mask = encodings.attention_mask[:min_samples]

# Split data into train and validation sets
train_encodings, val_encodings, train_labels, val_labels = train_test_split(
    encodings, labels, test_size=0.1, random_state=42)

# Create Dataset and DataLoader for training and validation
train_dataset = GameDataset(train_encodings, train_labels)
val_dataset = GameDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Load GPT-2 model and set up training
configuration = GPT2Config.from_pretrained('gpt2', n_class=len(label_encoder.classes_))
model = GPT2Model(configuration)
model.classifier = torch.nn.Linear(configuration.n_embd, configuration.n_class)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training loop would go here (not provided in this snippet)
# ...

Length of texts: 1316
Length of labels: 2


AttributeError: 'dict' object has no attribute 'input_ids'

In [None]:
model.save_pretrained('path/to/save/model')
tokenizer.save_pretrained('path/to/save/tokenizer')

In [41]:
df

Unnamed: 0,GameID,GameText,WinningTeam
0,2015-10-27_CLE_CHI,720 Jump ball: P. Gasol vs. T. Mozgov (D. Rose...,CHI
1,2015-10-27_DET_ATL,720 Jump ball: A. Drummond vs. A. Horford (E. ...,DET
2,2015-10-27_NOP_GSW,720 Jump ball: A. Bogut vs. A. Davis (N. Robin...,GSW
3,2015-10-28_CHI_BRK,720 Jump ball: P. Gasol vs. B. Lopez (T. Young...,CHI
4,2015-10-28_CHO_MIA,720 Jump ball: A. Jefferson vs. H. Whiteside (...,MIA
...,...,...,...
1311,2016-06-08_GSW_CLE,720 Jump ball: A. Bogut vs. T. Thompson (K. Th...,CLE
1312,2016-06-10_GSW_CLE,720 Jump ball: A. Bogut vs. T. Thompson (S. Cu...,GSW
1313,2016-06-13_CLE_GSW,720 Jump ball: A. Bogut vs. T. Thompson (S. Cu...,CLE
1314,2016-06-16_GSW_CLE,720 Jump ball: D. Green vs. T. Thompson (K. Lo...,CLE


Trying Long Transformer now

In [44]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load tokenizer and model
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

# Assuming df is your DataFrame and has the columns 'text' for inputs and 'label' for outputs
texts = df['GameText'].tolist()
labels = LabelEncoder().fit_transform(df['WinningTeam'])  # Encode labels numerically

# Set the number of unique labels for the classifier
num_labels = len(set(labels))

# Tokenize text with a higher max_length
encodings = tokenizer(texts, max_length=4096, truncation=True, padding="max_length", return_tensors="pt")

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(encodings['input_ids'], labels, test_size=0.1)

# Create PyTorch dataset
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {'input_ids': self.encodings[idx], 'labels': self.labels[idx]}
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

# Load Longformer model configured for sequence classification with the correct number of labels
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096', num_labels=num_labels)

# Prepare for training
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

# Set up device
device = torch.device("cpu")
model.to(device)


# Save the model



pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LongformerForSequenceClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
          

In [45]:
# Creating subset datasets for just the first 10 instances for quick testing
subset_train_texts = train_texts[:10]  # First 10 instances
subset_train_labels = train_labels[:10]

# Create dataset and DataLoader for the subset
subset_train_dataset = TextDataset(subset_train_texts, subset_train_labels)
subset_train_loader = DataLoader(subset_train_dataset, batch_size=4, shuffle=True)

In [47]:
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

# Example of a mini-training loop for the subset
for batch in subset_train_loader:
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids=input_ids, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Loss: {loss.item()}")

# Saving the model if needed


Loss: 3.356382369995117
Loss: 3.440417528152466
Loss: 3.570096492767334


In [49]:
from sklearn.preprocessing import LabelEncoder

# Assuming 'df' contains your dataset and 'WinningTeam' is the column with labels
labels = df['WinningTeam']  # Extract labels from the DataFrame

# Create a label encoder object
label_encoder = LabelEncoder()

# Fit label encoder and transform labels into numbers
encoded_labels = label_encoder.fit_transform(labels)

In [53]:
def predict_winner(text, model, tokenizer, label_encoder):
    # Ensure model is in evaluation mode
    model.eval()

    # Tokenize the text
    encoding = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=4096)

    # Move the tensor to the same device as the model
    input_ids = encoding['input_ids'].to(model.device)
    attention_mask = encoding['attention_mask'].to(model.device)

    # Predict
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Convert logits to probabilities (optional)
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Get the predicted class index
    predicted_index = torch.argmax(probabilities, dim=-1).item()

    # Decode the predicted index to the class label
    predicted_label = label_encoder.inverse_transform([predicted_index])[0]

    return predicted_label, probabilities[0, predicted_index].item()

# Example usage
model.to("cpu")  # Ensure the model is on CPU if not using GPU for prediction
single_test_example = df['GameText'].iloc[-4]
predicted_winner, confidence = predict_winner(single_test_example, model, tokenizer, label_encoder)

print(f"Predicted Winner: {predicted_winner}")
print(f"Prediction Confidence: {confidence:.4f}")


Predicted Winner: HOU
Prediction Confidence: 0.0442


In [None]:
# Training loop



epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Training loss for epoch {epoch + 1}: {avg_train_loss}")

    # Validation phase
    model.eval()
    total_eval_loss = 0
    total_eval_accuracy = 0
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, labels=labels)

        loss = outputs.loss
        total_eval_loss += loss.item()

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total_eval_accuracy += (predictions == labels).float().mean()

    avg_val_loss = total_eval_loss / len(val_loader)
    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f"Validation Loss: {avg_val_loss}")
    print(f"Validation Accuracy: {avg_val_accuracy}")
