Fine-Tuning GPT-2 for Sentiment Analysis

In [None]:
import pandas as pd

# Location of the IMDB reviews CSV
imdb_csv_path = "/content/IMDB Dataset.csv"

# Read the reviews and their sentiment labels into a DataFrame
df = pd.read_csv(imdb_csv_path)

In [None]:
# Preview the first 200 rows; pandas abbreviates the middle of long printouts
first_rows = df.head(200)
print(first_rows)

                                                review sentiment
0    One of the other reviewers has mentioned that ...  positive
1    A wonderful little production. <br /><br />The...  positive
2    I thought this was a wonderful way to spend ti...  positive
3    Basically there's a family where a little boy ...  negative
4    Petter Mattei's "Love in the Time of Money" is...  positive
..                                                 ...       ...
195  Phantasm ....Class. Phantasm II.....awesome. P...  negative
196  Ludicrous. Angelic 9-year-old Annakin turns in...  negative
197  Scotty (Grant Cramer, who would go on to star ...  negative
198  If you keep rigid historical perspective out o...  positive
199  The film quickly gets to a major chase scene w...  negative

[200 rows x 2 columns]


In [None]:
# Report the (rows, columns) size of the dataset
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)

# Count how many reviews carry each sentiment value
sentiment_counts = df["sentiment"].value_counts()
print(sentiment_counts)


Dataset shape: (50000, 2)
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


Adding Neutral Reviews (not implemented yet; the cell below only resets the DataFrame index)

In [None]:
# Rebuild the index as a fresh 0..n-1 range; drop=True discards the old index
# instead of keeping it as an extra column.
df = df.reset_index(drop=True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews, then halve that hold-out into validation and
# test sets, giving an 80/10/10 train/validation/test split. Both calls use
# the same fixed random_state so the split is reproducible.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Report how many reviews landed in each split
for caption, split_frame in (
    ("Training set size:", train_df),
    ("Validation set size:", val_df),
    ("Test set size:", test_df),
):
    print(caption, len(split_frame))

Training set size: 40000
Validation set size: 5000
Test set size: 5000


In [None]:
# Show the first five rows of each split under its caption
for caption, split_frame in (
    ("Training set:", train_df),
    ("Validation set:", val_df),
    ("Test set:", test_df),
):
    print(caption)
    print(split_frame.head())

Training set:
                                                  review sentiment
39087  That's what I kept asking myself during the ma...  negative
30893  I did not watch the entire movie. I could not ...  negative
45278  A touching love story reminiscent of In the M...  positive
16398  This latter-day Fulci schlocker is a totally a...  negative
13653  First of all, I firmly believe that Norwegian ...  negative
Validation set:
                                                  review sentiment
47374  Hollywood has churned out yet another garbage ...  negative
48216  A trooper is on the side of the road making su...  negative
45929  If you like his show you might be a little dis...  positive
48715  A squashy slapstick mess posing as a comedy. E...  negative
18849  Lucio Fulci was famous for his Italian splatte...  negative
Test set:
                                                  review sentiment
25056  the tortuous emotional impact is degrading, wh...  negative
30334  Anyone who know

In [None]:
! pip install transformers



**1. Load the GPT-2 Tokenizer**

In [None]:
from transformers import GPT2Tokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded
tokenizer_checkpoint = "gpt2"

# Build the GPT-2 tokenizer from that checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_checkpoint)

# GPT-2 ships without a padding token, so reuse its end-of-sequence token
# whenever sequences have to be padded.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

**2. Tokenize the Datasets**

In [None]:
# Function to tokenize a batch of text
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of reviews into fixed-length PyTorch tensors.

    Args:
        examples: Mapping whose "review" entry holds the texts to tokenize.
        max_length: Number of tokens every sequence is padded or truncated
            to. The default of 512 keeps memory use down; GPT-2's context
            window itself is 1024 tokens, so larger values up to that limit
            are possible.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch, with
        every sequence padded/truncated to ``max_length`` tokens.
    """
    return tokenizer(
        examples["review"],  # Text to tokenize
        padding="max_length",  # Pad every sequence up to max_length
        truncation=True,  # Cut sequences longer than max_length
        max_length=max_length,  # Fixed sequence length
        return_tensors="pt",  # Return PyTorch tensors
    )

# Convert each split into a {column name: list of values} mapping, which is
# the input shape tokenize_function indexes with "review".
train_columns = train_df.to_dict("list")
val_columns = val_df.to_dict("list")
test_columns = test_df.to_dict("list")

# Tokenize the training, validation and test reviews
train_tokenized = tokenize_function(train_columns)
val_tokenized = tokenize_function(val_columns)
test_tokenized = tokenize_function(test_columns)

**3. Add Labels to Tokenized Data**

In [None]:
import torch

# Integer class id for every sentiment string. "neutral" is part of the
# mapping although the value counts printed earlier only show positive and
# negative reviews.
label_map = {"positive": 0, "negative": 1, "neutral": 2}

# Ensure the 'label' column exists in every DataFrame. Each split is checked
# on its own, so a split that is missing the column is never skipped just
# because another split already has it.
for split_name, split_df in (
    ("train", train_df),
    ("validation", val_df),
    ("test", test_df),
):
    if "label" not in split_df.columns:
        split_df["label"] = split_df["sentiment"].map(label_map)
    # A sentiment missing from label_map maps to NaN, which would silently
    # turn the label tensor into floats; fail loudly instead.
    if split_df["label"].isna().any():
        raise ValueError(
            f"{split_name} split contains sentiment values outside {sorted(label_map)}"
        )

# Add labels to the tokenized datasets (integer class ids)
train_tokenized["labels"] = torch.tensor(train_df["label"].tolist(), dtype=torch.long)
val_tokenized["labels"] = torch.tensor(val_df["label"].tolist(), dtype=torch.long)
test_tokenized["labels"] = torch.tensor(test_df["label"].tolist(), dtype=torch.long)

# Save the tokenized datasets
torch.save(train_tokenized, "train_tokenized.pt")
torch.save(val_tokenized, "val_tokenized.pt")
torch.save(test_tokenized, "test_tokenized.pt")

print("Tokenized datasets saved to train_tokenized.pt, val_tokenized.pt, and test_tokenized.pt")

Tokenized datasets saved to train_tokenized.pt, val_tokenized.pt, and test_tokenized.pt


In [None]:
# Write each tokenized split (inputs plus labels) to its own file
for encodings, out_path in (
    (train_tokenized, "train_tokenized.pt"),
    (val_tokenized, "val_tokenized.pt"),
    (test_tokenized, "test_tokenized.pt"),
):
    torch.save(encodings, out_path)

print("Tokenized datasets saved to train_tokenized.pt, val_tokenized.pt, and test_tokenized.pt")

Tokenized datasets saved to train_tokenized.pt, val_tokenized.pt, and test_tokenized.pt


In [None]:
# Load the tokenized datasets.
# The files were written by torch.save on the tokenizer outputs, so they hold
# pickled Python objects rather than bare tensors. weights_only=False is
# passed explicitly: without it torch.load warns, and PyTorch releases whose
# default is weights_only=True refuse to unpickle such objects.
# NOTE: only use weights_only=False on files you created yourself, since
# unpickling can execute arbitrary code.
train_data = torch.load("train_tokenized.pt", weights_only=False)
val_data = torch.load("val_tokenized.pt", weights_only=False)
test_data = torch.load("test_tokenized.pt", weights_only=False)

# Check the first sample in the training set
print("First sample in the training set:")
print(train_data["input_ids"][0])  # Input IDs
print(train_data["attention_mask"][0])  # Attention mask
print(train_data["labels"][0])  # Label

  train_data = torch.load("train_tokenized.pt")


First sample in the training set:
tensor([ 2504,   338,   644,   314,  4030,  4737,  3589,  1141,   262,   867,
        11418,    11, 14788,  7466,    11, 38372,   290,  2276, 43744,   326,
        29298,   378,   262,  9508,  2431,    13,   383, 17909,   635,  1302,
          510,   618,   345,   892,   286,   262,   530,    12, 19577,  3435,
           11,   508,   423,   523,  1310,  6795,   326,   340,   318,  9826,
         5340,   284,  1337,   644,  4325,   284,   606,    13,  1119,   389,
          655, 11234,  3194,  3075,    79,  7084,   329,   262,  3437,   284,
         8181,   465, 34641,  9056,   319,    11,   257,  7243,   326,   468,
          587,  1760,   881,  1365,   287,   584, 43972,  1111,   319,  3195,
          290,   262, 22041, 29847,  1671,  1220,  6927,  1671, 11037,    40,
         1276, 22127,    11,   314,  1101,   407,  1107,   530,   329, 43853,
         2089, 13289,  1141,   257,  2646,    11,   475,   340,  1276,   307,
          531,   326, 12760,  

  val_data = torch.load("val_tokenized.pt")
  test_data = torch.load("test_tokenized.pt")


# Step 3: Set Up the Fine-Tuning Environment

In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

**2. Load the Pre-trained GPT-2 Model**

In [None]:
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer

# Checkpoint shared by the classification model and its tokenizer
model_checkpoint = "gpt2"

# GPT-2 with a sequence-classification head sized for 3 sentiment labels
model = GPT2ForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

# Re-create the tokenizer and reuse the end-of-sequence token for padding
tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

# Mirror the tokenizer's padding token id into the model's configuration
pad_id = tokenizer.pad_token_id
model.config.pad_token_id = pad_id

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**3. Prepare the Data for Training**

In [None]:
import torch

# Load the tokenized datasets.
# The files hold pickled tokenizer outputs rather than bare tensors, so
# weights_only=False is passed explicitly: without it torch.load warns, and
# PyTorch releases whose default is weights_only=True refuse to unpickle them.
# NOTE: only use weights_only=False on files you created yourself, since
# unpickling can execute arbitrary code.
train_data = torch.load("train_tokenized.pt", weights_only=False)
val_data = torch.load("val_tokenized.pt", weights_only=False)
test_data = torch.load("test_tokenized.pt", weights_only=False)

# Convert the datasets into a format compatible with Hugging Face's Trainer
from datasets import Dataset

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
test_dataset = Dataset.from_dict(test_data)

  train_data = torch.load("train_tokenized.pt")
  val_data = torch.load("val_tokenized.pt")
  test_data = torch.load("test_tokenized.pt")


**4. Set Up Training Arguments**

In [None]:
from transformers import TrainingArguments

import inspect

# The keyword that sets the evaluation schedule is `evaluation_strategy` in
# older transformers releases and `eval_strategy` in newer ones. Because the
# install above is unpinned, pick whichever name this version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save the model and logs
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.01,  # Weight decay for regularization
    save_strategy="epoch",  # Save the model after each epoch
    logging_dir="./logs",  # Directory for logs
    logging_steps=10,  # Log every 10 steps
    load_best_model_at_end=True,  # Load the best model at the end of training
    **{eval_strategy_key: "epoch"},  # Evaluate after each epoch
)



**5. Define the Trainer**

In [None]:
from transformers import TrainingArguments, Trainer # Import the Trainer class

import inspect

# The keyword that sets the evaluation schedule is `evaluation_strategy` in
# older transformers releases and `eval_strategy` in newer ones. Because the
# install above is unpinned, pick whichever name this version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save the model and logs
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.01,  # Weight decay for regularization
    save_strategy="epoch",  # Save the model after each epoch
    logging_dir="./logs",  # Directory for logs
    logging_steps=10,  # Log every 10 steps
    load_best_model_at_end=True,  # Load the best model at the end of training
    report_to="none",  # Disable W&B logging
    **{eval_strategy_key: "epoch"},  # Evaluate after each epoch
)

# Bundle the model, the arguments and the train/validation datasets
trainer = Trainer(
    model=model,  # The pre-trained model
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=val_dataset,  # Evaluation dataset
)



**6. Fine-Tune the Model**

In [None]:
# Fine-tune the model: run the training loop of the Trainer configured above.
# The progress table printed below reports the training and validation loss
# for each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4166,0.259896
