# Fine-Tuning Transformer Models (OpenAI, Grok and Gemini) for Sentiment Analysis on Twitter Airline Data

# Part 1: Imports and Initial Setup

In [2]:
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer, 
    TrainingArguments,
)
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Set PyTorch memory optimization
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Force clear GPU memory at the start
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


# Part 2: Dataset Download and Preprocessing

In [3]:
# Download dataset from Kaggle
os.system("kaggle datasets download -d crowdflower/twitter-airline-sentiment")
os.system("unzip -o twitter-airline-sentiment.zip -d ./airline_data")

# Load and preprocess dataset
df = pd.read_csv("./airline_data/Tweets.csv")

# Keep only positive and negative sentiments
df = df[df["airline_sentiment"].isin(["positive", "negative"])]

# Map labels to binary values
df["label"] = df["airline_sentiment"].map({"positive": 1, "negative": 0})

# Keep only required columns
df = df[["text", "label"]].dropna()

# Sample a subset (1000 rows)
df = df.sample(n=1000, random_state=42)

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Split into train and eval
train_size = int(0.8 * len(dataset))
train_dataset = dataset.select(range(train_size))
eval_dataset = dataset.select(range(train_size, len(dataset)))


# Part 3: Model Selection & Tokenization Setup

In [4]:
# Define models to fine-tune
models = {
    "Gemini (FLAN-T5)": "google/flan-t5-small",
    "OpenAI (Stand-in)": "distilgpt2",
    "Grok (Stand-in)": "EleutherAI/gpt-neo-125m",
}

# Tokenization function
def tokenize_function(examples, tokenizer):
    return tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=128
    )

# Evaluation metrics
def compute_metrics(pred):
    logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    labels = pred.label_ids
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


# Part 4: Fine-Tuning Loop for Each Model

In [8]:
# Fine-tune each model
for model_name, model_path in models.items():
    print(f"\nFine-tuning {model_name} ({model_path})...")

    if torch.cuda.is_available():
        print(f"GPU Memory - Allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GiB, "
              f"Reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GiB")

    try:
        # Load model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            num_labels=2,
            pad_token_id=tokenizer.pad_token_id,
            trust_remote_code=True
        )

        if torch.cuda.is_available():
            model = model.cuda()

        # Tokenize the datasets
        tokenized_train = train_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
        tokenized_eval = eval_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
        tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
        tokenized_eval.set_format("torch", columns=["input_ids", "attention_mask", "label"])

        # Training settings
        training_args = TrainingArguments(
            output_dir=f"./results/{model_name.replace(' ', '_')}",
            num_train_epochs=2,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_steps=20,
            weight_decay=0.01,
            logging_dir="./logs",
            logging_steps=10,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            learning_rate=2e-5,
            fp16=True,
            report_to="none"
        )

        # Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_eval,
            compute_metrics=compute_metrics,
        )

        # Training
        trainer.train()

        # Evaluation
        eval_results = trainer.evaluate()
        print(f"Evaluation results for {model_name}: {eval_results}")

    except Exception as e:
        print(f"Error occurred while fine-tuning {model_name}: {e}")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    finally:
        # Clean up memory
        if 'model' in locals():
            del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            print(f"Memory cleared - Allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GiB, "
                  f"Reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GiB")



Fine-tuning Gemini (FLAN-T5) (google/flan-t5-small)...
GPU Memory - Allocated: 0.00 GiB, Reserved: 0.00 GiB


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7416,0.656959,0.72,0.243243,0.225,0.264706
2,0.5694,0.497894,0.83,0.0,0.0,0.0


There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight', 'transformer.decoder.embed_tokens.weight'].


Evaluation results for Gemini (FLAN-T5): {'eval_loss': 0.4978941082954407, 'eval_accuracy': 0.83, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 3.5643, 'eval_samples_per_second': 56.113, 'eval_steps_per_second': 14.028, 'epoch': 2.0}
Memory cleared - Allocated: 0.75 GiB, Reserved: 0.80 GiB

Fine-tuning OpenAI (Stand-in) (distilgpt2)...
GPU Memory - Allocated: 0.75 GiB, Reserved: 0.80 GiB


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4172,0.331882,0.87,0.409091,0.9,0.264706
2,0.274,0.257175,0.895,0.655738,0.740741,0.588235


Evaluation results for OpenAI (Stand-in): {'eval_loss': 0.25717493891716003, 'eval_accuracy': 0.895, 'eval_f1': 0.6557377049180328, 'eval_precision': 0.7407407407407407, 'eval_recall': 0.5882352941176471, 'eval_runtime': 2.2377, 'eval_samples_per_second': 89.377, 'eval_steps_per_second': 22.344, 'epoch': 2.0}
Memory cleared - Allocated: 1.01 GiB, Reserved: 1.06 GiB

Fine-tuning Grok (Stand-in) (EleutherAI/gpt-neo-125m)...
GPU Memory - Allocated: 1.01 GiB, Reserved: 1.06 GiB


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3956,0.347843,0.87,0.48,0.75,0.352941
2,0.2366,0.287624,0.88,0.571429,0.727273,0.470588


Evaluation results for Grok (Stand-in): {'eval_loss': 0.2876235246658325, 'eval_accuracy': 0.88, 'eval_f1': 0.5714285714285714, 'eval_precision': 0.7272727272727273, 'eval_recall': 0.47058823529411764, 'eval_runtime': 3.886, 'eval_samples_per_second': 51.466, 'eval_steps_per_second': 12.867, 'epoch': 2.0}
Memory cleared - Allocated: 1.57 GiB, Reserved: 1.70 GiB


# Part 5: Save and Run Inference

In [19]:
for model_name, model_path in models.items():
 print(f"\nFine-tuning {model_name} ({model_path})...")

 if torch.cuda.is_available():
  print(f"GPU Memory - Allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GiB, "
        f"Reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GiB")

 try:
  # Load model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
  if tokenizer.pad_token is None:
   tokenizer.pad_token = tokenizer.eos_token
  model = AutoModelForSequenceClassification.from_pretrained(
   model_path,
   num_labels=2,
   pad_token_id=tokenizer.pad_token_id,
   trust_remote_code=True
  )

  if torch.cuda.is_available():
   model = model.cuda()

  # Tokenize the datasets
  tokenized_train = train_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
  tokenized_eval = eval_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
  tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
  tokenized_eval.set_format("torch", columns=["input_ids", "attention_mask", "label"])

  # Training settings
  training_args = TrainingArguments(
   output_dir="./results",
   num_train_epochs=2,
   per_device_train_batch_size=2,
   per_device_eval_batch_size=2,
   gradient_accumulation_steps=4,
   warmup_steps=20,
   weight_decay=0.01,
   logging_dir="./logs",
   logging_steps=10,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
   learning_rate=2e-5,
   fp16=True,
   report_to="none"
  )

  # Trainer
  trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_eval,
   compute_metrics=compute_metrics,
  )

  # Train
  trainer.train()

  # Evaluate
  eval_results = trainer.evaluate()
  print(f"Evaluation results for {model_name}: {eval_results}")

  # Save fine-tuned model and tokenizer
  save_dir = f"./fine_tuned_{model_name.lower().replace(' ', '_')}"
  model.save_pretrained(save_dir)
  tokenizer.save_pretrained(save_dir)

  # Inference function
  def predict(text, model, tokenizer):
   inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
   inputs = {k: v.to(model.device) for k, v in inputs.items()}
   with torch.no_grad():
    outputs = model(**inputs)
   logits = outputs.logits
   prediction = torch.argmax(logits, dim=-1).item()
   return "Positive" if prediction == 1 else "Negative"

  # Sample prediction
  sample_text = "Great flight with excellent service!"
  print(f"Prediction for '{sample_text}' with {model_name}: {predict(sample_text, model, tokenizer)}")

 except torch.cuda.OutOfMemoryError:
  print(f"Out of memory error for {model_name}. Skipping this model.")
  if torch.cuda.is_available():
   torch.cuda.empty_cache()
  continue

 except ValueError as e:
  print(f"ValueError for {model_name}: {e}. Skipping this model.")
  if torch.cuda.is_available():
   torch.cuda.empty_cache()
  continue

 finally:
  # Clear GPU memory after each model
  if 'model' in locals():
   del model
  if torch.cuda.is_available():
   torch.cuda.empty_cache()
   print(f"Memory cleared - Allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GiB, "
         f"Reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GiB")



Fine-tuning Gemini (FLAN-T5) (google/flan-t5-small)...
GPU Memory - Allocated: 1.57 GiB, Reserved: 1.70 GiB


Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5228,0.435395,0.83,0.0,0.0,0.0
2,0.4493,0.404598,0.83,0.0,0.0,0.0


There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight', 'transformer.decoder.embed_tokens.weight'].


Evaluation results for Gemini (FLAN-T5): {'eval_loss': 0.40459808707237244, 'eval_accuracy': 0.83, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 3.9721, 'eval_samples_per_second': 50.351, 'eval_steps_per_second': 12.588, 'epoch': 2.0}


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Prediction for 'Great flight with excellent service!' with Gemini (FLAN-T5): Negative
Memory cleared - Allocated: 0.75 GiB, Reserved: 0.88 GiB

Fine-tuning OpenAI (Stand-in) (distilgpt2)...
GPU Memory - Allocated: 0.75 GiB, Reserved: 0.88 GiB


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4172,0.331882,0.87,0.409091,0.9,0.264706
2,0.274,0.257175,0.895,0.655738,0.740741,0.588235


Evaluation results for OpenAI (Stand-in): {'eval_loss': 0.25717493891716003, 'eval_accuracy': 0.895, 'eval_f1': 0.6557377049180328, 'eval_precision': 0.7407407407407407, 'eval_recall': 0.5882352941176471, 'eval_runtime': 2.1885, 'eval_samples_per_second': 91.387, 'eval_steps_per_second': 22.847, 'epoch': 2.0}
Prediction for 'Great flight with excellent service!' with OpenAI (Stand-in): Positive
Memory cleared - Allocated: 1.01 GiB, Reserved: 1.08 GiB

Fine-tuning Grok (Stand-in) (EleutherAI/gpt-neo-125m)...
GPU Memory - Allocated: 1.01 GiB, Reserved: 1.08 GiB


Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3956,0.347843,0.87,0.48,0.75,0.352941
2,0.2366,0.287624,0.88,0.571429,0.727273,0.470588


Evaluation results for Grok (Stand-in): {'eval_loss': 0.2876235246658325, 'eval_accuracy': 0.88, 'eval_f1': 0.5714285714285714, 'eval_precision': 0.7272727272727273, 'eval_recall': 0.47058823529411764, 'eval_runtime': 3.8524, 'eval_samples_per_second': 51.916, 'eval_steps_per_second': 12.979, 'epoch': 2.0}
Prediction for 'Great flight with excellent service!' with Grok (Stand-in): Positive
Memory cleared - Allocated: 1.58 GiB, Reserved: 1.68 GiB


# Conclusion:

In this notebook, we fine-tuned multiple transformer models, including FLAN-T5, GPT-2, and GPT-Neo, for sentiment analysis on Twitter airline data. We used the Hugging Face library for model loading, tokenization, training, and evaluation. After fine-tuning the models, we assessed their performance using an evaluation strategy based on epochs and saved the fine-tuned models for future inference tasks.