<a href="https://colab.research.google.com/github/yashika-8/pythia/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [100]:
!pip install torch transformers datasets evaluate scikit-learn




In [66]:
import os
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline,
    AutoModelForSequenceClassification
)
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_score, f1_score
import numpy as np

In [69]:
# Set a padding token for the tokenizer (use eos_token as pad_token if not defined)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token




In [67]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-410m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
# Update the model config to handle the padding token
model.config.pad_token_id = tokenizer.pad_token_id

In [50]:
def get_model_size(model_dir):
    total_size = 0
    for dirpath, _, filenames in os.walk(model_dir):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            total_size += os.path.getsize(filepath)
    return total_size / (1024 * 1024)

In [51]:
class PythiaForClassification(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.base_model = AutoModelForCausalLM.from_pretrained(model_name)
        self.classifier = nn.Linear(self.base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, -1, :])  # Use the last token's hidden state

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

In [52]:
train_df = pd.read_csv("train.csv", encoding="ISO-8859-1")
test_df = pd.read_csv("test.csv", encoding="ISO-8859-1")


In [60]:
label_mapping = {"positive": 1, "negative": 0, "neutral": 2}


In [61]:
train_df["sentiment"] = train_df["sentiment"].map(label_mapping)
test_df["sentiment"] = test_df["sentiment"].map(label_mapping)


In [62]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [41]:
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})


In [42]:
original_model = AutoModelForCausalLM.from_pretrained(model_name)
original_model.save_pretrained("./original_model")
tokenizer.save_pretrained("./original_model")


('./original_model/tokenizer_config.json',
 './original_model/special_tokens_map.json',
 './original_model/tokenizer.json')

In [43]:
original_model_size = get_model_size("./original_model")
print(f"Original model size: {original_model_size:.2f} MB")


Original model size: 1549.69 MB


In [44]:
generator = pipeline("text-generation", model=original_model, tokenizer=tokenizer)
print("Text before quantization:")
print(generator("The weather today is", max_length=20, num_return_sequences=1)[0]["generated_text"])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Text before quantization:
The weather today is a bit of a mixed bag. The sun is shining, the temperature is in


In [45]:
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
quantized_model.save_pretrained("./quantized_model")


In [46]:
quantized_model_size = get_model_size("./quantized_model")
print(f"Quantized model size: {quantized_model_size:.2f} MB")

Quantized model size: 773.15 MB


In [54]:
generator_quantized = pipeline("text-generation", model=quantized_model, tokenizer=tokenizer)
print("Text after quantization:")
print(generator_quantized("The weather today is", max_length=20, num_return_sequences=1)[0]["generated_text"])

Device set to use cuda:0


Text after quantization:
The weather today is a bit of a mixed bag. The sun is shining, the temperature is in


In [81]:
def preprocess_function(examples):
    texts = [str(text) for text in examples["text"]]
    labels = examples.get("sentiment", None)

    if labels is None:
        raise ValueError("The dataset does not contain a 'sentiment' column.")

    # Replace NoneType labels with a default value (e.g., 0)
    labels = [label if label is not None else 0 for label in labels]

    # Convert labels to integers
    try:
        labels = [int(label) for label in labels]
    except ValueError as e:
        raise ValueError(f"Found invalid labels in data: {labels}") from e

    # Tokenize the texts
    tokenized = tokenizer(texts, padding="max_length", truncation=True)
    tokenized["labels"] = labels
    return tokenized


In [82]:
# Filter out rows where 'sentiment' is None
filtered_test_dataset = test_dataset.filter(lambda x: x["sentiment"] is not None)
tokenized_test = filtered_test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)


Filter:   0%|          | 0/4815 [00:00<?, ? examples/s]

Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [83]:
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)


Map:   0%|          | 0/27481 [00:00<?, ? examples/s]

Map:   0%|          | 0/4815 [00:00<?, ? examples/s]

In [84]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    push_to_hub=False,
    fp16=True,  # Enable mixed precision for faster training
)



In [85]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.4429,2.090441
2,0.3447,2.488756
3,0.1719,3.18738


TrainOutput(global_step=10308, training_loss=0.3033022149055791, metrics={'train_runtime': 2742.2674, 'train_samples_per_second': 30.064, 'train_steps_per_second': 3.759, 'total_flos': 5002034583287808.0, 'train_loss': 0.3033022149055791, 'epoch': 3.0})

In [86]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/tokenizer.json')

In [88]:
predictions = trainer.predict(tokenized_test)
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1)

In [92]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Get predictions from the trainer
predictions = trainer.predict(tokenized_test)

# Extract predicted class indices
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1)

# Extract true labels (if available in the dataset)
if hasattr(predictions, "label_ids"):
    labels = torch.tensor(predictions.label_ids)  # Ensure labels exist
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average="weighted")
    recall = recall_score(labels, preds, average="weighted")
    f1 = f1_score(labels, preds, average="weighted")

    # Print metrics
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
else:
    print("No labels found in predictions. Metrics cannot be computed.")


Accuracy: 0.5638629283489096
Precision: 0.6814810353346556
Recall: 0.5638629283489096
F1 Score: 0.557131297743874


In [99]:
import torch

# Custom text for evaluation
custom_text = ["You are looking good!", "I am very disappointed."]

# Tokenize the input text with padding and truncation
tokenized_text = tokenizer(
    custom_text,
    padding="max_length",          # Ensures all sequences are of uniform length
    truncation=True,               # Truncates sequences longer than the model's max length
    max_length=128,                # Optional: set max length explicitly
    return_tensors="pt"            # Returns PyTorch tensors
)

# Move tokenized input to the same device as the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokenized_text = {key: val.to(device) for key, val in tokenized_text.items()}

# Ensure the model is in evaluation mode
model.eval()

# Perform predictions
with torch.no_grad():
    outputs = model(**tokenized_text)

# Convert logits to predictions
preds = torch.argmax(outputs.logits, dim=1)

# Map predictions to sentiment labels
label_map = {0: "Negative", 1: "Positive", 2: "Neutral"}
predicted_labels = [label_map[pred.item()] for pred in preds]

# Display results
for text, label in zip(custom_text, predicted_labels):
    print(f"Text: {text}\nPredicted Sentiment: {label}\n")


Text: You are looking good!
Predicted Sentiment: Positive

Text: I am very disappointed.
Predicted Sentiment: Negative

