In [None]:
!pip install datasets

In [None]:
!pip install torch


In [None]:
!pip uninstall -y bitsandbytes
!pip install bitsandbytes==0.41.1  # Replace with the latest version if needed


In [None]:
import pandas as pd

# Load the CSV file and preview its first rows.
dataset = pd.read_csv('/content/Poetic_Devices_Final.csv')

# A bare last expression renders the frame as a rich table instead of plain text.
dataset.head()

In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
# NOTE(review): the other cells read and write under /content and ./ rather than
# /content/drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Log in to the Hugging Face Hub through the CLI (run as a shell command).
# NOTE(review): presumably prompts for an access token interactively — confirm it works in this runtime.
!huggingface-cli login


In [None]:
pip install peft


In [None]:
pip install wandb

In [None]:
# -*- coding: utf-8 -*-
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import pandas as pd
import wandb

# Step 1: Define the model path
# Identifier passed to `from_pretrained` for both the model and the tokenizer in later cells.
model_path = "mistralai/Mistral-7B-Instruct-v0.1"  # Replace with your Hugging Face model path



In [None]:
# Step 2: Configure and Load the Model with Int8 Quantization
# Only `load_in_8bit` is needed for int8 loading. Double quantization and the
# "nf4" quant type are 4-bit options (`bnb_4bit_use_double_quant`,
# `bnb_4bit_quant_type`); BitsAndBytesConfig has no `bnb_8bit_*` parameters,
# so passing them was either ignored or rejected depending on the version.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable int8 quantization
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",  # Automatically map layers to devices
    quantization_config=quantization_config,  # Apply int8 quantization
)

In [None]:
# Step 3: Apply LoRA for Trainable Adapters
from peft import prepare_model_for_kbit_training

# Prepare the quantized base model for training before adding adapters, as the
# PEFT documentation recommends for 8-bit/4-bit models.
# NOTE(review): this cell is not idempotent — re-running it wraps `model` again;
# reload the model cell first if you need to re-run.
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target LoRA layers
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



In [None]:
# Step 4: Load and Preprocess Dataset
# Rows with a missing Text are dropped *before* the cast to str: astype(str)
# turns NaN into the literal string "nan", which would pass the empty-string
# filter below and end up in the training data.
df = (
    pd.read_csv('/content/Poetic_Devices_Final.csv')
    .dropna(subset=['Text'])
    .assign(Text=lambda frame: frame['Text'].astype(str))
)
df = df[df['Text'].str.strip() != ''].reset_index(drop=True)  # Remove empty strings
# preserve_index=False keeps the pandas index out of the Dataset columns.
dataset = Dataset.from_pandas(df, preserve_index=False)

# The tokenizer is created before the function that uses it so the cell reads
# top to bottom.
tokenizer = AutoTokenizer.from_pretrained(model_path)
if tokenizer.pad_token is None:
    # Fall back to EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the "Text" column of a batch, truncating over-long inputs."""
    return tokenizer(examples["Text"], truncation=True)


tokenized_dataset = dataset.map(tokenize_function, batched=True)
# A fixed seed makes the train/eval split reproducible across kernel restarts.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]



In [None]:
def tokenize_function(examples):
    """Tokenize a batch of texts to a fixed length and attach causal-LM labels.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``;
            only the ``"Text"`` column is read.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: a copy of
        ``input_ids`` in which every padded position (attention mask 0) is
        replaced by -100.
    """
    tokenized = tokenizer(
        examples["Text"],  # Replace "Text" with the correct column name from your dataset
        padding="max_length",
        truncation=True,
        max_length=128,  # Adjust max_length as needed
    )
    # -100 is the label value the Hugging Face causal-LM loss ignores. Without
    # this mask the padding added by padding="max_length" is scored like real
    # text, so short examples mostly train the model to emit pad tokens.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenize the dataset (repeat this step to ensure labels are properly set)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# A fixed seed keeps the train/eval membership identical across kernel
# restarts, so evaluation losses from different runs are comparable.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [None]:
import wandb

# Open the Weights & Biases run used for the logging that follows.
wandb.init(
    name="Thousand_epoch_training",
    project="huggingface",
)

## BEFORE TRAINING

In [None]:
import wandb

# Function to check model output and log to wandb
def test_model_output(model, tokenizer, input_prompt, stage="Before Training", max_new_tokens=128):
    """Generate a continuation for a prompt, print it, and log it to W&B.

    The previous version decoded the argmax of a single forward pass over a
    padded prompt. That yields one next-token guess per input position, not a
    generated answer. This version calls ``model.generate`` and decodes only
    the newly produced tokens.

    Args:
        model: Causal language model exposing ``eval``, ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        input_prompt: Prompt text to complete.
        stage: Label used as a prefix for the W&B keys.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The prompt and the generated text are printed, and logged to
        W&B when a run is active.
    """
    # A single prompt needs no padding. Truncation keeps the prompt within the
    # 128-token length used when tokenizing the training data.
    tokenized_input = tokenizer(
        input_prompt,
        return_tensors="pt",  # Return PyTorch tensors
        truncation=True,
        max_length=128,  # Ensure it matches training settings
    ).to(model.device)  # Keep inputs on the same device as the model
    prompt_length = tokenized_input["input_ids"].shape[1]

    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        generated_ids = model.generate(
            **tokenized_input,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Greedy decoding so runs are directly comparable
            pad_token_id=tokenizer.pad_token_id,
        )

    # Drop the echoed prompt; keep only what the model produced.
    decoded_output = tokenizer.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=True
    )

    # Log input and output to wandb; skipped when no run has been started so
    # the check can also be used without W&B.
    if wandb.run is not None:
        wandb.log({
            f"{stage} Input Prompt": input_prompt,
            f"{stage} Generated Output": decoded_output
        })

    print("Input Prompt:")
    print(input_prompt)
    print("\nGenerated Output:")
    print(decoded_output)

# Prompt used for the pre-training sanity check (adjacent literals are joined
# into one string).
sample_prompt = (
    "Dani, a dangerous dragon, decided to decimate the dwelling, "
    "dousing it with devilish daring and gasoline. "
    "What are the poetic devices you can find from this line "
    "and also explain their interplay? "
    "How do these devices evoke emotion and meaning?"
)
test_model_output(
    model,
    tokenizer,
    input_prompt=sample_prompt,
    stage="Before Training",
)



In [None]:
import wandb
from transformers import TrainingArguments, Trainer, TrainerCallback

# Open the Weights & Biases run for this cell's training setup.
wandb.init(
    name="After Training",
    project="my-model-training",
)

# Create a callback to log metrics
class MetricLogger(TrainerCallback):
    """Collect training and evaluation losses each time the Trainer logs.

    The Trainer passes the ``logs`` dictionary only to ``on_log``. The previous
    ``on_epoch_end`` hook therefore always saw ``logs=None`` and never recorded
    anything.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # One dict per logging event that carried a loss value.
        self.metrics = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the losses from one logging event and mirror them to W&B.

        Args:
            args: Training arguments (unused).
            state: Trainer state; ``epoch`` and ``global_step`` are read.
            control: Trainer control object (unused).
            logs: Values being logged; ``"loss"`` and ``"eval_loss"`` are read.
            **kwargs: Remaining callback arguments (unused).
        """
        if not logs:
            return

        # Collect metrics
        metrics = {
            "epoch": state.epoch,
            "step": state.global_step,
            "training_loss": logs.get("loss"),
            "eval_loss": logs.get("eval_loss")
        }
        # Skip events that carry neither loss.
        if metrics["training_loss"] is None and metrics["eval_loss"] is None:
            return
        self.metrics.append(metrics)

        # Log metrics to wandb, leaving out the loss that is absent from this
        # event and doing nothing when no run is active.
        if wandb.run is not None:
            wandb.log({key: value for key, value in metrics.items() if value is not None})

# Define training arguments with wandb integration
# Training length is controlled by `max_steps` alone. A positive `max_steps`
# overrides `num_train_epochs`, so the former `num_train_epochs=3` had no
# effect and is no longer passed.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./Training",
    evaluation_strategy="steps",
    eval_steps=100,  # Evaluate every 100 optimizer steps
    save_strategy="steps",
    save_total_limit=2,  # Keep at most two checkpoints on disk
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # 8 sequences per optimizer step per device
    max_steps=1000,  # Total number of optimizer steps
    logging_dir="./logs",
    logging_steps=10,
    fp16=False,
    dataloader_num_workers=0,
    report_to="wandb"  # Enable wandb logging
)

# Keep the callback in its own variable so `metric_logger.metrics` can be
# inspected once training has run.
metric_logger = MetricLogger()

# Assemble the Trainer from the model, arguments, data splits and callback.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
    callbacks=[metric_logger],
)


In [None]:
# Run the fine-tuning loop configured on `trainer`; the value returned by
# `train()` is the cell's displayed output.
print("\n Training...")
trainer.train()

In [None]:
# Save the fine-tuned model and tokenizer
# Both are written to the same local directory, ./fine_tuned_model.
# NOTE(review): `model` was wrapped by get_peft_model earlier, so this presumably
# stores the LoRA adapter rather than full model weights — confirm.
print("\nSaving fine-tuned model...")
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

In [None]:
# Step 8: Generate Response AFTER Training

print("\nGenerating response after training...")
prompt = ("Dani, a dangerous dragon, decided to decimate the dwelling, dousing it with devilish daring and gasoline. What are the poetic devices you can find from this line and also explain their interplay? How do these devices evoke emotion and meaning?")
# Follow the model's own device instead of assuming a device named "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
torch.cuda.empty_cache()

# Make sure dropout (including the LoRA dropout) is off while generating.
model.eval()

# Reduce memory-intensive parameters
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=200,  # Shortened max length (counts the prompt tokens too)
        num_beams=3,  # Fewer beams
        do_sample=True,  # Required for temperature/top_p to take effect
        temperature=0.7,
        repetition_penalty=1.2,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,  # Explicit padding id for generation
    )


response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nResponse after training:\n", response)

**Dataset Insights**

This block creates a bar chart for analyzing the frequency of categories or themes in the dataset:

In [None]:
# matplotlib is imported in this cell because no earlier cell imports it; on a
# fresh kernel run top to bottom, `plt` would otherwise be undefined here.
import matplotlib.pyplot as plt

# Assuming 'Category' and 'Poetic Devices' columns exist in the original dataset
category_counts = df['Category'].value_counts()
device_counts = df['Poetic Devices'].value_counts()

# One bar chart per column, each on its own explicitly created figure.
for counts, title, xlabel in (
    (category_counts, 'Category Frequencies in Dataset', 'Category'),
    (device_counts, 'Poetic Devices Frequencies in Dataset', 'Poetic Device'),
):
    fig, ax = plt.subplots(figsize=(10, 6))
    counts.plot(kind='bar', ax=ax, alpha=0.7, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()


**Attention Visualization**

This block plots a heatmap of the last layer's attention weights (first head) for a sample prompt:

In [None]:
import torch
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Define a prompt and tokenize it
prompt = "Dreams fell like autumn leaves, scattered in the winds of despair."
# Follow the model's own device instead of assuming a device named "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Forward pass with attention outputs enabled. This is inference only, so
# dropout is switched off and no autograd graph is built.
model.eval()
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)

# Extract attention weights (last layer). The cast to float32 keeps the
# conversion to NumPy working when the model computes in half precision.
attention_weights = outputs.attentions[-1].squeeze(0).detach().float().cpu().numpy()  # Shape: (num_heads, seq_len, seq_len)

# Plot attention weights for a single head (choose the first head for simplicity)
head_attention = attention_weights[0]

# Create a heatmap for the attention weights
plt.figure(figsize=(12, 8))
sns.heatmap(head_attention, cmap='viridis', annot=False)
plt.title('Attention Heatmap (First Head)')
plt.xlabel('Input Tokens')
plt.ylabel('Output Tokens')
plt.show()


**Sentiment Analysis**

In [None]:
pip install transformers

In [None]:
from transformers import pipeline
import pandas as pd

# Load the dataset
file_path = '/content/Poetic_Devices_Final.csv'
data = pd.read_csv(file_path)

# Load a zero-shot classification model on the first GPU when one is
# available, and on the CPU (-1) otherwise, so the cell also runs on a
# CPU-only runtime. `torch` comes from the import cell near the top.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
)

# Define the target emotions
target_emotions = ["love", "sorrow", "anger", "hatred"]

# Function to classify a batch of texts
def classify_emotion_batch(texts, threshold=0.5):
    """Label each text with its top-scoring emotion, or "neutral" if unsure.

    Uses the module-level ``classifier`` pipeline and ``target_emotions`` list.

    Args:
        texts: A list of strings, or a single string (treated as a batch of one).
        threshold: Minimum top score required to keep the predicted label;
            below it the text is labelled "neutral".

    Returns:
        A tuple ``(emotions, confidences)`` of two lists aligned with ``texts``.
        ``confidences`` always holds the top score, also for "neutral" rows.
        An empty input returns ``([], [])`` without calling the classifier.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        return [], []

    results = classifier(texts, candidate_labels=target_emotions, multi_label=False)
    # A one-item batch may come back as a bare dict rather than a list of
    # dicts (seen with some pipeline versions); normalise so the loop below
    # never iterates over a dict's keys.
    if isinstance(results, dict):
        results = [results]

    emotions = []
    confidences = []
    for res in results:
        # Labels and scores are sorted best-first, so index 0 is the top prediction.
        top_label = res['labels'][0]
        top_score = res['scores'][0]

        # Assign "neutral" if confidence is below the threshold
        emotions.append("neutral" if top_score < threshold else top_label)
        confidences.append(top_score)

    return emotions, confidences

# Process the data in batches
batch_size = 16
emotions = []
confidences = []

# Missing values become empty strings so every item handed to the classifier
# is a str and the results stay aligned row-for-row with `data`.
all_texts = data['Text'].fillna('').astype(str).tolist()

for i in range(0, len(all_texts), batch_size):
    batch_texts = all_texts[i:i + batch_size]
    batch_emotions, batch_confidences = classify_emotion_batch(batch_texts)
    emotions.extend(batch_emotions)
    confidences.extend(batch_confidences)

# Add results to the dataframe
data['Emotion'] = emotions
data['Confidence'] = confidences

# Save the results to a new file
output_path = '/content/Zero_Shot_Emotion_Annotated_Neutral.csv'
data.to_csv(output_path, index=False)

# Preview the results
print(data.head())
print(f"Annotated dataset saved to: {output_path}")


In [None]:
# Save the model and tokenizer to ./fine_tuned_model.
# NOTE(review): this repeats the save performed right after training, and no
# cell in between trains the model again — confirm it is still needed.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
