In [1]:
import os
import pandas as pd
from transformers import pipeline
from tqdm import tqdm  # For progress tracking

# Suppress Symlink Warning
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Load classification pipelines
english_emotion_pipeline = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli"
)

hindi_emotion_pipeline = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli"
)

# Function to preprocess text (remove special characters, excessive whitespace)
def preprocess_text(text, max_tokens=512, drop_long_texts=False):
    text = ''.join(char if char.isalnum() or char.isspace() else ' ' for char in text)  # Remove special symbols
    text = ' '.join(text.split())  # Remove extra spaces

    tokenized_text = text.split()
    if len(tokenized_text) > max_tokens:
        if drop_long_texts:
            return None
        text = ' '.join(tokenized_text[:max_tokens])  # Truncate text

    return text.strip() if text.strip() else None


# Function to detect emotions in a batch
def detect_emotions_batch(texts, pipeline_func, max_tokens=512):
    labels = ["joy", "anger", "sadness", "fear", "love", "surprise"]  # Define emotion labels
    results = []
    for text in tqdm(texts, desc="Processing texts"):
        preprocessed_text = preprocess_text(text, max_tokens=max_tokens, drop_long_texts=True)
        if preprocessed_text is None:
            results.append(("DROPPED", 0.0))
            continue
        try:
            emotion_result = pipeline_func(preprocessed_text, candidate_labels=labels)
            dominant_emotion = max(zip(emotion_result['labels'], emotion_result['scores']), key=lambda x: x[1])
            results.append(dominant_emotion)
        except Exception as e:
            print(f"Error processing text: '{text[:50]}...' -> {e}")
            results.append(("UNKNOWN", 0.0))  # Fallback for errors
    return results


# Function to augment dataset with emotion labels for both English and Hindi
def augment_dataset_with_emotions(file_path, sample_size=None, max_tokens=512):
    data = pd.read_csv(file_path)

    if sample_size:
        data = data.sample(n=sample_size, random_state=42).reset_index(drop=True)

    data['english'] = data['english'].astype(str).fillna("")
    data['hindi'] = data['hindi'].astype(str).fillna("")

    # Detect emotions for English and Hindi with progress tracking
    print("Detecting emotions for English texts...")
    english_results = detect_emotions_batch(data['english'].tolist(), english_emotion_pipeline, max_tokens=max_tokens)
    
    print("Detecting emotions for Hindi texts...")
    hindi_results = detect_emotions_batch(data['hindi'].tolist(), hindi_emotion_pipeline, max_tokens=max_tokens)

    # Extract emotions and confidences
    english_emotions, english_confidences = zip(*english_results)
    hindi_emotions, hindi_confidences = zip(*hindi_results)

    # Add new columns
    data['English_Emotion'] = english_emotions
    data['English_Confidence'] = english_confidences
    data['Hindi_Emotion'] = hindi_emotions
    data['Hindi_Confidence'] = hindi_confidences

    # Drop rows where either emotion was marked as "DROPPED"
    data = data[(data['English_Emotion'] != "DROPPED") & (data['Hindi_Emotion'] != "DROPPED")]
    return data


# Function to prepare translation data
def prepare_translation_data(data):
    return [
        (row['english'], row['hindi'], row['English_Emotion'], row['English_Confidence'], row['Hindi_Emotion'], row['Hindi_Confidence'])
        for _, row in data.iterrows()
    ]


# Function to save the processed data
def save_training_data(data, output_path):
    df = pd.DataFrame(
        data,
        columns=['english', 'hindi', 'English_Emotion', 'English_Confidence', 'Hindi_Emotion', 'Hindi_Confidence']
    )
    df.to_csv(output_path, index=False)


# File paths
file_path = 'hindi_english_parallel.csv'  # Input file path
output_path = 'augmented_translation_data.csv'  # Output file path

# Use a smaller dataset for testing or full dataset
print("Augmenting dataset with emotion labels...")
augmented_data = augment_dataset_with_emotions(file_path, sample_size=500, max_tokens=512)

# Prepare and save the processed data
print("Preparing and saving the final dataset...")
final_data = prepare_translation_data(augmented_data)
save_training_data(final_data, output_path)

print("Process completed. Augmented dataset saved to:", output_path)


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassif

Augmenting dataset with emotion labels...
Detecting emotions for English texts...


Processing texts: 100%|██████████████████████████████████████████████████████████████| 500/500 [12:23<00:00,  1.49s/it]


Detecting emotions for Hindi texts...


Processing texts: 100%|████████████████████████████████████████████████████████████| 500/500 [1:42:13<00:00, 12.27s/it]

Preparing and saving the final dataset...
Process completed. Augmented dataset saved to: augmented_translation_data.csv





In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import random

# Load dataset
data_path = 'augmented_translation_data.csv'
df = pd.read_csv(data_path).dropna().sample(frac=1).reset_index(drop=True)
df = df[['english', 'hindi']]

# Tokenization
def tokenize(text):
    return text.lower().strip().split()

# Vocabulary Building
def build_vocab(texts):
    vocab = {"<unk>": 0, "<pad>": 1, "<sos>": 2, "<eos>": 3}
    for text in texts:
        for token in tokenize(text):
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

src_vocab = build_vocab(df['english'])
trg_vocab = build_vocab(df['hindi'])

# Save vocabularies
torch.save(src_vocab, "src_vocab.pth")
torch.save(trg_vocab, "trg_vocab.pth")

# Custom Dataset
class TranslationDataset(Dataset):
    def __init__(self, data, src_vocab, trg_vocab):
        self.data = data
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src = tokenize(self.data.iloc[idx]['english'])
        trg = tokenize(self.data.iloc[idx]['hindi'])

        src_tensor = [self.src_vocab.get(token, 0) for token in ["<sos>"] + src + ["<eos>"]]
        trg_tensor = [self.trg_vocab.get(token, 0) for token in ["<sos>"] + trg + ["<eos>"]]

        return torch.tensor(src_tensor), torch.tensor(trg_tensor)

# Hyperparameters
BATCH_SIZE = 32
EMBED_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5
LEARNING_RATE = 0.001
NUM_EPOCHS = 10
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# DataLoader
def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=1, batch_first=False)
    trg_batch = torch.nn.utils.rnn.pad_sequence(trg_batch, padding_value=1, batch_first=False)
    return src_batch, trg_batch

train_dataset = TranslationDataset(df, src_vocab, trg_vocab)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Encoder
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=1)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

# Decoder
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=1)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

# Seq2Seq Model
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len, batch_size = trg.shape
        trg_vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        input = trg[0, :]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            top1 = output.argmax(1)
            input = trg[t] if random.random() < teacher_forcing_ratio else top1

        return outputs

# Model Setup
INPUT_DIM = len(src_vocab)
OUTPUT_DIM = len(trg_vocab)
encoder = Encoder(INPUT_DIM, EMBED_DIM, HID_DIM, N_LAYERS, DROPOUT).to(DEVICE)
decoder = Decoder(OUTPUT_DIM, EMBED_DIM, HID_DIM, N_LAYERS, DROPOUT).to(DEVICE)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

# Loss & Optimizer
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=1)

# Training Function
def train(model, loader, optimizer, criterion, device, clip=1.0):
    model.train()
    epoch_loss = 0

    for src, trg in loader:
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].reshape(-1, output_dim)
        trg = trg[1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(loader)

# Training Loop
for epoch in range(NUM_EPOCHS):
    loss = train(model, train_loader, optimizer, criterion, DEVICE)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {loss:.4f}")

# Save Model
torch.save(model.state_dict(), "translation_model.pth")
print("Model training complete and saved.")


Epoch 1/10, Loss: 7.4153
Epoch 2/10, Loss: 6.7807
Epoch 3/10, Loss: 6.6062
Epoch 4/10, Loss: 6.4686
Epoch 5/10, Loss: 6.3791
Epoch 6/10, Loss: 6.3083
Epoch 7/10, Loss: 6.2320
Epoch 8/10, Loss: 6.1721
Epoch 9/10, Loss: 6.1138
Epoch 10/10, Loss: 6.0532
Model training complete and saved.


In [2]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.metrics import accuracy_score

# Load Data
translated_data_path = 'translated_with_emotion_evaluation.csv'
data = pd.read_csv(translated_data_path).dropna(subset=["Original_Hindi", "Translated_Text"])  # Ensure no missing values

# Initialize Metrics
rouge = Rouge()
smoothie = SmoothingFunction().method4  # BLEU smoothing function

# Function to Compute BLEU Score
def compute_bleu(reference, hypothesis):
    if not isinstance(reference, str) or not isinstance(hypothesis, str) or not reference or not hypothesis:
        return 0.0  # Handle empty strings safely
    
    reference_tokens = [reference.split()]
    hypothesis_tokens = hypothesis.split()
    
    return sentence_bleu(reference_tokens, hypothesis_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

# Function to Compute ROUGE Score
def compute_rouge(reference, hypothesis):
    if not isinstance(reference, str) or not isinstance(hypothesis, str) or not reference or not hypothesis:
        return {"rouge-1": 0.0, "rouge-2": 0.0, "rouge-l": 0.0}
    
    scores = rouge.get_scores(hypothesis, reference)[0]  # Get ROUGE scores
    return {
        "rouge-1": scores["rouge-1"]["f"],
        "rouge-2": scores["rouge-2"]["f"],
        "rouge-l": scores["rouge-l"]["f"]
    }

# Function to Evaluate Emotion Accuracy
def compute_emotion_accuracy(data):
    actual_emotions = data["Input_Emotion"].tolist()
    translated_emotions = data["Translated_Emotion"].tolist()
    
    # Filter out cases where the translated emotion is "UNKNOWN"
    valid_indices = [i for i in range(len(actual_emotions)) if translated_emotions[i] != "UNKNOWN"]
    
    if not valid_indices:
        return 0.0  # Avoid division by zero

    actual_filtered = [actual_emotions[i] for i in valid_indices]
    translated_filtered = [translated_emotions[i] for i in valid_indices]

    return accuracy_score(actual_filtered, translated_filtered)

# Compute Scores for Each Row
bleu_scores = []
rouge1_scores, rouge2_scores, rougel_scores = [], [], []

for _, row in data.iterrows():
    reference = row["Original_Hindi"]
    hypothesis = row["Translated_Text"]

    bleu_scores.append(compute_bleu(reference, hypothesis))
    rouge_scores = compute_rouge(reference, hypothesis)
    
    rouge1_scores.append(rouge_scores["rouge-1"])
    rouge2_scores.append(rouge_scores["rouge-2"])
    rougel_scores.append(rouge_scores["rouge-l"])

# Compute Averages
avg_bleu = np.mean(bleu_scores)
avg_rouge1 = np.mean(rouge1_scores)
avg_rouge2 = np.mean(rouge2_scores)
avg_rougel = np.mean(rougel_scores)
emotion_accuracy = compute_emotion_accuracy(data)

# Display Evaluation Results
print("\n===== TRANSLATION EVALUATION RESULTS =====")
print(f"🔹 Average BLEU Score: {avg_bleu:.4f}")
print(f"🔹 Average ROUGE-1 Score: {avg_rouge1:.4f}")
print(f"🔹 Average ROUGE-2 Score: {avg_rouge2:.4f}")
print(f"🔹 Average ROUGE-L Score: {avg_rougel:.4f}")
print(f"🔹 Emotion Accuracy: {emotion_accuracy:.2%}")



===== TRANSLATION EVALUATION RESULTS =====
🔹 Average BLEU Score: 0.1969
🔹 Average ROUGE-1 Score: 0.3815
🔹 Average ROUGE-2 Score: 0.2190
🔹 Average ROUGE-L Score: 0.3650
🔹 Emotion Accuracy: 83.70%


In [None]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm  # Import tqdm for progress tracking

# Load NLLB-200 Model (Facebook's Translation Model)
MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Function to translate English to Hindi
def translate_sentence(text, src_lang="eng_Latn", tgt_lang="hin_Deva"):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs["forced_bos_token_id"] = tokenizer.convert_tokens_to_ids(tgt_lang) 
    translated_tokens = model.generate(**inputs)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Load Data
data_path = "translated_with_emotion_evaluation.csv"
data = pd.read_csv(data_path)

# Check column names
print("CSV Columns:", data.columns.tolist())

# Ensure correct column names
expected_columns = ["Original_Hindi", "Original_English"]
data = data.dropna(subset=expected_columns)

# Translate all English sentences with a progress bar
translated_texts = []
for _, row in tqdm(data.iterrows(), total=len(data), desc="Translating", unit="sentence"):
    english_text = row["Original_English"]
    translated_text = translate_sentence(english_text)
    translated_texts.append(translated_text)

# Add new translations to the DataFrame
data["Translated_Text"] = translated_texts

# Save to a new file
new_file_path = "nllb_translated_output.csv"
data.to_csv(new_file_path, index=False)

print(f"✅ Translation completed! Saved results to {new_file_path}.")


CSV Columns: ['Original_English', 'Original_Hindi', 'Translated_Text', 'Input_Emotion', 'Translated_Emotion', 'Confidence_Deviation']


Translating: 100%|██████████| 497/497 [26:30<00:00,  3.20s/sentence] 

✅ Translation completed! Saved results to nllb_translated_output.csv.





In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load NLLB-200 Model (Facebook's Translation Model)
MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Function to translate English to Hindi
def translate_sentence(text, src_lang="eng_Latn", tgt_lang="hin_Deva"):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs["forced_bos_token_id"] = tokenizer.convert_tokens_to_ids(tgt_lang)
    
    with torch.no_grad():  # No need to track gradients
        translated_tokens = model.generate(**inputs)
    
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Interactive Loop
print("🔹 English-to-Hindi Translator (Type 'exit' to quit) 🔹")
while True:
    text = input("\nEnter English text: ")
    if text.lower() == "exit":
        print("👋 Exiting translator. Have a great day!")
        break
    translation = translate_sentence(text)
    print(f"📝 Translated: {translation}")


🔹 English-to-Hindi Translator (Type 'exit' to quit) 🔹
📝 Translated: मैं तुमसे नफरत करता हूँ
📝 Translated: मैं आप की प्रशंसा करता हूँ
👋 Exiting translator. Have a great day!


In [1]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.metrics import accuracy_score

# Load Data
translated_data_path = "nllb_translated_output.csv"  # Updated file path
data = pd.read_csv(translated_data_path).dropna(subset=["Original_Hindi", "Translated_Text"])  # Ensure no missing values

# Initialize Metrics
rouge = Rouge()
smoothie = SmoothingFunction().method4  # BLEU smoothing function

# Function to Compute BLEU Score
def compute_bleu(reference, hypothesis):
    if not isinstance(reference, str) or not isinstance(hypothesis, str) or not reference or not hypothesis:
        return 0.0  # Handle empty strings safely
    
    reference_tokens = [reference.split()]
    hypothesis_tokens = hypothesis.split()
    
    return sentence_bleu(reference_tokens, hypothesis_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

# Function to Compute ROUGE Score
def compute_rouge(reference, hypothesis):
    if not isinstance(reference, str) or not isinstance(hypothesis, str) or not reference or not hypothesis:
        return {"rouge-1": 0.0, "rouge-2": 0.0, "rouge-l": 0.0}
    
    scores = rouge.get_scores(hypothesis, reference)[0]  # Get ROUGE scores
    return {
        "rouge-1": scores["rouge-1"]["f"],
        "rouge-2": scores["rouge-2"]["f"],
        "rouge-l": scores["rouge-l"]["f"]
    }

# Function to Evaluate Emotion Accuracy
def compute_emotion_accuracy(data):
    if "Input_Emotion" not in data.columns or "Translated_Emotion" not in data.columns:
        print("⚠️ Emotion columns not found, skipping emotion accuracy calculation.")
        return None

    actual_emotions = data["Input_Emotion"].tolist()
    translated_emotions = data["Translated_Emotion"].tolist()
    
    # Filter out cases where the translated emotion is "UNKNOWN"
    valid_indices = [i for i in range(len(actual_emotions)) if translated_emotions[i] != "UNKNOWN"]
    
    if not valid_indices:
        return 0.0  # Avoid division by zero

    actual_filtered = [actual_emotions[i] for i in valid_indices]
    translated_filtered = [translated_emotions[i] for i in valid_indices]

    return accuracy_score(actual_filtered, translated_filtered)

# Compute Scores for Each Row
bleu_scores = []
rouge1_scores, rouge2_scores, rougel_scores = [], [], []

for _, row in data.iterrows():
    reference = row["Original_Hindi"]
    hypothesis = row["Translated_Text"]

    bleu_scores.append(compute_bleu(reference, hypothesis))
    rouge_scores = compute_rouge(reference, hypothesis)
    
    rouge1_scores.append(rouge_scores["rouge-1"])
    rouge2_scores.append(rouge_scores["rouge-2"])
    rougel_scores.append(rouge_scores["rouge-l"])

# Compute Averages
avg_bleu = np.mean(bleu_scores)
avg_rouge1 = np.mean(rouge1_scores)
avg_rouge2 = np.mean(rouge2_scores)
avg_rougel = np.mean(rougel_scores)
emotion_accuracy = compute_emotion_accuracy(data)

# Display Evaluation Results
print("\n===== TRANSLATION EVALUATION RESULTS =====")
print(f"🔹 Average BLEU Score: {avg_bleu:.4f}")
print(f"🔹 Average ROUGE-1 Score: {avg_rouge1:.4f}")
print(f"🔹 Average ROUGE-2 Score: {avg_rouge2:.4f}")
print(f"🔹 Average ROUGE-L Score: {avg_rougel:.4f}")

if emotion_accuracy is not None:
    print(f"🔹 Emotion Accuracy: {emotion_accuracy:.2%}")



===== TRANSLATION EVALUATION RESULTS =====
🔹 Average BLEU Score: 0.1238
🔹 Average ROUGE-1 Score: 0.3669
🔹 Average ROUGE-2 Score: 0.1680
🔹 Average ROUGE-L Score: 0.3465
🔹 Emotion Accuracy: 83.70%
