In [5]:
# Imports 
import torch
import pandas as pd
import numpy as np
import copy
import sacrebleu
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments,
    MarianMTModel, MarianTokenizer, 
    get_linear_schedule_with_warmup
)
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, KFold
from tqdm.auto import tqdm
from sklearn.metrics import classification_report, accuracy_score



In [6]:
def check_gpu():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"GPU available: {torch.cuda.get_device_name(0)}")
        return device
    else:
        print("No GPU available, using CPU")
        return torch.device("cpu")

def load_balanced_data(path, sample_size=10000):
    columns = ['target', 'id', 'date', 'flag', 'user', 'text']
    df = pd.read_csv(path, encoding='latin-1', names=columns)
    pos_samples = df[df['target'] == 4].sample(n=sample_size//2, random_state=42)
    neg_samples = df[df['target'] == 0].sample(n=sample_size//2, random_state=42)
    return pd.concat([pos_samples, neg_samples]).reset_index(drop=True)

def load_model(lang_code, device):
    model_dir = f"./T5-{lang_code}-en"
    tokenizer = T5Tokenizer.from_pretrained(model_dir)
    model = T5ForConditionalGeneration.from_pretrained(model_dir)
    model = model.to(device)
    model.eval()  # Set model to evaluation mode
    return model, tokenizer

def predict_sentiment_batch(texts, model, tokenizer, device, batch_size=32):
    predictions = []
    model.eval()
    
    with torch.no_grad():  # Disable gradient calculation
        for i in tqdm(range(0, len(texts), batch_size)):
            batch = texts[i:i + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            outputs = model.generate(**inputs)
            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            batch_predictions = [4 if 'positive' in pred.lower() else 0 for pred in decoded]
            predictions.extend(batch_predictions)
            
            # Clear CUDA cache periodically
            if device.type == "cuda" and i % (batch_size * 10) == 0:
                torch.cuda.empty_cache()
    
    return predictions

def evaluate_model(lang_code, test_data, device):
    model, tokenizer = load_model(lang_code, device)
    predictions = predict_sentiment_batch(test_data['text'].tolist(), model, tokenizer, device)
    
    accuracy = accuracy_score(test_data['target'], predictions)
    report = classification_report(test_data['target'], predictions)
    
    return {
        'predictions': predictions,
        'accuracy': accuracy,
        'report': report
    }


# Evaluating the T5 models on sentiment analysis 
# Check and set up GPU
device = check_gpu()

# Load balanced dataset
data = load_balanced_data('training.1600000.processed.noemoticon.csv', sample_size=10000)
print(f"Loaded {len(data)} tweets with distribution:\n{data['target'].value_counts()}")

# Languages to test
languages = ['fr', 'de', 'ro']

# Evaluate each model
for lang_code in languages:
    print(f"\nEvaluating {lang_code} model...")
    
    try:
        results = evaluate_model(lang_code, data, device)
        print(f"\nAccuracy: {results['accuracy']:.4f}")
        print("\nClassification Report:")
        print(results['report'])
        
        pd.DataFrame({
            'text': data['text'],
            'true_sentiment': data['target'],
            'predicted_sentiment': results['predictions']
        }).to_csv(f'sentiment_results_{lang_code}.csv', index=False)
        
    except Exception as e:
        print(f"Error processing {lang_code} model: {str(e)}")
        continue
    
    # Clear GPU memory after each model
    if device.type == "cuda":
        torch.cuda.empty_cache()

In [7]:
# Roberta-Base Model 
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

class SentimentAnalyzer:
    def __init__(self, model_name="roberta-base", device=None):
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, 
            num_labels=2,
            hidden_dropout_prob=0.3,
            attention_probs_dropout_prob=0.3
        ).to(self.device)

    def train(self, train_texts, train_labels, val_texts, val_labels, batch_size=32, epochs=3, learning_rate=2e-5):
       train_dataset = SentimentDataset(train_texts, train_labels, self.tokenizer)
       train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
       
       val_dataset = SentimentDataset(val_texts, val_labels, self.tokenizer)
       val_loader = DataLoader(val_dataset, batch_size=batch_size)
       
       # Compute class weights
       class_counts = np.bincount(train_labels)
       class_weights = torch.FloatTensor(1.0 / class_counts).to(self.device)
       
       # Use weighted loss
       criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
       
       # Reduce learning rate and add weight decay
       optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate, weight_decay=0.01)
       
       # Warmup and decay learning rate
       num_training_steps = len(train_loader) * epochs
       scheduler = get_linear_schedule_with_warmup(
           optimizer,
           num_warmup_steps=num_training_steps // 10,
           num_training_steps=num_training_steps
       )
       
       best_val_acc = 0
       best_model = None
       
       for epoch in range(epochs):
           self.model.train()
           total_loss = 0
           
           for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
               batch = {k: v.to(self.device) for k, v in batch.items()}
               
               optimizer.zero_grad()
               outputs = self.model(**batch)
               
               # Use weighted loss
               logits = outputs.logits
               labels = batch['labels']
               loss = criterion(logits, labels)
               
               loss.backward()
               torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
               optimizer.step()
               scheduler.step()
               
               total_loss += loss.item()
           
           # Validation with detailed metrics
           val_metrics = self.evaluate(val_texts, val_labels, batch_size)
           print(f'Epoch {epoch + 1} - Train Loss: {total_loss/len(train_loader):.4f}')
           print(f'Validation Accuracy: {val_metrics["accuracy"]:.4f}')
           print('Validation Report:')
           print(classification_report(
               val_labels, 
               self.predict(val_texts, batch_size),
               labels=[0, 1],
               target_names=['Negative', 'Positive']
           ))
           
           if val_metrics["accuracy"] > best_val_acc:
               best_val_acc = val_metrics["accuracy"]
               best_model = copy.deepcopy(self.model.state_dict())
       
       # Restore best model
       if best_model is not None:
           self.model.load_state_dict(best_model)
    
    def predict(self, texts, batch_size=32):
       self.model.eval()
       dataset = SentimentDataset(texts, [0] * len(texts), self.tokenizer)
       dataloader = DataLoader(dataset, batch_size=batch_size)
       
       predictions = []
       with torch.no_grad():
           for batch in dataloader:
               batch = {k: v.to(self.device) for k, v in batch.items()}
               outputs = self.model(**batch)
               # Apply temperature scaling for better calibration
               logits = outputs.logits / 1.5  # Temperature parameter
               preds = torch.softmax(logits, dim=1)
               predictions.extend(preds.argmax(dim=1).cpu().numpy())
       
       return predictions
    
    def evaluate(self, texts, labels, batch_size=32):
        predictions = self.predict(texts, batch_size)
        report = classification_report(labels, predictions, output_dict=True)
        return {
            "accuracy": report["accuracy"],
            "report": report
        }

def prepare_balanced_data(filepath, total_samples=200000):
    print("Loading data...")
    # Read initial chunk to get negative samples
    neg_data = pd.read_csv(filepath, encoding='latin-1', 
                          names=['target', 'id', 'date', 'flag', 'user', 'text'],
                          nrows=total_samples)
    
    # Skip to later part of file to get positive samples
    skiprows = lambda x: x > total_samples and x <= 2*total_samples
    pos_data = pd.read_csv(filepath, encoding='latin-1',
                          names=['target', 'id', 'date', 'flag', 'user', 'text'],
                          skiprows=skiprows)
    
    print("Class distribution in chunks:")
    print("First chunk:", neg_data['target'].value_counts())
    print("Second chunk:", pos_data['target'].value_counts())
    
    samples_per_class = total_samples // 2
    
    # Sample from each class
    try:
        neg_samples = neg_data[neg_data['target'] == 0].sample(n=samples_per_class, random_state=42)
    except ValueError:
        print("Not enough negative samples in first chunk")
        return None, None, None
        
    try:
        pos_samples = pos_data[pos_data['target'] == 4].sample(n=samples_per_class, random_state=42)
    except ValueError:
        print("Not enough positive samples in second chunk")
        return None, None, None
    
    # Combine and shuffle
    balanced_data = pd.concat([neg_samples, pos_samples])
    balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Split data
    train_data, temp_data = train_test_split(
        balanced_data, 
        train_size=0.7,
        random_state=42,
        stratify=balanced_data['target']
    )
    
    val_data, test_data = train_test_split(
        temp_data,
        test_size=0.5,
        random_state=42,
        stratify=temp_data['target']
    )
    
    print("\nFinal dataset sizes:")
    print(f"Training: {len(train_data)} ({train_data['target'].value_counts().to_dict()})")
    print(f"Validation: {len(val_data)} ({val_data['target'].value_counts().to_dict()})")
    print(f"Test: {len(test_data)} ({test_data['target'].value_counts().to_dict()})")
    
    return train_data, val_data, test_data


# # Training the Roberta-Base model 
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Prepare data
# train_data, val_data, test_data = prepare_balanced_data('training.1600000.processed.noemoticon.csv', total_samples=200000)
# print(f"\nData distribution:")
# print("Train:", train_data['target'].value_counts())
# print("Validation:", val_data['target'].value_counts())
# print("Test:", test_data['target'].value_counts())

# # Train model
# analyzer = SentimentAnalyzer(device=device)
# best_acc = analyzer.train(
#     train_texts=train_data['text'].tolist(),
#     train_labels=[(1 if t == 4 else 0) for t in train_data['target']],
#     val_texts=val_data['text'].tolist(),
#     val_labels=[(1 if t == 4 else 0) for t in val_data['target']],
#     batch_size=16,
#     epochs=5
# )

# # Final evaluation
# test_metrics = analyzer.evaluate(
#     test_data['text'].tolist(),
#     [(1 if t == 4 else 0) for t in test_data['target']]
# )
# print("\nTest Set Results:")
# print(f"Accuracy: {test_metrics['accuracy']:.4f}")
# print("\nClassification Report:")
# print(test_metrics['report'])

In [10]:
# # Save the model state dictionary
# torch.save(analyzer.model.state_dict(), 'sentiment_analyzer.pt')
# analyzer.tokenizer.save_pretrained('sentiment_tokenizer')

# Loading Roberta-Base model
def load_sentiment_model(model_path='sentiment_analyzer.pt', tokenizer_path='sentiment_tokenizer'):
    analyzer = SentimentAnalyzer()
    analyzer.model.load_state_dict(torch.load(model_path))
    analyzer.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return analyzer

In [39]:
# Evaluate Roberta-Base
class TranslationSentimentPipeline:
    def __init__(self, device=None):
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.translation_models = {}
        self.analyzer = None
        
    def load_models(self, sentiment_path='sentiment_analyzer.pt', tokenizer_path='sentiment_tokenizer'):
        # Load sentiment analyzer using the loader function
        self.analyzer = load_sentiment_model(sentiment_path, tokenizer_path)
        self.analyzer.model = self.analyzer.model.to(self.device)
        self.analyzer.model.eval()
        
        # Load translation models
        for lang in ['fr', 'de', 'ro']:
            model = T5ForConditionalGeneration.from_pretrained(f"./T5-{lang}-en").to(self.device)
            tokenizer = T5Tokenizer.from_pretrained(f"./T5-{lang}-en")
            model.eval()
            self.translation_models[lang] = (model, tokenizer)
    
    def translate_batch(self, texts, lang_code, batch_size=16):
        model, tokenizer = self.translation_models[lang_code]
        translations = []
        
        for i in tqdm(range(0, len(texts), batch_size), desc=f"Translating {lang_code}"):
            batch = texts[i:i+batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = model.generate(**inputs)
                decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
                translations.extend(decoded)
        
        return translations
    
    def predict_sentiment(self, texts, batch_size=16):
        return self.analyzer.predict(texts, batch_size)
    
    def evaluate_pipeline(self, df_original, df_translated, lang_code):
        results = {}

        print(f"\nEvaluating English texts:")
        eng_preds = self.predict_sentiment(df_original['text'].tolist())
        eng_report = classification_report(
            [(1 if t == 4 else 0) for t in df_original['polarity']],
            eng_preds, 
            output_dict=True
        )
        results['english'] = eng_report

        print(f"\nEvaluating {lang_code} translations:")
        trans_preds = self.predict_sentiment(df_translated['text'].tolist())
        trans_report = classification_report(
            [(1 if t == 4 else 0) for t in df_translated['polarity']],
            trans_preds, 
            output_dict=True
        )
        results[lang_code] = trans_report

        return results

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
eng_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_eng.csv')
fr_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_french.csv')
ro_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_romanian.csv')
de_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_german.csv')

# Initialize and load models
pipeline = TranslationSentimentPipeline(device=device)
pipeline.load_models()

# Evaluate each language
for lang, data in [('fr', fr_data), ('de', de_data), ('ro', ro_data)]:
    print(f"\nEvaluating {lang.upper()} translations")
    results = pipeline.evaluate_pipeline(eng_data, data, lang)
    
    for lang_code, report in results.items():
        print(f"\nResults for {lang_code}:")
        print(f"Accuracy: {report['accuracy']:.4f}")
        print(f"Weighted F1: {report['weighted avg']['f1-score']:.4f}")

Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Evaluating FR translations

Evaluating English texts:

Evaluating fr translations:

Results for english:
Accuracy: 0.9133
Weighted F1: 0.9133

Results for fr:
Accuracy: 0.6433
Weighted F1: 0.6429

Evaluating DE translations

Evaluating English texts:

Evaluating de translations:

Results for english:
Accuracy: 0.9133
Weighted F1: 0.9133

Results for de:
Accuracy: 0.6800
Weighted F1: 0.6723

Evaluating RO translations

Evaluating English texts:

Evaluating ro translations:

Results for english:
Accuracy: 0.9133
Weighted F1: 0.9133

Results for ro:
Accuracy: 0.6633
Weighted F1: 0.6620


In [9]:
# Roberta-Sentiment Model
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

class SentimentAnalyzer:
    def __init__(self, model_name="cardiffnlp/twitter-roberta-base-sentiment-latest", device=None):
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The model still expects 3 classes so we set the num_labels as 3
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=3
        ).to(self.device)

    def train(self, train_texts, train_labels, val_texts, val_labels, batch_size=16, epochs=3, learning_rate=1e-5):
        # Convert labels to 0 and 1 for binary classification
        train_labels = [0 if label == 0 else 1 for label in train_labels]
        val_labels = [0 if label == 0 else 1 for label in val_labels]
        
        train_dataset = SentimentDataset(train_texts, train_labels, self.tokenizer)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        val_dataset = SentimentDataset(val_texts, val_labels, self.tokenizer)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Calculate class weights for 2 classes
        class_counts = np.bincount(train_labels)
        class_weights = torch.FloatTensor(1.0 / class_counts).to(self.device)
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate, weight_decay=0.01)

        num_training_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_training_steps // 10,
            num_training_steps=num_training_steps
        )

        best_val_acc = 0
        best_model = None
        patience = 2  # For early stopping
        epochs_without_improvement = 0

        for epoch in range(epochs):
            self.model.train()
            total_loss = 0

            for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
                batch = {k: v.to(self.device) for k, v in batch.items()}

                optimizer.zero_grad()
                outputs = self.model(**batch)

                logits = outputs.logits
                labels = batch['labels']
                
                # Apply softmax to the logits and use only the probabilities for the positive and negative classes
                probs = torch.nn.functional.softmax(logits, dim=1)
                binary_probs = probs[:, [0, 2]]  # Assuming 0 is negative and 2 is positive
                
                loss = criterion(binary_probs, labels) # Calculate loss with binary probabilities

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

                total_loss += loss.item()

            # Validation
            val_metrics = self.evaluate(val_texts, val_labels, batch_size)
            val_acc = val_metrics["accuracy"]
            print(f'Epoch {epoch + 1} - Train Loss: {total_loss/len(train_loader):.4f}')
            print(f'Validation Accuracy: {val_acc:.4f}')
            print('Validation Report:')
            print(classification_report(
                val_labels,
                self.predict(val_texts, batch_size),
                labels=[0, 1],  # Binary classification
                target_names=['Negative', 'Positive']
            ))

            # Early stopping and model saving
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_model = copy.deepcopy(self.model.state_dict())
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1

            if epochs_without_improvement >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

        if best_model is not None:
            self.model.load_state_dict(best_model)

    def predict(self, texts, batch_size=32):
        self.model.eval()
        dataset = SentimentDataset(texts, [0] * len(texts), self.tokenizer)  # Dummy labels
        dataloader = DataLoader(dataset, batch_size=batch_size)

        predictions = []
        with torch.no_grad():
            for batch in dataloader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                logits = outputs.logits
                
                # Apply softmax and extract probabilities for the positive and negative classes
                probs = torch.nn.functional.softmax(logits, dim=1)
                binary_probs = probs[:, [0, 2]]  # Assuming 0 is negative and 2 is positive
                
                # Get predictions based on binary probabilities
                preds = torch.argmax(binary_probs, dim=1)
                
                predictions.extend(preds.cpu().numpy())

        return predictions

    def evaluate(self, texts, labels, batch_size=32):
        # Convert labels to 0 and 1 for evaluation
        labels = [0 if label == 0 else 1 for label in labels]

        predictions = self.predict(texts, batch_size)
        report = classification_report(labels, predictions, output_dict=True)
        return {
            "accuracy": report["accuracy"],
            "report": report
        }

# # Training 
# if __name__ == "__main__":
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     print(f"Using device: {device}")

#     # --- Rapid Experimentation with Pre-trained Sentiment Model ---

#     # 1. Prepare a smaller dataset (e.g., 20,000 samples) for quick testing
#     small_train_data, small_val_data, small_test_data = prepare_balanced_data(
#         'training.1600000.processed.noemoticon.csv',
#         total_samples=200000  # Reduce for faster experimentation
#     )

#     # 2. Instantiate SentimentAnalyzer with the pre-trained model
#     sentiment_analyzer = SentimentAnalyzer(
#         model_name="cardiffnlp/twitter-roberta-base-sentiment-latest",
#         device=device
#     )
    
#     # Convert labels to 0 and 1 in the small datasets
#     train_labels = [0 if label == 0 else 1 for label in small_train_data['target']]
#     val_labels = [0 if label == 0 else 1 for label in small_val_data['target']]
#     test_labels = [0 if label == 0 else 1 for label in small_test_data['target']]

#     # 3. Train for a few epochs (e.g., 2) with a small batch size
#     sentiment_analyzer.train(
#         train_texts=small_train_data['text'].tolist(),
#         train_labels=train_labels,
#         val_texts=small_val_data['text'].tolist(),
#         val_labels=val_labels,
#         batch_size=16,
#         epochs=5,  # Reduced for faster experimentation
#         learning_rate=1e-5
#     )

#     # 4. Evaluate on the small test set
#     test_metrics = sentiment_analyzer.evaluate(
#         small_test_data['text'].tolist(),
#         test_labels # Use converted labels for evaluation
#     )
#     print("\nTest Set Results (on small dataset):")
#     print(f"Accuracy: {test_metrics['accuracy']:.4f}")
#     print("\nClassification Report:")
#     print(test_metrics['report'])

#     # Save the model state dictionary and tokenizer
#     torch.save(sentiment_analyzer.model.state_dict(), 'sentiment_analyzer.pt')
#     sentiment_analyzer.tokenizer.save_pretrained('sentiment_tokenizer')

    # # --- (Optional) Full Training ---
    # # If the initial results are promising, you can train on the full dataset:
    # # train_data, val_data, test_data = prepare_balanced_data('training.1600000.processed.noemoticon.csv', total_samples=200000)
    # # sentiment_analyzer.train(...) # Train on the full data, potentially with more epochs

    # # --- Translation and Sentiment Pipeline ---

    # # Load data for translation pipeline
    # eng_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_eng.csv')
    # fr_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_french.csv')
    # ro_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_romanian.csv')
    # de_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_german.csv')

    # # Initialize and load models for the pipeline
    # pipeline = TranslationSentimentPipeline(device=device)
    # pipeline.load_models('sentiment_analyzer.pt', 'sentiment_tokenizer')

    # # Evaluate each language
    # for lang, data in [('fr', fr_data), ('de', de_data), ('ro', ro_data)]:
    #     print(f"\nEvaluating {lang.upper()} translations")
    #     results = pipeline.evaluate_pipeline(eng_data, data, lang)

    #     for lang_code, report in results.items():
    #         print(f"\nResults for {lang_code}:")
    #         print(f"Accuracy: {report['accuracy']:.4f}")
    #         print(f"Weighted F1: {report['weighted avg']['f1-score']:.4f}")

Using device: cuda
Loading data...
Class distribution in chunks:
First chunk: 0    200000
Name: target, dtype: int64
Second chunk: 4    800000
0    600000
Name: target, dtype: int64

Final dataset sizes:
Training: 140000 ({0: 70000, 4: 70000})
Validation: 30000 ({0: 15000, 4: 15000})
Test: 30000 ({0: 15000, 4: 15000})


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/5:   0%|          | 0/8750 [00:00<?, ?it/s]

Epoch 1 - Train Loss: 0.4599
Validation Accuracy: 0.8739
Validation Report:
              precision    recall  f1-score   support

    Negative       0.88      0.87      0.87     15000
    Positive       0.87      0.88      0.87     15000

    accuracy                           0.87     30000
   macro avg       0.87      0.87      0.87     30000
weighted avg       0.87      0.87      0.87     30000



Epoch 2/5:   0%|          | 0/8750 [00:00<?, ?it/s]

Epoch 2 - Train Loss: 0.4298
Validation Accuracy: 0.8769
Validation Report:
              precision    recall  f1-score   support

    Negative       0.89      0.86      0.87     15000
    Positive       0.86      0.89      0.88     15000

    accuracy                           0.88     30000
   macro avg       0.88      0.88      0.88     30000
weighted avg       0.88      0.88      0.88     30000



Epoch 3/5:   0%|          | 0/8750 [00:00<?, ?it/s]

Epoch 3 - Train Loss: 0.4148
Validation Accuracy: 0.8792
Validation Report:
              precision    recall  f1-score   support

    Negative       0.88      0.88      0.88     15000
    Positive       0.88      0.88      0.88     15000

    accuracy                           0.88     30000
   macro avg       0.88      0.88      0.88     30000
weighted avg       0.88      0.88      0.88     30000



Epoch 4/5:   0%|          | 0/8750 [00:00<?, ?it/s]

Epoch 4 - Train Loss: 0.4033
Validation Accuracy: 0.8832
Validation Report:
              precision    recall  f1-score   support

    Negative       0.88      0.88      0.88     15000
    Positive       0.88      0.88      0.88     15000

    accuracy                           0.88     30000
   macro avg       0.88      0.88      0.88     30000
weighted avg       0.88      0.88      0.88     30000



Epoch 5/5:   0%|          | 0/8750 [00:00<?, ?it/s]

Epoch 5 - Train Loss: 0.3931
Validation Accuracy: 0.8847
Validation Report:
              precision    recall  f1-score   support

    Negative       0.88      0.89      0.89     15000
    Positive       0.89      0.88      0.88     15000

    accuracy                           0.88     30000
   macro avg       0.88      0.88      0.88     30000
weighted avg       0.88      0.88      0.88     30000


Test Set Results (on small dataset):
Accuracy: 0.8878

Classification Report:
{'0': {'precision': 0.8845187731359069, 'recall': 0.8920666666666667, 'f1-score': 0.8882766861391396, 'support': 15000}, '1': {'precision': 0.8911377084454007, 'recall': 0.8835333333333333, 'f1-score': 0.8873192287091591, 'support': 15000}, 'accuracy': 0.8878, 'macro avg': {'precision': 0.8878282407906538, 'recall': 0.8877999999999999, 'f1-score': 0.8877979574241494, 'support': 30000}, 'weighted avg': {'precision': 0.8878282407906538, 'recall': 0.8878, 'f1-score': 0.8877979574241494, 'support': 30000}}


In [20]:
# Evaluate Roberta-Sentiment
class TranslationSentimentPipeline:
    def __init__(self, device=None):
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.translation_models = {}
        self.analyzer = None
        
    def load_models(self, sentiment_path='sentiment_analyzer.pt', tokenizer_path='sentiment_tokenizer', translation_model='t5'):
        self.analyzer = load_sentiment_model(sentiment_path, tokenizer_path)
        self.analyzer.model = self.analyzer.model.to(self.device)
        self.analyzer.model.eval()
        
        self.translation_model = translation_model
        
        for lang in ['fr', 'de', 'ro']:
            if self.translation_model == 't5':
                model = T5ForConditionalGeneration.from_pretrained(f"T5models/T5-{lang}-en").to(self.device)
                tokenizer = T5Tokenizer.from_pretrained(f"T5models/T5-{lang}-en")
            else:
                model = MarianMTModel.from_pretrained(f'./marianmtmodels/marian-mt-{lang}-finetuned').to(self.device)
                tokenizer = MarianTokenizer.from_pretrained(f'./marianmtmodels/marian-mt-{lang}-finetuned')
            model.eval()
            self.translation_models[lang] = (model, tokenizer)
    
    def translate_batch(self, texts, lang_code, batch_size=8):
        model, tokenizer = self.translation_models[lang_code]
        translations = []
        
        for i in tqdm(range(0, len(texts), batch_size)):
            batch = texts[i:i+batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=64)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=64, num_beams=2)
                decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
                translations.extend(decoded)
                torch.cuda.empty_cache()
        
        return translations
    
    def predict_sentiment(self, texts, batch_size=8):
        predictions = []
        
        for i in tqdm(range(0, len(texts), batch_size)):
            batch = texts[i:i+batch_size]
            inputs = self.analyzer.tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=64)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = self.analyzer.model(**inputs)
                preds = torch.argmax(outputs.logits, dim=1)
                predictions.extend(preds.tolist())
                torch.cuda.empty_cache()
        
        return predictions
    
    def evaluate_pipeline(self, df_translated, lang_code):
        translations = self.translate_batch(df_translated['text'].tolist(), lang_code)
        trans_preds = self.predict_sentiment(translations)

        trans_report = classification_report(
            [(1 if t == 4 else 0) for t in df_translated['polarity']],  # Changed from 'target' to 'polarity'
            trans_preds, 
            output_dict=True
        )

        return trans_report
    
# Load data 
eng_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_eng.csv')
fr_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_french.csv') 
de_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_german.csv')
ro_data = pd.read_csv('./sentiment_analysis_languages/sentiment140_romanian.csv')

# Initialize pipeline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline = TranslationSentimentPipeline(device=device)
pipeline.load_models()

# Evaluate each language
for lang, data in [('fr', fr_data), ('de', de_data), ('ro', ro_data)]:
    print(f"\nEvaluating {lang.upper()} translations")
    report = pipeline.evaluate_pipeline(data, lang)
    
    print(f"\nResults for {lang}:")
    print(f"Accuracy: {report['accuracy']:.4f}") 
    print(f"Weighted F1: {report['weighted avg']['f1-score']:.4f}")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary,


Evaluating FR translations


  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]


Results for fr:
Accuracy: 0.3600
Weighted F1: 0.3774

Evaluating DE translations


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]


Results for de:
Accuracy: 0.3633
Weighted F1: 0.3744

Evaluating RO translations


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]


Results for ro:
Accuracy: 0.3233
Weighted F1: 0.3366


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('roberta-base')
model.load_state_dict(torch.load('sentiment_analyzer.pt'))

print(model.config.architectures)
print(f"Number of labels: {model.num_labels}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['RobertaForMaskedLM']
Number of labels: 2
