In [None]:
# Generate the translation samples for training 
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from tqdm.auto import tqdm
import torch
import os

# --- Configuration ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Translation Function ---

def translate_text(text, model, tokenizer, max_length=128):
    """Translates text using the specified MarianMT model and tokenizer."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to GPU

    with torch.no_grad():
        translated = model.generate(**inputs)

    decoded = tokenizer.decode(translated[0], skip_special_tokens=True)
    return decoded

# --- Main Execution ---

if __name__ == "__main__":
    # --- Create the output directory if it doesn't exist ---
    output_dir = "./improved_datasets"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # --- Load Sentiment140 Dataset ---
    print("Loading Sentiment140 dataset...")
    data_path = "training.1600000.processed.noemoticon.csv"  # Replace with path
    columns = ['target', 'id', 'date', 'flag', 'user', 'text']
    df = pd.read_csv(data_path, encoding='latin-1', names=columns)

    # --- Translation ---
    target_languages = {
        'fr': 'Helsinki-NLP/opus-mt-en-fr',  # French
        'de': 'Helsinki-NLP/opus-mt-en-de',  # German
        'ro': 'Helsinki-NLP/opus-mt-en-ro',   # Romanian
    }

    batch_size = 16  # Adjust based on GPU memory
    num_tweets_to_translate = 3000  # Set the number of tweets to translate per language
    num_positive = num_tweets_to_translate // 2  # Number of positive tweets to translate
    num_negative = num_tweets_to_translate - num_positive  # Number of negative tweets to translate

    for lang_code, model_name in target_languages.items():
        print(f"\nTranslating to {lang_code} using {model_name}...")

        # Load model and tokenizer
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name).to(device)
        model.eval()  # Set the model to evaluation mode

        # --- Get balanced sample of positive and negative tweets ---
        df_positive = df[df['target'] == 4].sample(n=num_positive, random_state=42)  # 4 represents positive sentiment
        df_negative = df[df['target'] == 0].sample(n=num_negative, random_state=42)  # 0 represents negative sentiment
        df_sample = pd.concat([df_positive, df_negative])

        translated_texts = []

        # Limit the loop to process only the desired number of tweets
        for i in tqdm(range(0, len(df_sample), batch_size)):
            batch = df_sample['text'][i:i + batch_size].tolist()

            # Translate the batch of texts
            batch_translations = [translate_text(text, model, tokenizer) for text in batch]
            translated_texts.extend(batch_translations)

        # Create a new DataFrame for this language
        df_lang = df_sample.copy()
        df_lang[f'text_{lang_code}'] = translated_texts

        # Save the DataFrame with translations to a new CSV file
        output_file = f"sentiment140_translated_{lang_code}.csv"
        df_lang.to_csv(f"./improved_datasets/{output_file}", index=False)
        print(f"Translations to {lang_code} saved to {output_file}")

    print("Translation complete!")

Loading Sentiment140 dataset...

Translating to fr using Helsinki-NLP/opus-mt-en-fr...




  0%|          | 0/188 [00:00<?, ?it/s]

Translations to fr saved to sentiment140_translated_fr.csv

Translating to de using Helsinki-NLP/opus-mt-en-de...




  0%|          | 0/188 [00:00<?, ?it/s]

Translations to de saved to sentiment140_translated_de.csv

Translating to ro using Helsinki-NLP/opus-mt-en-ro...




  0%|          | 0/188 [00:00<?, ?it/s]

Translations to ro saved to sentiment140_translated_ro.csv
Translation complete!


In [4]:
# Generate the translation samples using the OpenAI API for more accurate translations
import openai
import pandas as pd
from tqdm import tqdm
import time


def translate_with_openai(texts, target_lang, batch_size=20):
    translations = []
    system_prompt = f"You are a professional translator. Translate the following English texts into {target_lang}. Return only the translations, one per line."

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        user_prompt = "\n".join(batch)

        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.0
            )
            batch_translations = response.choices[0].message.content.strip().split('\n')
            batch_translations = [t.strip() for t in batch_translations if t.strip()]
            batch_translations = batch_translations[:len(batch)]
            while len(batch_translations) < len(batch):
                batch_translations.append("")

            translations.extend(batch_translations)

        except Exception as e:
            print(f"Error: {e}")
            translations.extend([""] * len(batch))

        time.sleep(0.1)  # To avoid rate limit issues

    return translations


def main():
    # Load your dataset
    data = pd.read_csv(
        'training.1600000.processed.noemoticon.csv', 
        encoding='latin-1',
        names=['target', 'id', 'date', 'flag', 'user', 'text']
    )

    # Filter out positive and negative samples only
    positive_data = data[data['target'] == 4]
    negative_data = data[data['target'] == 0]

    # Sample equal numbers from positive and negative
    pos_sample = positive_data.sample(n=500, random_state=42)
    neg_sample = negative_data.sample(n=500, random_state=42)

    balanced_data = pd.concat([pos_sample, neg_sample]).sample(frac=1, random_state=42).reset_index(drop=True)

    # Translate to multiple languages
    lang_map = {
        'fr': 'French',
        'de': 'German',
        'ro': 'Romanian'
    }

    for lang_code, lang_name in lang_map.items():
        translations = translate_with_openai(
            balanced_data['text'].tolist(),
            lang_name
        )

        # Save translations
        pd.DataFrame({
            'text': balanced_data['text'],
            'translation': translations,
            'target': balanced_data['target']
        }).to_csv(f'openai_translations_{lang_code}.csv', index=False)

if __name__ == "__main__":
    main()


100%|██████████| 50/50 [04:46<00:00,  5.73s/it]
100%|██████████| 50/50 [04:09<00:00,  4.99s/it]
100%|██████████| 50/50 [05:01<00:00,  6.02s/it]


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import MarianMTModel, MarianTokenizer, AdamW, get_linear_schedule_with_warmup
import pandas as pd
from tqdm.auto import tqdm
import sacrebleu

class TranslationDataset(Dataset):
    def __init__(self, source_texts, target_texts, tokenizer, max_length=128):
        self.source_encodings = tokenizer(source_texts, truncation=True, padding=True, max_length=max_length)
        self.target_encodings = tokenizer(target_texts, truncation=True, padding=True, max_length=max_length)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.source_encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.source_encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.target_encodings['input_ids'][idx])
        }

    def __len__(self):
        return len(self.source_encodings['input_ids'])

def train_translator(lang_code, train_data, val_data=None, batch_size=8, epochs=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_name = f'Helsinki-NLP/opus-mt-en-{lang_code}'
    
    # Load model and tokenizer
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)
    
    # Prepare datasets
    train_dataset = TranslationDataset(
        train_data['text'].tolist(),
        train_data['translation'].tolist(),
        tokenizer
    )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    if val_data is not None:
        val_dataset = TranslationDataset(
            val_data['text'].tolist(),
            val_data['translation'].tolist(),
            tokenizer
        )
        val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
    # Training setup
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=total_steps // 10,
        num_training_steps=total_steps
    )
    
    best_loss = float('inf')
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        
        for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}')
        
        if avg_loss < best_loss:
            best_loss = avg_loss
            model.save_pretrained(f'./marianmtmodels/marian-mt-{lang_code}-finetuned')
            tokenizer.save_pretrained(f'./marianmtmodels/marian-mt-{lang_code}-finetuned')

def main():
    data = pd.read_csv(
        'training.1600000.processed.noemoticon.csv', 
        encoding='latin-1',
        names=['target', 'id', 'date', 'flag', 'user', 'text']
    )

    positive_data = data[data['target'] == 4]
    negative_data = data[data['target'] == 0]

    pos_sample = positive_data.sample(n=500, random_state=42)
    neg_sample = negative_data.sample(n=500, random_state=42)

    balanced_data = pd.concat([pos_sample, neg_sample]).sample(frac=1, random_state=42).reset_index(drop=True)

    # Ensure your CSV with translations has the columns 'text' and 'translation'
    # and that 'translation' column actually contains strings.
    for lang_code in ['fr', 'de', 'ro']:
        df = pd.read_csv(f'./improved_datasets/openai_translations_{lang_code}.csv')

        # Clean data: Ensure strings and drop NaNs
        df['text'] = df['text'].astype(str)
        df['translation'] = df['translation'].astype(str)
        df = df.dropna(subset=['text', 'translation'])

        train_size = int(0.9 * len(df))
        train_data = df[:train_size]
        val_data = df[train_size:]

        # Convert columns to string just to be safe
        train_data['text'] = train_data['text'].astype(str)
        train_data['translation'] = train_data['translation'].astype(str)

        val_data['text'] = val_data['text'].astype(str)
        val_data['translation'] = val_data['translation'].astype(str)

        train_translator(lang_code, train_data, val_data)

main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['text'] = train_data['text'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['translation'] = train_data['translation'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['text'] = val_data['text'].astype(str)
A value is trying to be set on a copy o

Epoch 1/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 1 - Average Loss: 2.0298


Epoch 2/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 2 - Average Loss: 1.0142


Epoch 3/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 3 - Average Loss: 0.8393


Epoch 4/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 4 - Average Loss: 0.7484


Epoch 5/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 5 - Average Loss: 0.7045


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['text'] = train_data['text'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['translation'] = train_data['translation'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['text'] = val_data['text'].astype(str)
A value is trying to be set on a copy o

Epoch 1/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 1 - Average Loss: 2.5345


Epoch 2/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 2 - Average Loss: 1.2689


Epoch 3/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 3 - Average Loss: 1.0368


Epoch 4/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 4 - Average Loss: 0.9256


Epoch 5/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 5 - Average Loss: 0.8699


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['text'] = train_data['text'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['translation'] = train_data['translation'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['text'] = val_data['text'].astype(str)
A value is trying to be set on a copy o

Epoch 1/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 1 - Average Loss: 2.3184


Epoch 2/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 2 - Average Loss: 0.7894


Epoch 3/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 3 - Average Loss: 0.6396


Epoch 4/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 4 - Average Loss: 0.5631


Epoch 5/5:   0%|          | 0/113 [00:00<?, ?it/s]

Epoch 5 - Average Loss: 0.5315
