In [23]:
import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    # AutoModelForSeq2SeqLM is used for summarization; AutoModelForCausalLM is used for text generation
    AutoModelForCausalLM,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup,
    pipeline
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import json
import time

In [25]:
# Fix the random seed of every RNG this notebook relies on (PyTorch first, then NumPy)
SEED = 42
for _seed_rng in (torch.manual_seed, np.random.seed):
    _seed_rng(SEED)

In [27]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that pairs each tokenized text with its class label.

    Every item is a dict with 1-D ``input_ids`` and ``attention_mask`` tensors
    (flattened tokenizer output, padded/truncated to ``max_len``) and a scalar
    ``label`` tensor of dtype ``torch.long``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # One sample per text entry
        return len(self.texts)

    def __getitem__(self, idx):
        # Entries are coerced to str before being handed to the tokenizer
        raw_text = str(self.texts[idx])
        raw_label = self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # Flatten the tokenizer's tensors so the DataLoader can stack them per batch
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(raw_label, dtype=torch.long)
        return sample

In [29]:
class TextSeq2SeqDataset(Dataset):
    """Map-style dataset of (source, target) text pairs for seq2seq training.

    Every item is a dict with 1-D ``input_ids`` / ``attention_mask`` tensors for
    the source text and a ``labels`` tensor holding the target token ids, in
    which padding positions are replaced by -100.
    """

    def __init__(self, source_texts, target_texts, tokenizer, max_source_len=512, max_target_len=128):
        self.source_texts, self.target_texts = source_texts, target_texts
        self.tokenizer = tokenizer
        self.max_source_len, self.max_target_len = max_source_len, max_target_len

    def __len__(self):
        # One sample per source text
        return len(self.source_texts)

    def _encode(self, text, max_length):
        """Tokenize ``text`` padded/truncated to exactly ``max_length`` tokens."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        src_text = str(self.source_texts[idx])
        tgt_text = str(self.target_texts[idx])

        src = self._encode(src_text, self.max_source_len)
        tgt = self._encode(tgt_text, self.max_target_len)

        # Padding positions in the target become -100 so they are ignored in the loss
        target_ids = tgt['input_ids'].flatten()
        target_ids[target_ids == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': src['input_ids'].flatten(),
            'attention_mask': src['attention_mask'].flatten(),
            'labels': target_ids
        }

In [31]:
class ModelTrainer:
    """Fine-tune pretrained transformer checkpoints for several NLP tasks.

    Each ``train_*`` method fine-tunes a checkpoint, scores it on validation
    data after every epoch, and writes the best-scoring model, its tokenizer
    and a ``training_metrics.json`` file to ``<models_dir>/<task>``.
    """

    # Maximum number of validation rows decoded per epoch by the generation-based
    # evaluations (summarization, translation); kept small to save time.
    MAX_VAL_SAMPLES = 100

    def __init__(self, models_dir='./models'):
        """Remember ``models_dir`` and create the directory if it is missing."""
        self.models_dir = models_dir
        os.makedirs(models_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers shared by the task-specific training methods
    # ------------------------------------------------------------------
    @staticmethod
    def _select_device():
        """Return the CUDA device when available, otherwise the CPU, and print the choice."""
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {device}")
        return device

    @staticmethod
    def _build_optimizer(model, steps_per_epoch, epochs, learning_rate):
        """Create an AdamW optimizer and a zero-warmup linear schedule over all training steps."""
        optimizer = AdamW(model.parameters(), lr=learning_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=steps_per_epoch * epochs
        )
        return optimizer, scheduler

    @staticmethod
    def _train_one_epoch(model, loader, optimizer, scheduler, device, label_key):
        """Run one optimisation pass over ``loader``; print and return the mean batch loss.

        ``label_key`` names the batch entry that holds the targets ('label' for
        classification batches, 'labels' for seq2seq batches).
        """
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(loader, desc="Training")

        for batch in progress_bar:
            optimizer.zero_grad()

            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch[label_key].to(device)
            )

            loss = outputs.loss
            batch_loss = loss.item()
            running_loss += batch_loss

            loss.backward()
            # Clip gradients to a max norm of 1.0 before the parameter update
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix({'loss': batch_loss})

        avg_train_loss = running_loss / len(loader)
        print(f"Average training loss: {avg_train_loss:.4f}")
        return avg_train_loss

    @staticmethod
    def _evaluate_classifier(model, loader, device):
        """Score ``model`` on ``loader``; return loss, accuracy and weighted precision/recall/F1."""
        model.eval()
        total_loss = 0.0
        predictions = []
        actual_labels = []

        with torch.no_grad():
            for batch in tqdm(loader, desc="Validation"):
                labels = batch['label'].to(device)
                outputs = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                    labels=labels
                )
                total_loss += outputs.loss.item()

                # Predicted class = index of the largest logit
                _, preds = torch.max(outputs.logits, dim=1)
                predictions.extend(preds.cpu().tolist())
                actual_labels.extend(labels.cpu().tolist())

        precision, recall, f1, _ = precision_recall_fscore_support(
            actual_labels, predictions, average='weighted'
        )
        # Cast to built-in floats so the metrics are always JSON serialisable
        return {
            'loss': total_loss / len(loader),
            'accuracy': float(accuracy_score(actual_labels, predictions)),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1)
        }

    @staticmethod
    def _generate_texts(model, tokenizer, sources, device, max_input_len, **generate_kwargs):
        """Decode one generated string per entry of ``sources``.

        Each source is tokenized on its own (truncated to ``max_input_len``) and
        passed to ``model.generate`` together with ``generate_kwargs``.
        """
        model.eval()
        generated = []

        with torch.no_grad():
            for source in tqdm(sources, total=len(sources), desc="Validation"):
                inputs = tokenizer(source, return_tensors="pt", max_length=max_input_len, truncation=True)
                output_ids = model.generate(
                    input_ids=inputs.input_ids.to(device),
                    attention_mask=inputs.attention_mask.to(device),
                    **generate_kwargs
                )
                generated.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

        return generated

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values`` as a float, or 0.0 for an empty list."""
        return float(np.mean(values)) if values else 0.0

    @staticmethod
    def _save_checkpoint(model, tokenizer, save_dir, metrics):
        """Write the model, the tokenizer and ``training_metrics.json`` into ``save_dir``."""
        os.makedirs(save_dir, exist_ok=True)

        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)

        with open(f"{save_dir}/training_metrics.json", 'w') as f:
            json.dump(metrics, f)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def train_classification_model(self, train_df, val_df, num_labels=2, model_name='distilbert-base-uncased',
                                   epochs=3, batch_size=16, learning_rate=2e-5):
        """Train a text classification model.

        Args:
            train_df: DataFrame with ``text`` and ``label`` columns used for training.
            val_df: DataFrame with the same columns, scored after every epoch.
            num_labels: Number of target classes passed to the model head.
            model_name: Checkpoint name or path given to ``from_pretrained``.
            epochs: Number of passes over ``train_df``.
            batch_size: Batch size of the training and validation loaders.
            learning_rate: AdamW learning rate.

        Returns:
            The path ``<models_dir>/classification``. The checkpoint with the best
            validation accuracy is stored there; the first evaluated epoch is
            always saved, so the directory is populated whenever ``epochs >= 1``.
        """
        print(f"Training classification model: {model_name}")
        device = self._select_device()

        # Initialize tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )
        model.to(device)

        # Create data loaders
        train_loader = DataLoader(
            TextClassificationDataset(
                texts=train_df['text'].tolist(),
                labels=train_df['label'].tolist(),
                tokenizer=tokenizer
            ),
            batch_size=batch_size,
            shuffle=True
        )
        val_loader = DataLoader(
            TextClassificationDataset(
                texts=val_df['text'].tolist(),
                labels=val_df['label'].tolist(),
                tokenizer=tokenizer
            ),
            batch_size=batch_size
        )

        optimizer, scheduler = self._build_optimizer(model, len(train_loader), epochs, learning_rate)

        # Defined before the loop so the return value exists even if no epoch improves
        model_save_dir = f"{self.models_dir}/classification"
        best_val_accuracy = 0
        checkpoint_saved = False
        train_losses = []
        val_metrics = []

        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}/{epochs}")

            train_losses.append(
                self._train_one_epoch(model, train_loader, optimizer, scheduler, device, 'label')
            )

            epoch_metrics = self._evaluate_classifier(model, val_loader, device)
            val_metrics.append(epoch_metrics)
            val_accuracy = epoch_metrics['accuracy']

            print(f"Validation Loss: {epoch_metrics['loss']:.4f}")
            print(f"Accuracy: {val_accuracy:.4f}")
            print(f"Precision: {epoch_metrics['precision']:.4f}")
            print(f"Recall: {epoch_metrics['recall']:.4f}")
            print(f"F1 Score: {epoch_metrics['f1']:.4f}")

            # Save on the first epoch and on every later improvement
            if not checkpoint_saved or val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                checkpoint_saved = True
                print(f"Saving best model with accuracy: {best_val_accuracy:.4f}")

                self._save_checkpoint(model, tokenizer, model_save_dir, {
                    'train_losses': train_losses,
                    'val_metrics': val_metrics,
                    'best_val_accuracy': best_val_accuracy,
                    'epochs_trained': epoch + 1,
                    'model_name': model_name,
                    'num_labels': num_labels
                })

        print(f"Classification model training complete. Best accuracy: {best_val_accuracy:.4f}")
        return model_save_dir

    def train_summarization_model(self, train_df, val_df, model_name='t5-small',
                                  epochs=3, batch_size=8, learning_rate=5e-5):
        """Train a text summarization model.

        Args:
            train_df: DataFrame with ``document`` and ``summary`` columns used for training.
            val_df: DataFrame with the same columns; at most ``MAX_VAL_SAMPLES``
                leading rows are summarized and scored with ROUGE after every epoch.
            model_name: Checkpoint name or path given to ``from_pretrained``.
            epochs: Number of passes over ``train_df``.
            batch_size: Batch size of the training loader.
            learning_rate: AdamW learning rate.

        Returns:
            The path ``<models_dir>/summarization``. The checkpoint with the best
            mean of ROUGE-1/2/L F-measures is stored there; the first evaluated
            epoch is always saved. The input DataFrames are not modified.
        """
        print(f"Training summarization model: {model_name}")
        device = self._select_device()

        # Initialize tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        model.to(device)

        # For T5, we need to add a prefix. Prefixed copies are built locally so the
        # caller's DataFrames are left untouched.
        train_sources = [f"summarize: {doc}" for doc in train_df['document'].tolist()]
        val_sources = [
            f"summarize: {doc}" for doc in val_df['document'].tolist()[:self.MAX_VAL_SAMPLES]
        ]
        val_references = [str(ref) for ref in val_df['summary'].tolist()[:self.MAX_VAL_SAMPLES]]

        train_loader = DataLoader(
            TextSeq2SeqDataset(
                source_texts=train_sources,
                target_texts=train_df['summary'].tolist(),
                tokenizer=tokenizer,
                max_source_len=512,
                max_target_len=128
            ),
            batch_size=batch_size,
            shuffle=True
        )

        optimizer, scheduler = self._build_optimizer(model, len(train_loader), epochs, learning_rate)

        # The scorer does not depend on the epoch, so build it once
        rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

        # Defined before the loop so the return value exists even if no epoch improves
        model_save_dir = f"{self.models_dir}/summarization"
        best_val_rouge = 0
        checkpoint_saved = False
        train_losses = []
        val_metrics = []

        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}/{epochs}")

            train_losses.append(
                self._train_one_epoch(model, train_loader, optimizer, scheduler, device, 'labels')
            )

            # Validation - evaluate with ROUGE
            generated_summaries = self._generate_texts(
                model, tokenizer, val_sources, device, max_input_len=512,
                max_length=128,
                min_length=30,
                no_repeat_ngram_size=3,
                early_stopping=True
            )

            # RougeScorer.score expects (target, prediction). The F-measure used
            # below is symmetric, so it is unaffected by the argument order.
            rouge_scores = [
                rouge.score(reference, prediction)
                for reference, prediction in zip(val_references, generated_summaries)
            ]

            avg_rouge1 = self._mean([score['rouge1'].fmeasure for score in rouge_scores])
            avg_rouge2 = self._mean([score['rouge2'].fmeasure for score in rouge_scores])
            avg_rougeL = self._mean([score['rougeL'].fmeasure for score in rouge_scores])
            avg_rouge = (avg_rouge1 + avg_rouge2 + avg_rougeL) / 3

            val_metrics.append({
                'rouge1': avg_rouge1,
                'rouge2': avg_rouge2,
                'rougeL': avg_rougeL,
                'avg_rouge': avg_rouge
            })

            print(f"ROUGE-1: {avg_rouge1:.4f}")
            print(f"ROUGE-2: {avg_rouge2:.4f}")
            print(f"ROUGE-L: {avg_rougeL:.4f}")
            print(f"Avg ROUGE: {avg_rouge:.4f}")

            # Save on the first epoch and on every later improvement
            if not checkpoint_saved or avg_rouge > best_val_rouge:
                best_val_rouge = avg_rouge
                checkpoint_saved = True
                print(f"Saving best model with avg ROUGE: {best_val_rouge:.4f}")

                self._save_checkpoint(model, tokenizer, model_save_dir, {
                    'train_losses': train_losses,
                    'val_metrics': val_metrics,
                    'best_val_rouge': best_val_rouge,
                    'epochs_trained': epoch + 1,
                    'model_name': model_name
                })

        print(f"Summarization model training complete. Best avg ROUGE: {best_val_rouge:.4f}")
        return model_save_dir

    def train_translation_model(self, train_df, val_df, model_name='t5-small',
                                source_lang='de', target_lang='en',
                                epochs=3, batch_size=8, learning_rate=5e-5):
        """Train a machine translation model.

        Args:
            train_df: DataFrame with ``source_text`` and ``target_text`` columns used for training.
            val_df: DataFrame with the same columns; at most ``MAX_VAL_SAMPLES``
                leading rows are translated and scored with BLEU after every epoch.
            model_name: T5 checkpoint name or path given to ``from_pretrained``.
            source_lang: Source language label inserted into the task prefix.
            target_lang: Target language label inserted into the task prefix.
            epochs: Number of passes over ``train_df``.
            batch_size: Batch size of the training loader.
            learning_rate: AdamW learning rate.

        Returns:
            The path ``<models_dir>/translation``. The checkpoint with the best
            mean sentence-level BLEU is stored there; the first evaluated epoch
            is always saved. The input DataFrames are not modified, so calling
            the method twice on the same frames does not stack the prefix.
        """
        print(f"Training translation model: {model_name} ({source_lang} to {target_lang})")
        device = self._select_device()

        # Initialize tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name)
        model.to(device)

        # Prepare text with language prefix for T5 without writing back into the frames
        prefix = f"translate {source_lang} to {target_lang}: "
        train_sources = [f"{prefix}{text}" for text in train_df['source_text'].tolist()]
        val_sources = [
            f"{prefix}{text}" for text in val_df['source_text'].tolist()[:self.MAX_VAL_SAMPLES]
        ]
        val_references = [str(ref) for ref in val_df['target_text'].tolist()[:self.MAX_VAL_SAMPLES]]

        train_loader = DataLoader(
            TextSeq2SeqDataset(
                source_texts=train_sources,
                target_texts=train_df['target_text'].tolist(),
                tokenizer=tokenizer,
                max_source_len=128,
                max_target_len=128
            ),
            batch_size=batch_size,
            shuffle=True
        )

        optimizer, scheduler = self._build_optimizer(model, len(train_loader), epochs, learning_rate)

        # The smoothing function does not depend on the epoch, so build it once
        smooth = SmoothingFunction().method1

        # Defined before the loop so the return value exists even if no epoch improves
        model_save_dir = f"{self.models_dir}/translation"
        best_val_bleu = 0
        checkpoint_saved = False
        train_losses = []
        val_metrics = []

        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}/{epochs}")

            train_losses.append(
                self._train_one_epoch(model, train_loader, optimizer, scheduler, device, 'labels')
            )

            # Validation - evaluate with BLEU score
            translations = self._generate_texts(
                model, tokenizer, val_sources, device, max_input_len=128,
                max_length=128,
                num_beams=4,
                early_stopping=True
            )

            # Sentence-level BLEU on whitespace tokens; a failing pair is reported and skipped
            bleu_scores = []
            for prediction, reference in zip(translations, val_references):
                try:
                    bleu_scores.append(
                        sentence_bleu(
                            [reference.split()],
                            prediction.split(),
                            smoothing_function=smooth
                        )
                    )
                except Exception as e:
                    print(f"Error computing BLEU: {e}")
                    continue

            avg_bleu = self._mean(bleu_scores)
            val_metrics.append({'bleu': avg_bleu})

            print(f"BLEU Score: {avg_bleu:.4f}")

            # Save on the first epoch and on every later improvement
            if not checkpoint_saved or avg_bleu > best_val_bleu:
                best_val_bleu = avg_bleu
                checkpoint_saved = True
                print(f"Saving best model with BLEU: {best_val_bleu:.4f}")

                self._save_checkpoint(model, tokenizer, model_save_dir, {
                    'train_losses': train_losses,
                    'val_metrics': val_metrics,
                    'best_val_bleu': best_val_bleu,
                    'epochs_trained': epoch + 1,
                    'model_name': model_name,
                    'source_lang': source_lang,
                    'target_lang': target_lang
                })

        print(f"Translation model training complete. Best BLEU: {best_val_bleu:.4f}")
        return model_save_dir

    def setup_text_generation_model(self, model_name='gpt2', save_dir=None):
        """Set up a pre-trained causal language model for text generation.

        No fine-tuning happens here; the pre-trained checkpoint is used directly.

        Args:
            model_name: Checkpoint name or path given to ``from_pretrained``.
            save_dir: Acts only as an on/off switch. When truthy, the model and
                tokenizer are saved to ``<models_dir>/generation``; the value
                itself is not used as a path.

        Returns:
            ``<models_dir>/generation`` when the model was saved, otherwise
            ``model_name`` unchanged.
        """
        print(f"Setting up text generation model: {model_name}")

        # Initialize tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Use AutoModelForCausalLM for text generation models like GPT-2
        model = AutoModelForCausalLM.from_pretrained(model_name)

        if not save_dir:
            return model_name

        model_save_dir = f"{self.models_dir}/generation"
        os.makedirs(model_save_dir, exist_ok=True)

        model.save_pretrained(model_save_dir)
        tokenizer.save_pretrained(model_save_dir)

        # Record which checkpoint the saved files came from
        with open(f"{model_save_dir}/model_info.json", 'w') as f:
            json.dump({'model_name': model_name}, f)

        print(f"Text generation model saved to {model_save_dir}")
        return model_save_dir

In [41]:
if __name__ == "__main__":
    import argparse
    import sys

    # A single parser is the only place where the CLI defaults are declared
    parser = argparse.ArgumentParser(description='Train NLP models')
    parser.add_argument('--task', type=str, choices=['classification', 'summarization', 'translation', 'generation', 'all'],
                        default='all', help='NLP task to train model for')
    parser.add_argument('--data_dir', type=str, default='./data', help='Directory with preprocessed data')
    parser.add_argument('--models_dir', type=str, default='./models', help='Directory to save trained models')
    parser.add_argument('--epochs', type=int, default=3, help='Number of training epochs')

    # When an ipykernel is loaded (Jupyter or similar) parse an empty argument
    # list, which yields the defaults above; otherwise parse the real command line.
    args = parser.parse_args([] if 'ipykernel' in sys.modules else None)

    trainer = ModelTrainer(models_dir=args.models_dir)
    run_all = args.task == 'all'

    # Each task loads its own train/validation CSV pair from data_dir
    if run_all or args.task == 'classification':
        print("\n=== Training Classification Model ===")
        train_df = pd.read_csv(f"{args.data_dir}/classification_train.csv")
        val_df = pd.read_csv(f"{args.data_dir}/classification_val.csv")
        trainer.train_classification_model(train_df, val_df, epochs=args.epochs)

    if run_all or args.task == 'summarization':
        print("\n=== Training Summarization Model ===")
        train_df = pd.read_csv(f"{args.data_dir}/summarization_train.csv")
        val_df = pd.read_csv(f"{args.data_dir}/summarization_val.csv")
        trainer.train_summarization_model(train_df, val_df, epochs=args.epochs)

    if run_all or args.task == 'translation':
        print("\n=== Training Translation Model ===")
        train_df = pd.read_csv(f"{args.data_dir}/translation_train.csv")
        val_df = pd.read_csv(f"{args.data_dir}/translation_val.csv")
        # The language pair is read from the first training row
        trainer.train_translation_model(
            train_df, val_df,
            source_lang=train_df['source_lang'].iloc[0],
            target_lang=train_df['target_lang'].iloc[0],
            epochs=args.epochs
        )

    if run_all or args.task == 'generation':
        print("\n=== Setting Up Text Generation Model ===")
        trainer.setup_text_generation_model(save_dir=args.models_dir)

    print("\nTraining complete! Models saved to", args.models_dir)


=== Setting Up Text Generation Model ===
Setting up text generation model: gpt2


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

ValueError: Unrecognized configuration class <class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'> for this kind of AutoModel: AutoModelForSeq2SeqLM.
Model type should be one of BartConfig, BigBirdPegasusConfig, BlenderbotConfig, BlenderbotSmallConfig, EncoderDecoderConfig, FSMTConfig, GPTSanJapaneseConfig, LEDConfig, LongT5Config, M2M100Config, MarianConfig, MBartConfig, MT5Config, MvpConfig, NllbMoeConfig, PegasusConfig, PegasusXConfig, PLBartConfig, ProphetNetConfig, Qwen2AudioConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, SwitchTransformersConfig, T5Config, UMT5Config, XLMProphetNetConfig.