In [2]:
import pandas as pd
import numpy as np
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import os
import json

In [4]:
# Download necessary NLTK resources.
# 'punkt' is kept for older NLTK installs; newer NLTK releases load the Punkt
# tokenizer models from the 'punkt_tab' package instead, and word_tokenize()
# raises LookupError when it is missing, so both are fetched.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
# Kept last so the value displayed by the cell is unchanged.
nltk.download('wordnet', quiet=True)

True

In [6]:
class DataProcessor:
    """
    Class for data collection and preprocessing for multiple NLP tasks.

    The ``load_*`` methods fetch a dataset with ``load_dataset``, convert the
    train/validation/test splits to pandas DataFrames and write them as CSV
    files into ``cache_dir``. ``preprocess_text`` cleans a single text value
    according to the target task.
    """

    # Numeric IMDB labels and the sentiment names they are mapped to.
    _IMDB_LABEL_MAP = {0: 'negative', 1: 'positive'}

    # Maximum number of rows kept per split for summarization/translation.
    _MAX_TRAIN_SAMPLES = 1000
    _MAX_EVAL_SAMPLES = 200

    # Tiny (German, English) corpus used when the translation dataset
    # cannot be loaded.
    _DEMO_DE_EN_PAIRS = (
        ("Hallo, wie geht es dir?", "Hello, how are you?"),
        ("Ich liebe Programmierung.", "I love programming."),
        ("Das Wetter ist heute schön.", "The weather is nice today."),
        ("Können Sie mir bitte helfen?", "Can you please help me?"),
        ("Vielen Dank für Ihre Hilfe.", "Thank you very much for your help."),
    )

    # Lazily filled cache of English stop words (see _english_stop_words).
    _stop_words = None

    def __init__(self, cache_dir='./data'):
        """
        Remember the output directory and create it if it does not exist.

        Args:
            cache_dir: Directory that receives the generated CSV files.
        """
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _save_splits(self, prefix, train_df, val_df, test_df):
        """Write the splits to ``<cache_dir>/<prefix>_{train,val,test}.csv``."""
        for split_name, frame in (('train', train_df), ('val', val_df), ('test', test_df)):
            target = os.path.join(self.cache_dir, f"{prefix}_{split_name}.csv")
            frame.to_csv(target, index=False)

    @staticmethod
    def _head(dataset_split, limit):
        """Return at most ``limit`` leading rows of a dataset split."""
        return dataset_split.select(range(min(limit, len(dataset_split))))

    @classmethod
    def _imdb_frame(cls, data):
        """Build a text/label/sentiment DataFrame from one IMDB split."""
        frame = pd.DataFrame({'text': data['text'], 'label': data['label']})
        # Map numeric labels to string labels for clarity
        frame['sentiment'] = frame['label'].map(cls._IMDB_LABEL_MAP)
        return frame

    @staticmethod
    def _summarization_frame(samples):
        """Build a document/summary DataFrame from 'article'/'highlights'."""
        return pd.DataFrame({
            'document': samples['article'],
            'summary': samples['highlights']
        })

    @staticmethod
    def _translation_frame(text_pairs, source_lang, target_lang):
        """Build a translation DataFrame from (source, target) text pairs."""
        text_pairs = list(text_pairs)
        return pd.DataFrame({
            'source_text': [pair[0] for pair in text_pairs],
            'target_text': [pair[1] for pair in text_pairs],
            'source_lang': source_lang,
            'target_lang': target_lang
        })

    def _demo_translation_data(self, language_pair):
        """
        Build, save and return the built-in demo translation dataset.

        The demo corpus is German/English. It is returned as en->de when
        'en-de' was requested and as de->en for every other request.
        """
        requested = str(language_pair).strip().lower()
        if requested == 'en-de':
            # Honour the requested direction instead of always returning de->en.
            source_lang, target_lang = 'en', 'de'
            pairs = [(english, german) for german, english in self._DEMO_DE_EN_PAIRS]
        else:
            source_lang, target_lang = 'de', 'en'
            pairs = list(self._DEMO_DE_EN_PAIRS)
            if requested != 'de-en':
                print(f"Demo data is only available for de/en; ignoring requested pair: {language_pair}")

        # Split into train/val/test
        train_df = self._translation_frame(pairs[:3], source_lang, target_lang)
        val_df = self._translation_frame(pairs[3:4], source_lang, target_lang)
        test_df = self._translation_frame(pairs[4:], source_lang, target_lang)

        self._save_splits('translation', train_df, val_df, test_df)

        print(f"Demo translation data saved. Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

        return {
            'train': train_df,
            'val': val_df,
            'test': test_df,
            'source_lang': source_lang,
            'target_lang': target_lang
        }

    @classmethod
    def _english_stop_words(cls):
        """Return the English stop-word set, loading it on first use only."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english'))
        return cls._stop_words

    @staticmethod
    def _as_text(value):
        """Convert a value to str, mapping None / NaN / pd.NA to ''."""
        is_nan = isinstance(value, float) and value != value
        if value is None or value is pd.NA or is_nan:
            # str() would otherwise inject the literal words "None"/"nan".
            return ''
        return str(value)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def load_classification_data(self, dataset_name='imdb', split=['train', 'test']):
        """
        Load and preprocess data for text classification task.

        Only the 'imdb' dataset is handled. 10% of its training split is
        sampled (random_state=42) as the validation split.

        Args:
            dataset_name: Dataset identifier passed to ``load_dataset``.
            split: Currently unused; kept for interface compatibility.

        Returns:
            A dict with 'train', 'val', 'test' DataFrames, 'label_map' and
            'num_labels', or None when loading fails or the dataset is not
            supported (the reason is printed).
        """
        try:
            print(f"Loading classification dataset: {dataset_name}")

            # Fail with a message before downloading a dataset that would
            # otherwise be fetched and then silently ignored.
            if dataset_name != 'imdb':
                raise ValueError(
                    f"Unsupported classification dataset '{dataset_name}'; only 'imdb' is handled."
                )

            # Load dataset using Hugging Face datasets
            dataset = load_dataset(dataset_name)
            train_data = dataset["train"]
            test_data = dataset["test"]

            # Check if data is empty
            if not train_data or not test_data:
                raise ValueError("Loaded dataset is empty.")

            train_df = self._imdb_frame(train_data)
            test_df = self._imdb_frame(test_data)

            # Prepare train/val/test split
            val_df = train_df.sample(frac=0.1, random_state=42)
            train_df = train_df.drop(val_df.index)

            # Save processed data
            self._save_splits('classification', train_df, val_df, test_df)

            print(f"Classification data saved. Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

            return {
                'train': train_df,
                'val': val_df,
                'test': test_df,
                # Copy so callers cannot mutate the class-level mapping.
                'label_map': dict(self._IMDB_LABEL_MAP),
                'num_labels': 2
            }
        except Exception as e:
            print(f"Error loading classification dataset: {e}")
            return None

    def load_summarization_data(self, dataset_name='cnn_dailymail', version='3.0.0', split=['train', 'validation', 'test']):
        """
        Load and preprocess data for text summarization task.

        Keeps at most 1000 training rows and 200 validation/test rows.

        Args:
            dataset_name: Dataset identifier passed to ``load_dataset``.
            version: Second positional argument passed to ``load_dataset``.
            split: Currently unused; kept for interface compatibility.

        Returns:
            A dict with 'train', 'val' and 'test' DataFrames (columns
            'document' and 'summary'), or None when loading fails.
        """
        try:
            print(f"Loading summarization dataset: {dataset_name}")

            # Load dataset using Hugging Face datasets
            dataset = load_dataset(dataset_name, version)

            # Extract a bounded sample from each split and convert it
            train_df = self._summarization_frame(self._head(dataset['train'], self._MAX_TRAIN_SAMPLES))
            val_df = self._summarization_frame(self._head(dataset['validation'], self._MAX_EVAL_SAMPLES))
            test_df = self._summarization_frame(self._head(dataset['test'], self._MAX_EVAL_SAMPLES))

            # Save processed data
            self._save_splits('summarization', train_df, val_df, test_df)

            print(f"Summarization data saved. Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

            return {
                'train': train_df,
                'val': val_df,
                'test': test_df
            }
        except Exception as e:
            print(f"Error loading summarization dataset: {e}")
            return None

    def load_translation_data(self, dataset_name='wmt16', language_pair='de-en', split=['train', 'validation', 'test']):
        """
        Load and preprocess data for machine translation task.

        Keeps at most 1000 training rows and 200 validation/test rows. If
        anything fails, a five-sentence German/English demo dataset is
        created instead, so this method does not return None.

        Args:
            dataset_name: Dataset identifier passed to ``load_dataset``.
            language_pair: '<source>-<target>' code, also passed to
                ``load_dataset`` as the configuration name.
            split: Currently unused; kept for interface compatibility.

        Returns:
            A dict with 'train', 'val', 'test' DataFrames plus
            'source_lang' and 'target_lang'.
        """
        print(f"Loading translation dataset: {dataset_name} ({language_pair})")

        try:
            # Load dataset using Hugging Face datasets
            dataset = load_dataset(dataset_name, language_pair)

            # Extract language codes
            source_lang, target_lang = language_pair.split('-')

            def to_frame(samples):
                # Single pass over the rows instead of one pass per column.
                pairs = (
                    (item['translation'][source_lang], item['translation'][target_lang])
                    for item in samples
                )
                return self._translation_frame(pairs, source_lang, target_lang)

            # Extract samples from each split (limiting size to make it manageable)
            train_df = to_frame(self._head(dataset['train'], self._MAX_TRAIN_SAMPLES))
            val_df = to_frame(self._head(dataset['validation'], self._MAX_EVAL_SAMPLES))
            test_df = to_frame(self._head(dataset['test'], self._MAX_EVAL_SAMPLES))

            # Save processed data
            self._save_splits('translation', train_df, val_df, test_df)

            print(f"Translation data saved. Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

            return {
                'train': train_df,
                'val': val_df,
                'test': test_df,
                'source_lang': source_lang,
                'target_lang': target_lang
            }

        except Exception as e:
            print(f"Error loading translation dataset: {e}")
            # Create a small demo dataset if loading fails
            print("Creating a small demo translation dataset instead")
            return self._demo_translation_data(language_pair)

    def setup_text_generation_data(self):
        """
        For text generation, we'll use a prompt-based approach.
        This creates a small set of example prompts and saves them to
        ``generation_examples.csv`` in ``cache_dir``.

        Returns:
            A DataFrame with 'prompt' and 'category' columns.
        """
        print("Setting up text generation examples")

        examples = [
            {"prompt": "Write a short story about a robot learning to paint", "category": "creative"},
            {"prompt": "Explain quantum computing to a 10-year-old", "category": "educational"},
            {"prompt": "Write a professional email requesting a meeting", "category": "business"},
            {"prompt": "Create a recipe for chocolate chip cookies", "category": "cooking"},
            {"prompt": "Write a product description for a new smartphone", "category": "marketing"}
        ]

        # Save example prompts
        df = pd.DataFrame(examples)
        df.to_csv(os.path.join(self.cache_dir, "generation_examples.csv"), index=False)

        print(f"Text generation examples saved: {len(df)} examples")

        return df

    def preprocess_text(self, text, task='classification'):
        """
        Preprocess text based on the target task.

        Args:
            text: Value to clean; converted with ``str``. None, NaN and
                ``pd.NA`` are treated as an empty string.
            task: One of 'classification', 'summarization', 'translation'
                or 'generation'.

        Returns:
            The cleaned text.

        Raises:
            ValueError: If ``task`` is not one of the supported names.
        """
        if task == 'classification':
            # Convert to lowercase
            text = self._as_text(text).lower()

            # Remove special characters, numbers, and extra whitespace
            text = re.sub(r'[^a-zA-Z\s]', '', text)
            text = re.sub(r'\s+', ' ', text).strip()
            if not text:
                # Nothing left to tokenize.
                return ''

            # Tokenize and remove stopwords
            stop_words = self._english_stop_words()
            tokens = [word for word in word_tokenize(text) if word not in stop_words]

            # Rejoin into a single string
            return ' '.join(tokens)

        elif task in ('summarization', 'translation'):
            # Preserve the original structure/meaning: only trim and
            # collapse runs of whitespace.
            text = self._as_text(text).strip()
            return re.sub(r'\s+', ' ', text)

        elif task == 'generation':
            # For generation, clean but keep structure
            return self._as_text(text).strip()

        else:
            raise ValueError(f"Unknown task: {task}")

In [8]:
if __name__ == "__main__":
    # Example usage
    processor = DataProcessor()

    # Load datasets for each task
    classification_data = processor.load_classification_data()
    summarization_data = processor.load_summarization_data()
    translation_data = processor.load_translation_data()
    generation_examples = processor.setup_text_generation_data()

    # Loaders signal failure by returning None; do not claim success for
    # tasks whose data is missing.
    failed_tasks = [
        task_name
        for task_name, result in (
            ('classification', classification_data),
            ('summarization', summarization_data),
            ('translation', translation_data),
        )
        if result is None
    ]

    if failed_tasks:
        print(f"Data collection finished, but loading failed for: {', '.join(failed_tasks)}")
    else:
        print("Data collection and preprocessing complete for all tasks!")

Loading classification dataset: imdb
Classification data saved. Train: 22500, Val: 2500, Test: 25000
Loading summarization dataset: cnn_dailymail
Summarization data saved. Train: 1000, Val: 200, Test: 200
Loading translation dataset: wmt16 (de-en)
Translation data saved. Train: 1000, Val: 200, Test: 200
Setting up text generation examples
Text generation examples saved: 5 examples
Data collection and preprocessing complete for all tasks!
