In [2]:
import pandas as pd

In [3]:
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm
import re
from collections import Counter
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
nltk_resources = ['punkt', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4']
for resource in nltk_resources:
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        nltk.download(resource)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\umair\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
class TextPreprocessor:
    def __init__(self, max_length=512, model_name="gpt2"):
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token  # Set padding token
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        if pd.isna(text):
            return ""
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Remove email addresses
        text = re.sub(r'[\w\.-]+@[\w\.-]+', '', text)

        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', ' ', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def tokenize_and_lemmatize(self, text):
        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token)
                  for token in tokens
                  if token.lower() not in self.stop_words
                  and len(token) > 1]  # Remove single-character tokens

        return tokens

    def create_vocabulary(self, texts, min_freq=5, max_vocab_size=50000):
        
        token_counts = Counter()

        for text in tqdm(texts, desc="Building vocabulary"):
            cleaned_text = self.clean_text(text)
            tokens = self.tokenize_and_lemmatize(cleaned_text)
            token_counts.update(tokens)

        # Filter by frequency and vocab size
        filtered_tokens = [token for token, count in token_counts.most_common(max_vocab_size)
                            if count >= min_freq]

        # Create vocabulary
        vocab = {token: idx for idx, token in enumerate(filtered_tokens, start=1)}
        vocab['[PAD]'] = 0
        vocab['[UNK]'] = len(vocab)

        return vocab

    def encode_text(self, text, vocab, max_length=None):
        
        if max_length is None:
            max_length = self.max_length

        cleaned_text = self.clean_text(text)
        tokens = self.tokenize_and_lemmatize(cleaned_text)

        # Convert tokens to indices
        encoded = [vocab.get(token, vocab['[UNK]']) for token in tokens]

        # Pad or truncate to max_length
        if len(encoded) < max_length:
            encoded = encoded + [vocab['[PAD]']] * (max_length - len(encoded))
        else:
            encoded = encoded[:max_length]

        return encoded

    def transform_bert(self, text):
        
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def prepare_dataset(self, combined_df, vocab=None, min_freq=5, max_vocab_size=50000):
        texts = combined_df['text'].tolist()

        # Create or use vocabulary
        if vocab is None:
            vocab = self.create_vocabulary(texts, min_freq, max_vocab_size)

        # Encode all texts
        encoded_texts = []
        for text in tqdm(texts, desc="Encoding texts"):
            encoded = self.encode_text(text, vocab)
            encoded_texts.append(encoded)

        # Convert to numpy array
        encoded_texts = np.array(encoded_texts)

        return encoded_texts, vocab

    def prepare_bert_dataset(self, combined_df, test_size=0.2, random_state=42):
        
        # Split data
        train_df, test_df = train_test_split(
            combined_df,
            test_size=test_size,
            random_state=random_state,
            stratify=combined_df['main_category'] if 'main_category' in combined_df.columns else None
        )

        # Convert to HuggingFace datasets
        train_dataset = Dataset.from_pandas(train_df)
        test_dataset = Dataset.from_pandas(test_df)

        # Tokenize function for mapping
        def tokenize_function(examples):
            model_inputs = self.tokenizer(
                examples['text'],
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            # Create labels by shifting input_ids
            labels = model_inputs['input_ids'].clone()
            labels[labels == self.tokenizer.pad_token_id] = -100  # Ignore padding tokens
            model_inputs['labels'] = labels
            return model_inputs

        # Apply tokenization
        train_dataset = train_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names
        )

        test_dataset = test_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=test_dataset.column_names
        )

        return train_dataset, test_dataset

file_paths = ['processed_dataset.parquet']

dataframes = [pd.read_parquet(file) for file in file_paths]


combined_df = pd.concat(dataframes, ignore_index=True)


preprocessor = TextPreprocessor(max_length=512)

print("Preprocessing example:")
sample_text = combined_df['text'].iloc[0]
cleaned_text = preprocessor.clean_text(sample_text)
tokens = preprocessor.tokenize_and_lemmatize(cleaned_text)
print(f"First 10 tokens: {tokens[:10]}")


print("\nCreating vocabulary and encoding texts...")
encoded_texts, vocab = preprocessor.prepare_dataset(
    combined_df,  # Using the entire dataset
    min_freq=2,
    max_vocab_size=10000
)
print(f"Vocabulary size: {len(vocab)}")
print(f"Encoded shape: {encoded_texts.shape}")


print("\nPreparing BERT dataset...")
train_dataset, test_dataset = preprocessor.prepare_bert_dataset(
    combined_df  # Using the entire dataset
)
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


for i in range(3):
    print(f"Example {i+1}:")
    print(f"Input IDs: {train_dataset[i]['input_ids']}")
    print(f"Attention Mask: {train_dataset[i]['attention_mask']}")
    print(f"Labels: {train_dataset[i]['labels']}")
    print()


for i in range(3):
    print(f"Original Text {i+1}:")
    print(combined_df.iloc[i]['text'])
    print(f"Tokenized Input IDs: {train_dataset[i]['input_ids']}")
    print(f"Tokenized Attention Mask: {train_dataset[i]['attention_mask']}")
    print(f"Labels: {train_dataset[i]['labels']}")
    print()

# Prepare datasets dictionary
datasets = DatasetDict({
    'train': train_dataset,
    'eval': test_dataset
})

Preprocessing example:
First 10 tokens: ['australian', 'bureau', 'statistic', 'celebrating', 'international', 'year', 'statistic', '2013', 'ab', 'home']

Creating vocabulary and encoding texts...


Building vocabulary:   0%|          | 0/18987 [00:00<?, ?it/s]

Encoding texts:   0%|          | 0/18987 [00:00<?, ?it/s]

Vocabulary size: 10002
Encoded shape: (18987, 512)

Preparing BERT dataset...


Map:   0%|          | 0/15189 [00:00<?, ? examples/s]

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

Train dataset size: 15189
Test dataset size: 3798
Example 1:
Input IDs: [27034, 13, 22162, 5728, 66, 5857, 9502, 837, 399, 13, 39, 13, 837, 910, 3772, 835, 11818, 75, 330, 15032, 1535, 837, 1394, 5149, 661, 764, 3772, 835, 11818, 75, 330, 15032, 1535, 837, 1394, 5149, 661, 837, 531, 9074, 13, 22162, 5728, 66, 5857, 837, 767, 400, 20956, 4675, 837, 9502, 837, 968, 13910, 837, 1110, 2084, 764, 6989, 890, 10726, 773, 328, 395, 295, 760, 1650, 2883, 2060, 9799, 764, 11384, 11234, 595, 24071, 2107, 7646, 395, 5496, 837, 2147, 3947, 4236, 764, 640, 8033, 3947, 2005, 837, 561, 8659, 22121, 1711, 640, 764, 2626, 4202, 3190, 2627, 4939, 1057, 12, 2902, 561, 1577, 1811, 640, 1110, 561, 2245, 1334, 764, 4457, 10927, 14709, 540, 837, 2936, 588, 561, 2270, 3190, 764, 1881, 1110, 2497, 2643, 1545, 6164, 531, 11818, 75, 330, 4193, 33138, 837, 9431, 4745, 540, 9007, 2540, 2263, 3393, 837, 10607, 37196, 1255, 764, 20788, 6596, 826, 717, 837, 12361, 773, 328, 395, 295, 6989, 890, 5000, 12120, 764, 5201,

In [7]:
import pickle

with open('tokenized_datasets1.pkl', 'wb') as f:
    pickle.dump(datasets, f)


In [8]:
with open('tokenized_datasets.pkl', 'rb') as f:
    loaded_datasets = pickle.load(f)

In [9]:
# Access the datasets
train_dataset = loaded_datasets['train']
test_dataset = loaded_datasets['eval']

In [10]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 45877
})

In [22]:
test_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 11470
})