# Data Preprocessing
## Download necessary nltk resources

In [1]:
import nltk

# Fetch the NLTK data packages that the preprocessing code relies on.
nltk.download('punkt')      # sentence tokenizer models read by older NLTK releases
nltk.download('punkt_tab')  # sentence tokenizer tables read by newer NLTK releases
nltk.download('wordnet')    # WordNet corpus used for synonym lookup
nltk.download('omw-1.4')    # Open Multilingual WordNet data

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zihanliang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zihanliang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zihanliang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## SMM4H-2025-Task5-Train_subtask1.csv Preprocessing

In [3]:
# -*- coding: utf-8 -*-
"""
Data Preprocessing and Data Augmentation (Hierarchical Processing Version)

This code is based on the SMM4H-2025-Task5-Train_subtask1.csv file and performs the following steps on the original news text:
1. Clean the text: remove HTML tags, replace URLs with the placeholder "URL", and collapse redundant whitespace;
2. Manual segmentation (Chunking): use sentence boundaries to segment the text, ensuring that the token count of each segment does not exceed 512.
   If a sentence itself is longer than 512 tokens, it will be further split to preserve complete information.
3. Data augmentation: perform synonym replacement on the text, avoiding replacement of domain keywords (e.g., FDA, Listeria, etc.);
4. Finally, save the cleaned text, segmented text list, and augmented text into a new CSV file.

Please ensure that the SMM4H-2025-Task5-Train_subtask1.csv file is located in the same directory as the Notebook, or modify the file paths in the code.
"""

import re
import random
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from transformers import AutoTokenizer

def clean_text(text: str) -> str:
    """
    Normalise raw article text.

    Steps, in order:
      - Replace every HTML tag with a space
      - Replace URLs (http://, https:// or www. prefixed) with the placeholder "URL"
      - Collapse every run of whitespace into a single space and trim both ends

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, single-line text.
    """
    # Substitute a space (not an empty string) so that block-level markup such as
    # "</p><p>" does not glue the neighbouring words together; the surplus spaces
    # are collapsed by the whitespace pass below.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Require a real URL prefix so ordinary words that merely start with "http" or
    # "www" are left alone, and stop before trailing punctuation so the sentence
    # boundary that follows a URL survives for later sentence splitting.
    text = re.sub(r'\b(?:https?://|www\.)\S*[^\s.,;:!?)\]\'"]', 'URL', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def split_long_sentence(sentence: str, tokenizer, max_length: int) -> list:
    """
    Split a sentence into consecutive chunks of at most max_length tokens.

    The sentence is tokenized once, the token list is cut into fixed-size windows,
    and each window is turned back into a string with
    tokenizer.convert_tokens_to_string. The resulting text may therefore be
    formatted differently from the input, but no token is dropped.

    Args:
        sentence: The text to split.
        tokenizer: Object providing tokenize(str) and convert_tokens_to_string(list).
        max_length: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of chunk strings; empty when the sentence yields no tokens.

    Raises:
        ValueError: If max_length is not positive.
    """
    # A negative step would make range() empty and silently discard the whole
    # sentence, so reject non-positive sizes explicitly.
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length}")

    tokens = tokenizer.tokenize(sentence)
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + max_length])
        for start in range(0, len(tokens), max_length)
    ]

def segment_text(text: str, tokenizer, max_length: int = 512) -> list:
    """
    Break a text into segments whose token count stays within max_length.

    A text that already fits is returned unchanged as a one-element list.
    Otherwise the text is split into sentences, any sentence that is itself too
    long is cut further with split_long_sentence, and the resulting pieces are
    packed greedily, in order, into segments joined by single spaces.

    Token counts come from tokenizer.tokenize.
    """
    def token_count(piece: str) -> int:
        return len(tokenizer.tokenize(piece))

    if token_count(text) <= max_length:
        return [text]

    finished = []
    buffer = ""
    for sentence in sent_tokenize(text):
        # Over-long sentences are broken up before packing.
        if token_count(sentence) > max_length:
            pieces = split_long_sentence(sentence, tokenizer, max_length)
        else:
            pieces = [sentence]

        for piece in pieces:
            candidate = (buffer + " " + piece).strip() if buffer else piece
            if token_count(candidate) <= max_length:
                # Still within budget: keep growing the open segment.
                buffer = candidate
                continue
            # Budget exceeded: close the open segment and start a new one.
            if buffer:
                finished.append(buffer)
            buffer = piece

    if buffer:
        finished.append(buffer)
    return finished

def get_synonyms(word: str) -> list:
    """
    Look up synonyms for a word in nltk's WordNet.

    Every lemma name of every synset returned for the word is collected,
    underscores are replaced with spaces, and candidates equal to the word
    (ignoring case) are dropped.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct synonym strings; empty when none are found.
    """
    target = word.lower()
    candidates = {
        lemma.name().replace("_", " ")
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    # Sort before returning: set iteration order for strings varies between
    # interpreter runs, which would make a seeded random.choice over the result
    # irreproducible.
    return sorted(c for c in candidates if c.lower() != target)

def augment_text(text: str, replacement_prob: float = 0.1, skip_words: set = None) -> str:
    """
    Augment text by random synonym replacement.

    The text is split on whitespace. Each purely alphabetic word that is not
    protected is replaced, with probability replacement_prob, by a randomly
    chosen synonym from get_synonyms; words without synonyms are kept.

    Args:
        text: The text to augment.
        replacement_prob: Per-word probability of attempting a replacement.
        skip_words: Domain keywords that must never be replaced. Matching is
            case-insensitive, and a multi-word entry (e.g. "E. coli") protects
            each of its words. Defaults to a built-in keyword set.

    Returns:
        The augmented text with words joined by single spaces.
    """
    if skip_words is None:
        skip_words = {"FDA", "Listeria", "E. coli", "Salmonella", "Maytag", "CDC"}

    # The text is compared word by word, so a multi-word entry could never match
    # as a whole; break entries into their words and compare in lower case so
    # "listeria" and "LISTERIA" are protected as well.
    protected = {piece.lower() for entry in skip_words for piece in entry.split()}

    augmented_words = []
    for word in text.split():
        replaceable = word.isalpha() and word.lower() not in protected
        # random.random() is only consulted for replaceable words.
        if replaceable and random.random() < replacement_prob:
            synonyms = get_synonyms(word)
            if synonyms:
                word = random.choice(synonyms)
        augmented_words.append(word)
    return " ".join(augmented_words)

def process_data(input_file: str,
                 output_file: str,
                 tokenizer_name: str = "bert-base-uncased",
                 max_length: int = 512,
                 augmentation_prob: float = 0.1):
    """
    Run the full preprocessing and augmentation pipeline on one CSV file.

    Steps:
      1. Read the CSV (expected fields: docid, text, Subtask1_Label)
      2. Clean each article to produce cleaned_text
      3. Segment the cleaned text into a list of segments bounded by max_length
      4. Apply synonym replacement to the cleaned text to produce augmented_text
      5. Write the original columns plus the three new ones to output_file

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
        tokenizer_name: Name passed to AutoTokenizer.from_pretrained.
        max_length: Token budget per segment.
        augmentation_prob: Per-word synonym replacement probability.

    Raises:
        KeyError: If the input file has no "text" column.
    """
    df = pd.read_csv(input_file)
    # Fail before loading the tokenizer when the required column is absent.
    if "text" not in df.columns:
        raise KeyError(f"'text' column not found in {input_file}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    cleaned_texts = []
    segments_list = []
    augmented_texts = []

    for raw_text in df["text"]:
        # A missing value would otherwise be stringified to the literal "nan"
        # and processed as if it were article text.
        original_text = "" if pd.isna(raw_text) else str(raw_text)
        # 1. Clean the text
        cleaned = clean_text(original_text)
        # 2. Segment into pieces bounded by max_length tokens
        segments = segment_text(cleaned, tokenizer, max_length)
        # 3. Data augmentation (synonym replacement)
        augmented = augment_text(cleaned, replacement_prob=augmentation_prob)

        cleaned_texts.append(cleaned)
        segments_list.append(segments)
        augmented_texts.append(augmented)

    df["cleaned_text"] = cleaned_texts
    # NOTE: each cell of "segments" holds a Python list; in the CSV it is stored
    # in its string form and has to be parsed again after loading.
    df["segments"] = segments_list
    df["augmented_text"] = augmented_texts

    df.to_csv(output_file, index=False)
    print(f"[INFO] Preprocessing complete. Output saved to: {output_file}")

# ===============================================
# Run the following code in Jupyter Notebook
# ===============================================
# Run configuration for the Train subtask-1 CSV; edit the paths if the file
# lives outside the notebook's directory.
input_file = "SMM4H-2025-Task5-Train_subtask1.csv"  # CSV read by process_data
output_file = "preprocessed_SMM4H-2025-Task5-Train_subtask1.csv"  # CSV written by process_data
tokenizer_name = "bert-base-uncased"  # Name handed to AutoTokenizer.from_pretrained
max_seq_length = 512  # Token budget per segment (passed as max_length)
augmentation_probability = 0.1  # Per-word probability of synonym replacement

# Execute the pipeline with the settings above.
process_data(
    input_file=input_file,
    output_file=output_file,
    tokenizer_name=tokenizer_name,
    max_length=max_seq_length,
    augmentation_prob=augmentation_probability
)

Token indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors


[INFO] Preprocessing complete. Output saved to: preprocessed_SMM4H-2025-Task5-Train_subtask1.csv


## SMM4H-2025-Task5-Validation_subtask1.csv Preprocessing

In [4]:
# -*- coding: utf-8 -*-
"""
Data Preprocessing and Data Augmentation (Hierarchical Processing Version)

This code is based on the SMM4H-2025-Task5-Validation_subtask1.csv file and performs the following steps on the original news text:
1. Clean the text: remove HTML tags, replace URLs with the placeholder "URL", and collapse redundant whitespace;
2. Manual segmentation (Chunking): use sentence boundaries to segment the text, ensuring that the token count of each segment does not exceed 512.
   If a sentence itself is longer than 512 tokens, it will be further split to preserve complete information.
3. Data augmentation: perform synonym replacement on the text, avoiding replacement of domain keywords (e.g., FDA, Listeria, etc.);
4. Finally, save the cleaned text, segmented text list, and augmented text into a new CSV file.

Please ensure that the SMM4H-2025-Task5-Validation_subtask1.csv file is located in the same directory as the Notebook, or modify the file paths in the code.
"""

import re
import random
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from transformers import AutoTokenizer

def clean_text(text: str) -> str:
    """
    Normalise raw article text.

    Steps, in order:
      - Replace every HTML tag with a space
      - Replace URLs (http://, https:// or www. prefixed) with the placeholder "URL"
      - Collapse every run of whitespace into a single space and trim both ends

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, single-line text.
    """
    # Substitute a space (not an empty string) so that block-level markup such as
    # "</p><p>" does not glue the neighbouring words together; the surplus spaces
    # are collapsed by the whitespace pass below.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Require a real URL prefix so ordinary words that merely start with "http" or
    # "www" are left alone, and stop before trailing punctuation so the sentence
    # boundary that follows a URL survives for later sentence splitting.
    text = re.sub(r'\b(?:https?://|www\.)\S*[^\s.,;:!?)\]\'"]', 'URL', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def split_long_sentence(sentence: str, tokenizer, max_length: int) -> list:
    """
    Split a sentence into consecutive chunks of at most max_length tokens.

    The sentence is tokenized once, the token list is cut into fixed-size windows,
    and each window is turned back into a string with
    tokenizer.convert_tokens_to_string. The resulting text may therefore be
    formatted differently from the input, but no token is dropped.

    Args:
        sentence: The text to split.
        tokenizer: Object providing tokenize(str) and convert_tokens_to_string(list).
        max_length: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of chunk strings; empty when the sentence yields no tokens.

    Raises:
        ValueError: If max_length is not positive.
    """
    # A negative step would make range() empty and silently discard the whole
    # sentence, so reject non-positive sizes explicitly.
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length}")

    tokens = tokenizer.tokenize(sentence)
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + max_length])
        for start in range(0, len(tokens), max_length)
    ]

def segment_text(text: str, tokenizer, max_length: int = 512) -> list:
    """
    Break a text into segments whose token count stays within max_length.

    A text that already fits is returned unchanged as a one-element list.
    Otherwise the text is split into sentences, any sentence that is itself too
    long is cut further with split_long_sentence, and the resulting pieces are
    packed greedily, in order, into segments joined by single spaces.

    Token counts come from tokenizer.tokenize.
    """
    def token_count(piece: str) -> int:
        return len(tokenizer.tokenize(piece))

    if token_count(text) <= max_length:
        return [text]

    finished = []
    buffer = ""
    for sentence in sent_tokenize(text):
        # Over-long sentences are broken up before packing.
        if token_count(sentence) > max_length:
            pieces = split_long_sentence(sentence, tokenizer, max_length)
        else:
            pieces = [sentence]

        for piece in pieces:
            candidate = (buffer + " " + piece).strip() if buffer else piece
            if token_count(candidate) <= max_length:
                # Still within budget: keep growing the open segment.
                buffer = candidate
                continue
            # Budget exceeded: close the open segment and start a new one.
            if buffer:
                finished.append(buffer)
            buffer = piece

    if buffer:
        finished.append(buffer)
    return finished

def get_synonyms(word: str) -> list:
    """
    Look up synonyms for a word in nltk's WordNet.

    Every lemma name of every synset returned for the word is collected,
    underscores are replaced with spaces, and candidates equal to the word
    (ignoring case) are dropped.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct synonym strings; empty when none are found.
    """
    target = word.lower()
    candidates = {
        lemma.name().replace("_", " ")
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    # Sort before returning: set iteration order for strings varies between
    # interpreter runs, which would make a seeded random.choice over the result
    # irreproducible.
    return sorted(c for c in candidates if c.lower() != target)

def augment_text(text: str, replacement_prob: float = 0.1, skip_words: set = None) -> str:
    """
    Augment text by random synonym replacement.

    The text is split on whitespace. Each purely alphabetic word that is not
    protected is replaced, with probability replacement_prob, by a randomly
    chosen synonym from get_synonyms; words without synonyms are kept.

    Args:
        text: The text to augment.
        replacement_prob: Per-word probability of attempting a replacement.
        skip_words: Domain keywords that must never be replaced. Matching is
            case-insensitive, and a multi-word entry (e.g. "E. coli") protects
            each of its words. Defaults to a built-in keyword set.

    Returns:
        The augmented text with words joined by single spaces.
    """
    if skip_words is None:
        skip_words = {"FDA", "Listeria", "E. coli", "Salmonella", "Maytag", "CDC"}

    # The text is compared word by word, so a multi-word entry could never match
    # as a whole; break entries into their words and compare in lower case so
    # "listeria" and "LISTERIA" are protected as well.
    protected = {piece.lower() for entry in skip_words for piece in entry.split()}

    augmented_words = []
    for word in text.split():
        replaceable = word.isalpha() and word.lower() not in protected
        # random.random() is only consulted for replaceable words.
        if replaceable and random.random() < replacement_prob:
            synonyms = get_synonyms(word)
            if synonyms:
                word = random.choice(synonyms)
        augmented_words.append(word)
    return " ".join(augmented_words)

def process_data(input_file: str,
                 output_file: str,
                 tokenizer_name: str = "bert-base-uncased",
                 max_length: int = 512,
                 augmentation_prob: float = 0.1):
    """
    Run the full preprocessing and augmentation pipeline on one CSV file.

    Steps:
      1. Read the CSV (expected fields: docid, text, Subtask1_Label)
      2. Clean each article to produce cleaned_text
      3. Segment the cleaned text into a list of segments bounded by max_length
      4. Apply synonym replacement to the cleaned text to produce augmented_text
      5. Write the original columns plus the three new ones to output_file

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
        tokenizer_name: Name passed to AutoTokenizer.from_pretrained.
        max_length: Token budget per segment.
        augmentation_prob: Per-word synonym replacement probability.

    Raises:
        KeyError: If the input file has no "text" column.
    """
    df = pd.read_csv(input_file)
    # Fail before loading the tokenizer when the required column is absent.
    if "text" not in df.columns:
        raise KeyError(f"'text' column not found in {input_file}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    cleaned_texts = []
    segments_list = []
    augmented_texts = []

    for raw_text in df["text"]:
        # A missing value would otherwise be stringified to the literal "nan"
        # and processed as if it were article text.
        original_text = "" if pd.isna(raw_text) else str(raw_text)
        # 1. Clean the text
        cleaned = clean_text(original_text)
        # 2. Segment into pieces bounded by max_length tokens
        segments = segment_text(cleaned, tokenizer, max_length)
        # 3. Data augmentation (synonym replacement)
        augmented = augment_text(cleaned, replacement_prob=augmentation_prob)

        cleaned_texts.append(cleaned)
        segments_list.append(segments)
        augmented_texts.append(augmented)

    df["cleaned_text"] = cleaned_texts
    # NOTE: each cell of "segments" holds a Python list; in the CSV it is stored
    # in its string form and has to be parsed again after loading.
    df["segments"] = segments_list
    df["augmented_text"] = augmented_texts

    df.to_csv(output_file, index=False)
    print(f"[INFO] Preprocessing complete. Output saved to: {output_file}")

# ===============================================
# Run the following code in Jupyter Notebook
# ===============================================
# Run configuration for the Validation subtask-1 CSV; edit the paths if the
# file lives outside the notebook's directory.
input_file = "SMM4H-2025-Task5-Validation_subtask1.csv"  # CSV read by process_data
output_file = "preprocessed_SMM4H-2025-Task5-Validation_subtask1.csv"  # CSV written by process_data
tokenizer_name = "bert-base-uncased"  # Name handed to AutoTokenizer.from_pretrained
max_seq_length = 512  # Token budget per segment (passed as max_length)
augmentation_probability = 0.1  # Per-word probability of synonym replacement

# Execute the pipeline with the settings above.
process_data(
    input_file=input_file,
    output_file=output_file,
    tokenizer_name=tokenizer_name,
    max_length=max_seq_length,
    augmentation_prob=augmentation_probability
)

Token indices sequence length is longer than the specified maximum sequence length for this model (739 > 512). Running this sequence through the model will result in indexing errors


[INFO] Preprocessing complete. Output saved to: preprocessed_SMM4H-2025-Task5-Validation_subtask1.csv
