In [36]:
import logging
import pandas as pd
import numpy as np
import nltk
import random
import string
import re
from gensim.downloader import load as glove_embeddings_loader
from nltk.corpus import stopwords as nltk_stopwords
from pathlib import Path
from tqdm import tqdm

# Fetch every NLTK resource this notebook relies on. Each call only reports
# "already up-to-date" when the package is present in the local nltk_data folder.
nltk.download('stopwords')                       # stop-word list (nltk.corpus.stopwords)
nltk.download('punkt_tab')                       # tokenizer models needed by nltk.word_tokenize
nltk.download('averaged_perceptron_tagger_eng')  # tagger model needed by nltk.pos_tag

# Set up logging: timestamped INFO-level messages
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\willi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\willi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [29]:
# Path configuration: all dataset files are resolved relative to DATA_DIR
DATA_DIR = Path('./data')
TRAIN_PATH = DATA_DIR.joinpath('train.csv')
DEV_PATH = DATA_DIR.joinpath('dev.csv')
AUGMENTED_DATA_PATH = DATA_DIR.joinpath('train_augmented.csv')

In [3]:
# Pre-trained GloVe word vectors, fetched by name through gensim's downloader API
GLOVE_MODEL_NAME = 'glove-wiki-gigaword-300'
glove_embeddings = glove_embeddings_loader(GLOVE_MODEL_NAME)

2025-03-27 18:40:51 - loading projection weights from C:\Users\willi/gensim-data\glove-wiki-gigaword-300\glove-wiki-gigaword-300.gz


2025-03-27 18:41:56 - KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from C:\\Users\\willi/gensim-data\\glove-wiki-gigaword-300\\glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-03-27T18:41:56.186043', 'gensim': '4.3.3', 'python': '3.11.2 (tags/v3.11.2:878ead1, Feb  7 2023, 16:38:35) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'load_word2vec_format'}


In [66]:
def get_synonyms(word: str, noise_level: float=0.005, topn: int=10) -> list:
    """
    Return up to `topn` nearest neighbours of `word` in the GloVe embedding space.

    The word's vector is perturbed with Gaussian noise before the lookup, so
    repeated calls can return slightly different neighbour lists.

    Args:
        word (str): The word to look up.
        noise_level (float): Standard deviation of the Gaussian noise added to the vector.
        topn (int): Maximum number of neighbours to return.

    Returns:
        list: Neighbouring words ordered by similarity, never containing `word`
        itself. Empty when `word` is not in the embedding vocabulary.
    """
    # Check if the word exists in the embeddings
    if word not in glove_embeddings:
        return []

    # Get the word's embedding vector and add random Gaussian noise to it
    original_vec = glove_embeddings[word]
    noise = np.random.normal(loc=0.0, scale=noise_level, size=original_vec.shape)
    noisy_vec = original_vec + noise

    # Ask for one extra neighbour because the word itself is normally among the
    # results. This returns a list of (word, similarity) tuples.
    similar_words = glove_embeddings.most_similar(positive=[noisy_vec], topn=topn + 1)
    neighbours = [syn for syn, _similarity in similar_words if syn != word]

    # If the word itself was NOT among the results, the filter removed nothing and
    # topn + 1 candidates remain, so trim to honour the requested size.
    return neighbours[:topn]


In [79]:
# English stop-word vocabulary held as a set; remove_stopwords tests membership against it
stopwords = {*nltk_stopwords.words('english')}

def remove_stopwords(text: str) -> str:
    """
    Normalise a text and drop English stop words.

    The text is lower-cased, hyphens are turned into spaces so that hyphenated
    words are split into their parts, every remaining character that is not an
    ASCII letter, digit or whitespace is removed, and the words found in the
    module-level `stopwords` set are discarded.

    Args:
        text (str): The raw text.

    Returns:
        str: The remaining words joined by single spaces.
    """
    text = text.lower()

    # Split hyphenated words. This has to happen BEFORE the character filter
    # below, which would otherwise delete the hyphen and glue the parts together
    # ("well-known" -> "wellknown").
    text = text.replace('-', ' ')

    # Remove everything that is not a letter, digit or whitespace. This already
    # covers all ASCII punctuation, so no separate punctuation pass is needed.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # str.split() with no argument collapses any run of whitespace
    return ' '.join(word for word in text.split() if word not in stopwords)

In [80]:
# Read the three CSV files named in the path configuration
train_df = pd.read_csv(TRAIN_PATH)
augmented_df = pd.read_csv(AUGMENTED_DATA_PATH)
dev_df = pd.read_csv(DEV_PATH)

# Raw evidence strings, plus one list of (token, POS tag) pairs per evidence;
# the tagged lists are also kept on the frame in a 'POS' column
original_evidences = train_df['Evidence'].tolist()
train_df['POS'] = train_df['Evidence'].map(lambda text: nltk.pos_tag(nltk.word_tokenize(text)))
original_evidences_pos = train_df['POS'].tolist()

# Normalised, stop-word-free versions of each claim and its evidence
corresponding_claim = train_df['Claim'].map(remove_stopwords).tolist()
preprocessed_evidences = train_df['Evidence'].map(remove_stopwords).tolist()

In [84]:
def process_evidence_words(claim_words: set, 
                           evidence_words: list, 
                           original_pos_tags: dict) -> list:
    """
    Select the evidence words that may be swapped for a synonym.

    A word is kept only when it is not shared with the claim, is neither
    contained in nor contains any claim word, and has an entry in the POS tag
    dictionary. Order and duplicates of `evidence_words` are preserved.

    Args:
        claim_words (set): Set of words in the claim.
        evidence_words (list): List of words in the evidence.
        original_pos_tags (dict): Dictionary of POS tags for the original evidence.

    Returns:
        list: List of potential replacement candidates.
    """
    shared_with_claim = claim_words & set(evidence_words)

    def overlaps_claim(token):
        # True when the token and some claim word are substrings of one another
        for claim_token in claim_words:
            if token in claim_token or claim_token in token:
                return True
        return False

    return [
        token
        for token in evidence_words
        if token not in shared_with_claim
        and not overlaps_claim(token)
        and token in original_pos_tags
    ]

def find_valid_replacements(word_to_replace: str, 
                            synonyms: list, 
                            original_evidence: str, 
                            original_pos_tags: dict) -> tuple[bool, str]:
    """
    Find the first synonym whose substitution keeps the word's POS tags.

    Each synonym is substituted for every whole-word occurrence of
    `word_to_replace` in the evidence, the result is re-tagged, and the
    synonym is accepted when its tag list equals the tag list recorded for the
    original word.

    Args:
        word_to_replace (str): The word to replace.
        synonyms (list): List of synonyms to choose from, tried in order.
        original_evidence (str): The original evidence.
        original_pos_tags (dict): Dictionary mapping lower-cased words of the
            original evidence to their list of POS tags.

    Returns:
        tuple[bool, str]: A tuple containing a boolean indicating if a valid
        replacement was found and the replacement word ("" when none was found).
    """
    # Without recorded tags for the word no synonym can ever match
    if word_to_replace not in original_pos_tags:
        return False, ""
    expected_tags = original_pos_tags[word_to_replace]

    # The pattern does not depend on the synonym, so build it once
    pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b')

    # If the word does not occur (the match is case-sensitive) no substitution
    # takes place, and a synonym that merely happens to be in the sentence
    # already must not be reported as a valid replacement.
    if pattern.search(original_evidence) is None:
        return False, ""

    for synonym in synonyms:
        # A callable replacement inserts the synonym literally; a plain string
        # would have its backslashes interpreted as escapes / group references.
        new_evidence = pattern.sub(lambda _match: synonym, original_evidence)

        # Collect the tags of every occurrence of the synonym in the new evidence
        new_evidence_pos = nltk.pos_tag(nltk.word_tokenize(new_evidence))
        synonym_key = synonym.lower()
        synonym_tags = [tag for token, tag in new_evidence_pos if token.lower() == synonym_key]

        # Check if POS tags match
        if synonym_tags and expected_tags == synonym_tags:
            return True, synonym

    return False, ""

In [85]:
def augment_data(train_df: pd.DataFrame, 
                 preprocessed_evidences: list, 
                 corresponding_claim: list, 
                 original_evidences: list, 
                 original_pos_tags: list, 
                 file_name: str,
                 add_original_evidence: bool = False,
                 batch_size: int = 1000):
    """
    Main function to create augmented dataset with synonym replacements.

    For every claim/evidence pair a third of the replaceable evidence words is
    sampled, a POS-preserving synonym is searched for each of them, and when at
    least 60% of the sampled words got one, the rewritten evidence is written
    to `file_name` together with the untouched claim and label.

    Args:
        train_df (pd.DataFrame): The training dataframe ('Claim' and 'label' columns are read).
        preprocessed_evidences (list): List of preprocessed evidences.
        corresponding_claim (list): List of claims corresponding to the preprocessed evidences.
        original_evidences (list): List of original evidences.
        original_pos_tags (list): List of original POS tags, one list of (word, tag) pairs per evidence.
        file_name (str): Path of the CSV file to write.
        add_original_evidence (bool): Also store the unmodified evidence in an
            "Original Evidence" column.
        batch_size (int): The batch size for saving to CSV.

    Returns:
        None. If `file_name` exists and the user does not confirm overwriting,
        nothing is done.
    """
    if Path(file_name).exists():
        overwrite = input(f"File {file_name} already exists. Would you like to overwrite it? (y/n) ")
        if overwrite.strip().lower() != 'y':
            return

    cols = ["Claim", "Evidence", "label"]
    if add_original_evidence:
        cols.append("Original Evidence")

    # idx below is a position in the input lists, so rows are read positionally
    # (.iloc) rather than by index label.
    raw_claims = train_df['Claim']
    labels = train_df['label']

    # Rows are buffered as plain dicts and turned into a DataFrame once per
    # batch instead of concatenating a one-row frame on every iteration.
    pending_rows = []
    batches_written = 0

    # zip() has no length, so pass the total explicitly to get a percentage and ETA
    n_pairs = min(len(corresponding_claim), len(preprocessed_evidences))

    try:
        pairs = tqdm(zip(corresponding_claim, preprocessed_evidences), total=n_pairs)
        for idx, (claim, evidence) in enumerate(pairs):
            # Prepare POS tags dictionary
            pos_tags_dict = _build_pos_tag_lookup(original_pos_tags[idx])

            # Get potential words to replace
            claim_words = set(claim.split())
            evidence_words = evidence.split()
            potential_replacements = process_evidence_words(claim_words, evidence_words, pos_tags_dict)

            # Skip if not enough words to replace
            number_of_replacements = len(potential_replacements) // 3
            if number_of_replacements < 1:
                continue

            # The candidate list can hold the same word several times; drop
            # repeats after sampling so each word is looked up once and the
            # success ratio below compares like with like.
            sampled_words = random.sample(potential_replacements, k=number_of_replacements)
            words_to_replace = list(dict.fromkeys(sampled_words))

            # Find replacements
            final_word_replacement_map = {}
            for word in words_to_replace:
                synonyms = get_synonyms(word, noise_level=0.0001, topn=20)
                found, synonym = find_valid_replacements(
                    word, synonyms, original_evidences[idx], pos_tags_dict
                )
                if found:
                    final_word_replacement_map[word] = synonym

            # Skip if not enough valid replacements found
            if len(final_word_replacement_map) < len(words_to_replace) * 0.6:
                continue

            # Create new evidence with replacements and buffer the row
            new_row = {
                "Claim": raw_claims.iloc[idx],
                "Evidence": _apply_replacements(original_evidences[idx], final_word_replacement_map),
                "label": labels.iloc[idx],
            }
            if add_original_evidence:
                new_row["Original Evidence"] = original_evidences[idx]
            pending_rows.append(new_row)

            # Save batch if size threshold reached
            if len(pending_rows) >= batch_size:
                _write_batch(pending_rows, cols, file_name, batches_written == 0)
                pending_rows = []
                batches_written += 1
    finally:
        # Save any remaining data; this also runs when the loop is interrupted,
        # so the rows buffered so far are not lost.
        if pending_rows:
            _write_batch(pending_rows, cols, file_name, batches_written == 0)


def _build_pos_tag_lookup(pos_tags: list) -> dict:
    """Group POS tags by lower-cased token, one tag per occurrence, in sentence order."""
    lookup = {}
    for token, tag in pos_tags:
        lookup.setdefault(token.lower(), []).append(tag)
    return lookup


def _apply_replacements(text: str, replacement_map: dict) -> str:
    """Replace whole-word occurrences of the map's keys in ONE pass, so an inserted synonym is never replaced again."""
    if not replacement_map:
        return text
    alternation = '|'.join(re.escape(word) for word in replacement_map)
    pattern = re.compile(r'\b(?:' + alternation + r')\b')
    # Callable replacement: the synonym is inserted literally, backslashes included
    return pattern.sub(lambda match: replacement_map[match.group(0)], text)


def _write_batch(rows: list, cols: list, file_name: str, is_first_batch: bool) -> None:
    """Write buffered rows to the CSV: create it with a header for the first batch, append without one afterwards."""
    pd.DataFrame(rows, columns=cols).to_csv(
        file_name,
        index=False,
        mode='w' if is_first_batch else 'a',
        header=is_first_batch,
    )

# Seed both random sources the pipeline draws from (random.sample when picking
# the words to replace, np.random.normal when perturbing the embedding vector)
# so that re-running this cell on the same data reproduces the same output.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Run the augmentation; the output file lives in the configured data directory
augment_data(train_df, 
             preprocessed_evidences, 
             corresponding_claim, 
             original_evidences, 
             original_evidences_pos,
             add_original_evidence=True,
             file_name=(DATA_DIR / 'train_augmented_synonyms_with_original_evidence.csv').as_posix())

4456it [12:40,  5.86it/s]


KeyboardInterrupt: 