In [1]:
import logging
import pandas as pd
from pathlib import Path
from googletrans import Translator
from typing import AsyncGenerator


# Configure the root logger once for the whole notebook: records are rendered
# as "<timestamp> - <message>" and everything from INFO upwards is shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [2]:
# Locations of the input CSVs and of the augmented output file, all relative
# to the notebook's working directory.
DATA_DIR = Path('./data')
TRAIN_PATH = DATA_DIR.joinpath('train.csv')
DEV_PATH = DATA_DIR.joinpath('dev.csv')
AUGMENTED_DATA_PATH = DATA_DIR.joinpath('train_augmented.csv')

In [3]:
def format_text(text: str) -> str:
    """Make ``text`` safe to embed as one field of a hand-written CSV row.

    The steps, in order:

    1. Every double quote becomes a single quote, so the field can never
       contain the CSV quote character itself.
    2. If the field contains a comma, a single quote or a line break, it is
       wrapped in double quotes so a CSV reader keeps it as one field.
    3. Non-ASCII characters are dropped (they are not replaced by anything).

    Args:
        text: Raw field value.

    Returns:
        The cleaned, ASCII-only field, quote-wrapped when necessary.
    """
    # After this replacement the text holds no double quotes at all, which is
    # why no separate stripping of a leading/trailing double quote is needed.
    text = text.replace('"', "'")

    # A bare line break would split the CSV row in two, so it forces quoting
    # just like a comma does.
    needs_quoting = any(marker in text for marker in (',', "'", '\n', '\r'))
    if needs_quoting:
        text = f'"{text}"'

    # 'ignore' silently removes every character that is not ASCII.
    return text.encode('ascii', 'ignore').decode('ascii')


async def back_translate(text: str, src='en', intermediate='fr') -> str:
    """Paraphrase ``text`` by translating it into another language and back.

    Args:
        text: Text to paraphrase.
        src: Language code of ``text``; the result is in this language too.
        intermediate: Language code used as the pivot for the round trip.

    Returns:
        The text obtained from translating ``src`` -> ``intermediate`` -> ``src``.
    """
    # A Translator is created for this call only and used as an async
    # context manager around both requests.
    async with Translator() as client:
        # Outbound leg: source language into the pivot language.
        pivot = await client.translate(text, src=src, dest=intermediate)
        # Return leg: pivot language back into the source language.
        round_trip = await client.translate(pivot.text, src=intermediate, dest=src)
        return round_trip.text


async def process_data_stream(train_path: Path, dev_path: Path, augmented_data: pd.DataFrame) -> AsyncGenerator[dict, None]:
    """Yield back-translated evidence for positive training rows not yet augmented.

    Args:
        train_path: CSV file with ``Claim``, ``Evidence`` and ``label`` columns.
        dev_path: Not used; kept so existing callers keep working.
        augmented_data: Rows augmented so far. Its ``Original Evidence``
            column is read once, when iteration starts, to decide which
            training rows to skip.

    Yields:
        Dicts with the keys ``claim``, ``original_evidence``,
        ``translated_evidence`` and ``index`` (the row's index label in the
        training CSV).

    Raises:
        KeyError: If one of the columns named above is missing.
    """
    train_df = pd.read_csv(train_path)
    positive_examples = train_df[train_df['label'] == 1]

    # Build the lookup once instead of scanning the whole column for every
    # row. Stored evidence has been through format_text, and the double quotes
    # format_text wraps around a field disappear when the CSV is read back, so
    # both the stored value and its unwrapped form are remembered.
    stored = augmented_data['Original Evidence'].dropna()
    already_done = set(stored) | {
        value.strip('"') for value in stored if isinstance(value, str)
    }

    for idx, row in positive_examples.iterrows():
        original_evidence = row['Evidence']

        # Compare both the raw text and the form it takes once written out.
        # With the raw text alone, evidence containing double quotes or
        # non-ASCII characters never matches its stored copy and would be
        # translated (and appended) again on every run.
        if isinstance(original_evidence, str):
            written_form = format_text(original_evidence).strip('"')
        else:
            written_form = original_evidence

        if original_evidence in already_done or written_form in already_done:
            # Skip this example as it's already been processed
            logging.info(f"Skipping example {idx} as it's already been processed")
            continue

        translated_evidence = await back_translate(original_evidence)

        yield {
            'claim': row['Claim'],
            'original_evidence': original_evidence,
            'translated_evidence': translated_evidence,
            'index': idx
        }

In [6]:
augmented_data = pd.read_csv(AUGMENTED_DATA_PATH)

async for item in process_data_stream(TRAIN_PATH, DEV_PATH, augmented_data):
    translated_evidence = format_text(item['translated_evidence'])
    original_evidence = format_text(item['original_evidence'])
    claim = format_text(item['claim'])
    
    with open(AUGMENTED_DATA_PATH, 'a') as f:
        f.write(f'{claim},{translated_evidence},{original_evidence},1\n')
        
    # Log the augmented data
    logging.info(f"Added augmented data for claim: {claim[:30]}...")
    
    # Save to the augmented dataframe as well
    new_row = pd.DataFrame({
        'Claim': [claim], 
        'Evidence': [translated_evidence], 
        'Original Evidence': [original_evidence],
        'label': [1]
    })
    augmented_data = pd.concat([augmented_data, new_row], ignore_index=True)

2025-03-27 16:28:29 - HTTP Request: GET https://translate.googleapis.com/translate_a/single?client=gtx&sl=en&tl=fr&hl=fr&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&otf=1&ssel=0&tsel=0&tk=xxxx&q=a+2001+survey+by+the+European+Commission+found+that+%22only+10.1+percent+of+Europeans+trusted+the+nuclear+industry "HTTP/2 200 OK"
2025-03-27 16:28:29 - HTTP Request: GET https://translate.googleapis.com/translate_a/single?client=gtx&sl=fr&tl=en&hl=en&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&otf=1&ssel=0&tsel=0&tk=xxxx&q=Une+enqu%C3%AAte+de+2001+de+la+Commission+europ%C3%A9enne+a+r%C3%A9v%C3%A9l%C3%A9+que+%22seulement+10%2C1%25+des+Europ%C3%A9ens+faisaient+confiance+%C3%A0+l%27industrie+nucl%C3%A9aire "HTTP/2 200 OK"
2025-03-27 16:28:29 - Added augmented data for claim: We should further exploit nucl...
2025-03-27 16:28:29 - Skipping example 7 as it's already been processed
2025-03-27 16:28:29 - Skipping example 9 as it's al

CancelledError: 