# Data Preprocessing

In [12]:
import unicodedata
import re
import string
import pandas as pd

def read_files(source_path, target_path):
    """
    Reads the source and target files and returns a DataFrame.
    
    Args:
        source_path (str): Path to the source language file.
        target_path (str): Path to the target language file.
    
    Returns:
        pd.DataFrame: DataFrame with 'source' and 'target' columns.
    """
    print(f"Reading source file: {source_path}")
    with open(source_path, 'r', encoding='utf-8') as src_file:
        sources = src_file.readlines()
    
    print(f"Reading target file: {target_path}")
    with open(target_path, 'r', encoding='utf-8') as tgt_file:
        targets = tgt_file.readlines()
    
    if len(sources) != len(targets):
        raise ValueError("Source and target files have different number of lines.")
    
    print(f"Total number of sentence pairs: {len(sources)}")
    
    # Create DataFrame
    data = pd.DataFrame({
        'source': [s.strip() for s in sources],
        'target': [t.strip() for t in targets]
    })
    print("read_files", len(data))
    return data

def clean_data(df, max_words=200, length_ratio=1.5):
    """
    Cleans the DataFrame by removing duplicates, filtering by length, and enforcing length ratio.
    
    Args:
        df (pd.DataFrame): DataFrame with 'source' and 'target' columns.
        max_words (int): Maximum number of words allowed in a sentence.
        length_ratio (float): Maximum allowed length ratio between source and target.
    
    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    print("Initial number of sentence pairs:", len(df))
    
    # Remove duplicate sentence pairs
    df.drop_duplicates(inplace=True)
    print("After removing duplicates:", len(df))
    
    # Calculate word counts
    df['src_word_count'] = df['source'].apply(lambda x: len(x.split()))
    df['tgt_word_count'] = df['target'].apply(lambda x: len(x.split()))
    
    # Filter out sentences longer than max_words
    df = df[(df['src_word_count'] <= max_words) & (df['tgt_word_count'] <= max_words)]
    print(f"After removing sentences longer than {max_words} words:", len(df))
    
    # Calculate length ratio
    df['ratio'] = df['src_word_count'] / df['tgt_word_count']
    
    # Keep sentence pairs within the length_ratio
    df = df[(df['ratio'] <= length_ratio) & (df['ratio'] >= 1/length_ratio)]
    print(f"After enforcing length ratio <= {length_ratio}:", len(df))
    
    # Drop auxiliary columns
    df.drop(['src_word_count', 'tgt_word_count', 'ratio'], axis=1, inplace=True)
    
    return df

def remove_non_printable(df):
    """
    Removes sentences containing non-printable or control characters.
    
    Args:
        df (pd.DataFrame): DataFrame with 'source' and 'target' columns.
    
    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    def is_printable(text):
        return all(c.isprintable() for c in text)
    
    initial_count = len(df)
    
    df = df[df['source'].apply(is_printable)]
    df = df[df['target'].apply(is_printable)]
    
    print(f"Removed sentences with non-printable characters: {initial_count - len(df)}")
    return df

def normalize_text(df):
    """
    Normalizes whitespace and Unicode characters in the DataFrame.
    
    Args:
        df (pd.DataFrame): DataFrame with 'source' and 'target' columns.
    
    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    def normalize(text):
        text = unicodedata.normalize('NFKC', text)  # Normalize Unicode
        text = re.sub(r'\s+', ' ', text)           # Replace multiple spaces with single space
        return text.strip()
    
    df['source'] = df['source'].apply(normalize)
    df['target'] = df['target'].apply(normalize)
    
    print("Normalized whitespace and Unicode characters.")
    return df

def remove_unwanted_symbols(df, keep_punct=None):
    """
    Removes all symbols from the text except for the specified punctuation marks.
    
    Args:
        df (pd.DataFrame): DataFrame with 'source' and 'target' columns.
        keep_punct (list or set): A collection of punctuation marks to retain.
                                 Defaults to a standard set if None.
    
    Returns:
        pd.DataFrame: Cleaned DataFrame with unwanted symbols removed.
    """
    if keep_punct is None:
        # Define default punctuation to keep
        keep_punct = set(['.', ',', '?', '!', ':', ';', '"', "'", '(', ')', '[', ']', '{', '}', '-', '–', '—'])
    
    # Create a regex pattern to match unwanted symbols
    # We keep alphanumerics, whitespace, and the specified punctuation
    allowed_chars = ''.join(keep_punct)
    # Escape punctuation for regex if necessary
    allowed_chars = re.escape(allowed_chars)
    
    # Pattern to match any character that is not a word character, whitespace, or allowed punctuation
    pattern = f'[^\w\s{allowed_chars}]'
    
    def clean_text(text):
        # Remove unwanted symbols
        text = re.sub(pattern, '', text)
        # Optionally, you can also normalize whitespace here if needed
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    
    # Apply the cleaning function to both 'source' and 'target' columns
    df['source'] = df['source'].apply(clean_text)
    df['target'] = df['target'].apply(clean_text)
    
    print("Removed unwanted symbols, retaining essential punctuation.")
    return df

# Step 1: Read the files
df = read_files("data-files/original_dataset/opus.wikipedia.en-fr/Wikipedia.en-fr.en","data-files/original_dataset/opus.wikipedia.en-fr/Wikipedia.en-fr.fr")

# Step 2: Clean the data
df_cleaned = clean_data(df, max_words=200, length_ratio=1.5)
df_cleaned = remove_non_printable(df_cleaned)
df_cleaned = normalize_text(df_cleaned)
df_cleaned = remove_unwanted_symbols(df_cleaned)

Reading source file: /kaggle/input/wikipedia-en-fr/Wikipedia.en-fr.en
Reading target file: /kaggle/input/wikipedia-en-fr/Wikipedia.en-fr.fr
Total number of sentence pairs: 818302
Initial number of sentence pairs: 818302
After removing duplicates: 803704
After removing sentences longer than 200 words: 801392
After enforcing length ratio <= 1.5: 691348


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ratio'] = df['src_word_count'] / df['tgt_word_count']


Removed sentences with non-printable characters: 126
Normalized whitespace and Unicode characters.
Removed unwanted symbols, retaining essential punctuation.


# Getting COMET Score

In [13]:
from comet import download_model, load_from_checkpoint
model_path = download_model("Unbabel/wmt20-comet-qe-da")
model = load_from_checkpoint(model_path)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [None]:
from tqdm import tqdm

# Define a function to predict COMET scores with a progress bar
def get_comet_scores(df, batch_size=64):
    scores = []
    
    # Create a tqdm progress bar
    tqdm_bar = tqdm(total=len(df), desc="Processing COMET Scores", ncols=100)

    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i + batch_size]
        inputs = [{"src": row["source"], "mt": row["target"]} for _, row in batch.iterrows()]

        # Predict scores for the current batch
        model_output = model.predict(inputs, batch_size=batch_size, gpus=1, progress_bar=False)

        # Extract the scores from the output
        if isinstance(model_output, list) and isinstance(model_output[0], tuple):
            batch_scores = model_output[0][1]  # Scores are in the second element of the first tuple
        elif 'scores' in model_output:
            batch_scores = model_output['scores']
        else:
            raise ValueError("Unexpected model output format")

        scores.extend(batch_scores)

        # Update the progress bar by the batch size
        tqdm_bar.update(len(batch))

    tqdm_bar.close()  # Close the progress bar once done
    return scores


Processing COMET Scores:  98%|███████████████████████████▎| 337536/345611 [7:44:34<10:14, 13.15it/s]

In [None]:
df_50_part1 = df_cleaned.iloc[:int(0.25 * len(df_cleaned))]
df_50_part2 = df_cleaned.iloc[int(0.25 * len(df_cleaned)):int(0.5 * len(df_cleaned))]

# Get COMET scores for the DataFrame
df_50_part1['comet_score'] = get_comet_scores(df_50_part1)
df_50_part2['comet_score'] = get_comet_scores(df_50_part2)

# Save the DataFrame with the new column
df_50_part1.to_csv('data-files/original_dataset/dataset_with_comet_Part1.csv', index=False)
df_50_part2.to_csv('data-files/original_dataset/dataset_with_comet_Part2.csv', index=False)

# Removing Sentences from different Language

In [None]:
import pandas as pd

# Load the first CSV file
df1 = pd.read_csv('en-fr/dataset_with_comet_Part1.csv')

# Load the second CSV file
df2 = pd.read_csv('en-fr/dataset_with_comet_Part2.csv')

# Combine the two dataframes
combined_df = pd.concat([df1, df2])

In [None]:
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'

# Apply language detection to the 'text' column
# Replace 'text' with the actual column name containing the text you want to analyze
combined_df['source_detected_language'] = combined_df['source'].apply(detect_language)
combined_df['target_detected_language'] = combined_df['target'].apply(detect_language)

In [None]:
combined_df = combined_df[combined_df['source_detected_language'] == 'en']
combined_df = combined_df[combined_df['target_detected_language'] == 'fr']

In [None]:
# Sort the dataframe by comet_score in descending order
df_sorted = combined_df.sort_values(by='comet_score', ascending=False)

# Dataset Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle the dataset
df_shuffled = df_sorted.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data into train+validation and test sets (80% / 20%)
train, test = train_test_split(df_shuffled, test_size=0.2307692, random_state=42)

# Split the train+validation set into train and validation sets (75% / 25%, which is 60% / 20% of the total)
test, val = train_test_split(test, test_size=0.50, random_state=42)

# Save the splits to CSV files
train.to_csv('data-files/filtered_dataset/train.csv', index=False)
val.to_csv('data-files/filtered_dataset/validation.csv', index=False)
test.to_csv('data-files/filtered_dataset/test.csv', index=False)

# Print the sizes of each split
print(f"Total samples: {len(df_shuffled)}")
print(f"Train samples: {len(train)} ({len(train)/len(df_shuffled)*100:.2f}%)")
print(f"Validation samples: {len(val)} ({len(val)/len(df_shuffled)*100:.2f}%)")
print(f"Test samples: {len(test)} ({len(test)/len(df_shuffled)*100:.2f}%)")