# Edit distance 

Edit distance measures how similar or dissimilar two text samples are. It quantifies the distance as a numeric value: the minimum number of operations required to transform the source text (or simply text 1) into the target text (or simply text 2). The computation can be performed at either the word or the character level. Below are three implementations of edit distance, which differ in the types of operations allowed and their costs.

The implementation does not use any packages other than the basic `random` and `string` packages. The environment's random seed is preserved for reproducibility.

In [2]:
from utils import (simple_edit_distance,
            levenshtein_edit_distance, damerau_levenshtein_distance)

Here we can take any two text samples on which the methods can be called. For example, we consider a toxic social media post and its detoxified version.

The method offers three variants of edit distance:
- Simple edit distance
- Levenshtein edit distance
- Damerau Levenshtein edit distance

The method can be applied to the text pairs at two levels:
- At word level
- At character level


In [49]:
import csv

def _pair_texts(pairs, method, level):
    """
    Compute the requested edit distance(s) for every text pair.

    Args:
        pairs (iterable): Iterable of (text1, text2) pairs.
        method (str): 'simple', 'levenshtein', 'damerau', or 'all'. Any other
            value selects no distance function.
        level (str): Forwarded unchanged as the ``level`` keyword argument of
            each distance function.

    Returns:
        list: One ``[text1, text2, simple, levenshtein, damerau]`` list per
        pair. A distance that was not selected by ``method`` is ``None``
        (``csv.writer`` writes ``None`` as an empty field).
    """
    dist = []
    for t1, t2 in pairs:
        # Reset for every pair: without this, selecting a single method left
        # the other names unbound (UnboundLocalError on the append below).
        dist_s = dist_l = dist_d = None
        if method in ('simple', 'all'):
            dist_s = simple_edit_distance(t1, t2, level=level)
        if method in ('levenshtein', 'all'):
            dist_l = levenshtein_edit_distance(t1, t2, level=level)
        if method in ('damerau', 'all'):
            dist_d = damerau_levenshtein_distance(t1, t2, level=level)
        dist.append([t1, t2, dist_s, dist_l, dist_d])
    return dist


def batch_edit_distance(csv_path=None, texts=None, method='all', level='w'):
    """
    Calculate edit distances for text pairs from a CSV file or a list of texts.

    A CSV whose first non-blank row has a single column is read as a list of
    texts and every unordered pair of them is compared; otherwise the first
    two columns of each row form one pair. A 'texts' list is always compared
    pairwise. Blank CSV lines are ignored.

    Args:
        csv_path (str, optional): Path to a UTF-8 CSV file. Takes precedence
            over 'texts'. If not provided, 'texts' must be given.
        texts (list, optional): List of texts to compare. Used if csv_path is
            not provided.
        method (str): 'simple', 'levenshtein', 'damerau', or 'all' (default).
        level (str): 'c' for character, 'w' for word (default 'w').

    Returns:
        results (list): One list [text1, text2, simple, levenshtein, damerau]
        per pair, as produced by _pair_texts. Empty if there are no pairs.

    Raises:
        ValueError: If 'method' is not one of the supported names, if neither
            'csv_path' nor 'texts' is given, or if a CSV row in pair mode has
            fewer than two columns.
        FileNotFoundError: If the CSV file cannot be found or cannot be read
            as UTF-8 text.
    """
    from itertools import combinations

    if method not in ('simple', 'levenshtein', 'damerau', 'all'):
        raise ValueError(
            f"Unknown method {method!r}; expected 'simple', 'levenshtein', "
            "'damerau', or 'all'."
        )

    if csv_path is None:
        if texts is None:
            raise ValueError("Either 'csv_path' or 'texts' must be provided.")
        pairs = list(combinations(texts, 2))
    else:
        # Only the file read is guarded, so a ValueError raised later by the
        # distance computation is no longer misreported as a missing file.
        try:
            with open(csv_path, newline='', encoding='utf-8') as f:
                # csv.reader yields [] for blank lines; drop them.
                rows = [row for row in csv.reader(f) if row]
        except (FileNotFoundError, ValueError) as err:
            raise FileNotFoundError(
                f"CSV file not found or invalid format: {csv_path}"
            ) from err

        if not rows:
            pairs = []
        elif len(rows[0]) == 1:
            # Single column: compare every text with every later text.
            pairs = list(combinations([row[0] for row in rows], 2))
        else:
            for row in rows:
                if len(row) < 2:
                    raise ValueError(
                        f"Expected two columns per row in {csv_path}, got: {row!r}"
                    )
            pairs = [(row[0], row[1]) for row in rows]

    if not pairs:
        return []
    return _pair_texts(pairs, method, level)
    
# Example usage: score every pair from the input CSV at character level with
# all three methods, then save the scores to a CSV file.
results = batch_edit_distance(csv_path='../data/input_text_pairs.csv', method='all', level='c')

outputfile_path = '../data/output_scores.csv'

# Write UTF-8 explicitly so the output uses the same encoding the input is
# read with, instead of the platform's default encoding.
with open(outputfile_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Text1', 'Text2', 'Simple', 'Levenshtein', 'Damerau-Levenshtein'])     # Write header
    writer.writerows(results)