In [None]:
import pandas as pd

In [None]:
# Load the gazetteer from a tab-separated, UTF-8 encoded file.
# Later cells rely on its 'code' and 'term' columns.
gazetteer_df = pd.read_csv(
    './symptemist_gazetter_snomed_ES_v2.tsv',
    encoding='utf8',
    sep='\t',
)

In [None]:
# Number of gazetteer rows per code (value_counts returns a Series indexed by code).
codes_df = gazetteer_df['code'].value_counts()
# Codes seen fewer than 5 times, kept as strings; these are the ones to augment.
codes_to_augment = [str(code) for code in codes_df[codes_df < 5].index]

In [None]:
import random

def modify_string(input_string, add_prob=0.3, remove_prob=0.3, max_replace_ratio=None, rng=None):
    """Return a noisy copy of ``input_string`` made by random character edits.

    The function makes one pass per character of the *original* string. In
    each pass it may insert one random lowercase Spanish letter at a random
    position and may delete one character at a random position.

    Args:
        input_string: Text to perturb.
        add_prob: Probability, per pass, of inserting a character.
        remove_prob: Probability, per pass, of deleting a character. A
            deletion never happens when only one character is left.
        max_replace_ratio: If given, the total number of edits (insertions
            plus deletions) is capped at ``int(len(input_string) * ratio)``.
            A cap of zero returns the input unchanged. ``None`` means no cap.
        rng: Optional ``random.Random``-like object providing ``random``,
            ``randint`` and ``choice``. Defaults to the module-level
            ``random`` generator, so ``random.seed(...)`` still controls it.

    Returns:
        The perturbed string.
    """
    # Fall back to the shared module-level generator when none is supplied.
    rng = random if rng is None else rng

    result = list(input_string)

    # Define the set of characters that can be added
    spanish_characters = "abcdefghijklmnopqrstuvwxyzáéíóúñü"

    # Remaining number of edits allowed; None means unlimited.
    if max_replace_ratio is not None:
        edits_left = int(len(result) * max_replace_ratio)
    else:
        edits_left = None

    # The pass count is fixed up front from the original length.
    for _ in range(len(result)):
        if edits_left is not None and edits_left <= 0:
            break

        if rng.random() < add_prob:
            # Add a random Spanish character
            index = rng.randint(0, len(result))
            result.insert(index, rng.choice(spanish_characters))
            if edits_left is not None:
                edits_left -= 1

        # Re-check the budget here: the insertion above may have used the
        # last allowed edit, and a deletion would then exceed the cap.
        if edits_left is not None and edits_left <= 0:
            break

        if rng.random() < remove_prob and len(result) > 1:
            # Remove a character
            index = rng.randint(0, len(result) - 1)
            result.pop(index)
            if edits_left is not None:
                edits_left -= 1

    return ''.join(result)

In [None]:
# Work on an independent (deep) copy so the loaded gazetteer stays untouched.
augmented_df = gazetteer_df.copy(deep=True)

In [None]:
from tqdm.auto import tqdm

def generate_rows(row):
    global augmented_df
    if str(row['code']) in codes_to_augment:
        to_add = []
        for j in range(5):
            a_row = row.copy()
            a_row['term'] = modify_string(row['term'], add_prob=0.2, remove_prob=0.2, max_replace_ratio=0.2)
            to_add.append(a_row)
        if to_add:
            augmented_df = pd.concat([augmented_df, pd.DataFrame(to_add)], ignore_index=True)

In [None]:
tqdm.pandas()

gazetteer_df.progress_apply(generate_rows, axis=1)

In [None]:
# Store every code as a string (element-wise str()) so original and augmented
# rows compare consistently against the string codes in `codes_to_augment`.
augmented_df['code'] = augmented_df['code'].map(str)

In [None]:
# Sanity check: rows per augmented code after augmentation.
augmented_df.loc[augmented_df['code'].isin(codes_to_augment), 'code'].value_counts()

In [None]:
# Export only the code/term pairs as a tab-separated UTF-8 file, without a
# header row and without the DataFrame index.
augmented_df[['code', 'term']].to_csv(
    'ALL_gazetteer_augmented_all_codes_5_no_header.tsv',
    sep='\t',
    encoding='utf8',
    header=None,
    index=False,
)