In [1]:
from editdistance import eval
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
# Load the canonical good names; rows with any missing value are dropped.
good_names_df = pd.read_csv('csv/good_names_1.csv').dropna()
good_names = good_names_df['good'].tolist()

# Load the texts (the 'text' column is cleaned below; 'UID' is used downstream).
text_df = pd.read_csv('csv/metadata_text_merged.csv')

# Build 'text_cleaned' from 'text':
#   1. missing texts become empty strings so later .split() calls do not fail,
#   2. lowercase,
#   3. collapse every run of whitespace into a single space (split + join),
#   4. remove punctuation, i.e. every character that is neither a word
#      character nor whitespace.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
text_df['text_cleaned'] = (
    text_df['text']
    .fillna('')
    .str.lower()
    .str.split()
    .str.join(' ')
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Find words in the texts that are within an edit distance of 2 of a good name

In [25]:
# Build alias_dict: key is a good name, value is a list of (word, UID) tuples.
# A word is recorded as a possible alias of a good when it differs from the good
# name and its edit distance to the good name is at most MAX_EDIT_DISTANCE.
# Each word is recorded at most once per good, with the UID of the first text in
# which it was found.
MAX_EDIT_DISTANCE = 2

alias_dict = {good_name: [] for good_name in good_names}

# Words already recorded per good, kept as sets so the duplicate check is a
# constant-time lookup instead of rebuilding a list of recorded words each time.
recorded_words = {good_name: set() for good_name in good_names}

for uid, text_cleaned in zip(text_df['UID'], text_df['text_cleaned']):
    # A repeated word can never add a second entry, so only distinct words are
    # compared; dict.fromkeys keeps them in first-occurrence order.
    distinct_words = list(dict.fromkeys(text_cleaned.split()))

    # Compute edit distance with every good
    for good_name in good_names:
        already_recorded = recorded_words[good_name]
        for word in distinct_words:
            if word == good_name or word in already_recorded:
                continue
            # `eval` is editdistance.eval (imported at the top), not the builtin.
            if int(eval(good_name, word)) <= MAX_EDIT_DISTANCE:
                # Add to list of possible aliases for the good
                alias_dict[good_name].append((word, uid))
                already_recorded.add(word)

# Create a dataframe of the alias candidates within edit distance 2

In [26]:
# Flatten alias_dict into one record per (good, possible alias) pair, where each
# alias entry is a (word, UID) tuple. The records are collected in a list and the
# DataFrame is built once: DataFrame.append copied the whole frame on every call
# and no longer exists in pandas >= 2.0.
edit_dist_records = [
    {'good': good_name, 'possible_alias': alias_word, 'UID': alias_uid}
    for good_name, possible_aliases in tqdm_notebook(alias_dict.items())
    for alias_word, alias_uid in possible_aliases
]

# Passing `columns` keeps the three expected columns even when there are no records.
edit_dist_df = pd.DataFrame(edit_dist_records, columns=['good', 'possible_alias', 'UID'])

A Jupyter Widget




In [28]:
# Write the candidates to CSV, ordered by good name and then by alias,
# without the DataFrame index column.
(
    edit_dist_df
    .sort_values(by=['good', 'possible_alias'])
    .to_csv('csv/good_names_edit_distance2.csv', index=False)
)