In [1]:
def edit_distance(str1, str2):
    """Return the case-insensitive Levenshtein distance between two strings.

    Both inputs are lower-cased first, so ``"ABC"`` and ``"abc"`` are at
    distance 0. The distance is the minimum number of single-character
    insertions, deletions and substitutions needed to turn ``str1`` into
    ``str2``.

    Args:
        str1: Source string.
        str2: Target string.

    Returns:
        int: The edit distance between the lower-cased strings.
    """
    source = str1.lower()
    target = str2.lower()
    n_rows = len(source) + 1
    n_cols = len(target) + 1

    # table[r][c] holds the distance between source[:r] and target[:c].
    table = [[0] * n_cols for _ in range(n_rows)]

    # Borders: reaching a prefix from the empty string (or the reverse)
    # costs exactly as many edits as the prefix has characters.
    for c in range(n_cols):
        table[0][c] = c
    for r in range(n_rows):
        table[r][0] = r

    for r in range(1, n_rows):
        for c in range(1, n_cols):
            if source[r - 1] == target[c - 1]:
                # Matching characters cost nothing extra.
                table[r][c] = table[r - 1][c - 1]
            else:
                insert_cost = table[r][c - 1]
                remove_cost = table[r - 1][c]
                replace_cost = table[r - 1][c - 1]
                table[r][c] = 1 + min(insert_cost, remove_cost, replace_cost)

    return table[n_rows - 1][n_cols - 1]

In [255]:
class Name:
    """Accumulates what is known about one person's name and scores candidates.

    A record starts empty and is grown with :meth:`fill`; :meth:`is_same`
    scores how well a new string matches what has been recorded so far.
    """

    def __init__(self):
        self.first_name = None   # longest first name seen so far
        self.last_name = None    # longest last name seen so far
        self.full_name = None    # derived from first/last by complete()
        self.middle_name = None  # not assigned anywhere else in this class
        self.variants = []       # not appended to anywhere in this class
        self.index = None        # not set by this class; left for the caller

    def str_compare(self, s1, s2):
        """Score two strings: 1 if equal, 0.5 if within one edit, otherwise 0."""
        if s1 == s2:
            return 1
        # edit_distance lower-cases its inputs; "< 2" means at most one edit.
        if edit_distance(s1, s2) < 2:
            return 0.5
        return 0

    def is_same(self, s, category):
        """Score how well ``s`` matches this record for the given category.

        Args:
            s: Candidate text (compared case-insensitively).
            category: ``'first'``, ``'last'`` or ``'full'``.

        Returns:
            For ``'first'``/``'last'``: 0 when this record has no such
            component, otherwise the :meth:`str_compare` score (1, 0.5 or 0).
            For ``'full'``: 0 when a component this record knows fails to
            match, otherwise the average of the first- and last-name scores.
            ``None`` for any other category.
        """
        s = s.lower()

        if category == 'first':
            if self.first_name is None:
                return 0
            return self.str_compare(s, self.first_name.lower())

        if category == 'last':
            if self.last_name is None:
                return 0
            return self.str_compare(s, self.last_name.lower())

        if category == 'full':
            # Parse the candidate with the same rules fill() uses for records.
            candidate = Name()
            candidate.fill(s, category)

            # A one-word (or blank) candidate leaves a component unset; score
            # that component 0 instead of calling .lower() on None.
            if candidate.first_name is None:
                first = 0
            else:
                first = self.is_same(candidate.first_name, 'first')
            if candidate.last_name is None:
                last = 0
            else:
                last = self.is_same(candidate.last_name, 'last')

            first_conflicts = first == 0 and self.first_name is not None
            last_conflicts = last == 0 and self.last_name is not None
            if first_conflicts or last_conflicts:
                return 0
            return (first + last) / 2

        # Unknown categories yield None.
        return None

    def complete(self):
        """Refresh ``full_name`` from the known first/last components.

        With both components known, ``full_name`` is replaced only when it is
        unset or shorter than ``"<first> <last>"``. With a single component
        known, ``full_name`` is set to that component.
        """
        if self.first_name is not None and self.last_name is not None:
            combined = self.first_name + ' ' + self.last_name
            if self.full_name is None or len(self.full_name) < len(combined):
                self.full_name = combined
        elif self.first_name is not None:
            self.full_name = self.first_name
        elif self.last_name is not None:
            self.full_name = self.last_name

    def fill(self, s, category):
        """Merge ``s`` into this record, keeping the longer form of each part.

        Args:
            s: Text to record.
            category: ``'full'`` (split on whitespace into first + last),
                ``'first'`` or ``'last'`` (stored as given).

        Returns:
            str: ``str(first_name) + ' ' + str(last_name)`` after the update
            (an unset component therefore shows up as ``'None'``).

        Raises:
            ValueError: If ``category`` is not one of the three above.
        """
        # Split the input string into tokens
        tokens = s.split()

        if category == 'full':
            # First token is the first name; a blank string has no tokens and
            # leaves the record untouched instead of raising IndexError.
            if tokens:
                if self.first_name is None or len(tokens[0]) > len(self.first_name):
                    self.first_name = tokens[0]

            # Stays None for blank or one-word input, so the checks below
            # never read an unassigned variable.
            last_name = None
            rest = tokens[1:]
            if rest:
                if len(rest) > 1 and len(rest[-2]) <= 3:
                    # A short second-to-last token (<= 3 characters, e.g.
                    # "van") keeps every token after the first as the last name.
                    last_name = " ".join(rest)
                else:
                    last_name = rest[-1]

            # Use longer form of name when possible
            if last_name is not None:
                if self.last_name is None or len(last_name) > len(self.last_name):
                    self.last_name = last_name

        elif category == 'first':
            # Use longer form of name when possible
            if self.first_name is None or len(s) > len(self.first_name):
                self.first_name = s

        elif category == 'last':
            # Use longer form of name when possible
            if self.last_name is None or len(s) > len(self.last_name):
                self.last_name = s

        else:
            raise ValueError("Invalid category. Use 'full', 'first', or 'last'.")

        self.complete()

        return str(self.first_name) + ' ' + str(self.last_name)
            
        
        

In [256]:
# Start from an empty Name record; the following cells fill and query it.
j = Name()

In [257]:
# Record the first and last name separately; fill() returns "<first> <last>",
# and the second call's return value is what the cell displays.
j.fill('Rosy', 'first')
j.fill('Ronald', 'last')

'Rosy Ronald'

In [258]:
# full_name is derived by complete(), which fill() calls after every update.
j.full_name

'Rosy Ronald'

In [259]:
# 'Ronal' is one edit away from the stored 'Ronald', so this is a near match (0.5).
j.is_same('Ronal','last')

0.5

In [260]:
import re
import extraction as ex

def mask_type(mask):
    """Classify a mask token by the first known marker it contains.

    Markers are checked in a fixed order (FULL, DOB, FIRST, LAST), so a mask
    containing several markers is classified by the earliest one in that
    order.

    Args:
        mask: Mask text such as ``'[[FULL_NAME_X]]'``.

    Returns:
        ``'full'``, ``'dob'``, ``'first'`` or ``'last'``; ``None`` when no
        marker is present.
    """
    ordered_markers = (
        ("FULL", 'full'),
        ("DOB", 'dob'),
        ("FIRST", 'first'),
        ("LAST", 'last'),
    )
    for marker, label in ordered_markers:
        if marker in mask:
            return label
    return None

def change_index(mask, ind):
    """Return ``mask`` with its index placeholder replaced by ``ind``.

    The placeholder is the first run of digits, or the first standalone ``X``
    (an ``X`` with no letter on either side), whichever comes first. Replacing
    the whole digit run keeps re-indexing stable for indices of 10 and above:
    ``'[[FULL_NAME_10]]'`` re-indexed with 10 stays ``'[[FULL_NAME_10]]'``
    rather than becoming ``'[[FULL_NAME_100]]'``. Requiring ``X`` to stand
    alone prevents a letter inside a word (e.g. ``SUFFIX``) from being taken
    for the placeholder.

    Args:
        mask: Mask text such as ``'[[FULL_NAME_X]]'`` or ``'[[LAST_NAME_2]]'``.
        ind: New index; it is inserted as ``str(ind)``.

    Returns:
        str: The rewritten mask, or ``mask`` unchanged when it has no
        placeholder (e.g. ``'[[DOB]]'``).
    """
    pattern = r'\d+|(?<![A-Za-z])X(?![A-Za-z])'
    # A callable replacement inserts str(ind) literally, with no
    # backslash/group-reference processing by re.sub.
    return re.sub(pattern, lambda _match: str(ind), mask, count=1)
    

In [261]:
# Small worked example: a raw sentence and the same sentence with its names and
# date replaced by un-indexed mask tokens (index placeholder "X").
input_string = "Ethan Wang, Jerry Cain was born, Wang, on July 12, 1996"
masked_string = "[[FULL_NAME_X]], [[FULL_NAME_X]] was born, [[LAST_NAME_X]], on [[DOB]]"

# extract_masked_info comes from the external `extraction` module; per the
# output below it yields (mask, original_text) pairs in order of appearance.
p = ex.extract_masked_info(input_string, masked_string)
p

[('[[FULL_NAME_X]]', 'Ethan Wang'),
 ('[[FULL_NAME_X]]', 'Jerry Cain'),
 ('[[LAST_NAME_X]]', 'Wang'),
 ('[[DOB]]', 'July 12, 1996')]

In [262]:
def rectify(mask_pairs):
    """Re-index name masks so that each distinct person gets one index.

    Every name-type pair is compared against the people seen so far. The first
    person whose ``is_same`` score is above 0 lends its index to the mask and
    absorbs the text; otherwise a new person is created with the next free
    index. The scan runs twice so that pairs seen before a person's record was
    complete (e.g. a lone first name that appears before the full name) can
    still be linked on the second pass.

    Pairs whose mask is not a name type (``'dob'``, or a mask that
    ``mask_type`` does not recognise) are left untouched.

    Args:
        mask_pairs: List of ``(mask, content)`` tuples. The list is modified
            in place.

    Returns:
        The same ``mask_pairs`` list, with name masks re-indexed. The content
        of every pair is preserved exactly.
    """
    name_categories = ('full', 'first', 'last')
    next_index = 1
    names = []

    # repeat this procedure to connect up first and last names
    for _ in range(2):
        for i, (mask, content) in enumerate(mask_pairs):
            cur_type = mask_type(mask)

            # Only name masks take part; this also skips unrecognised masks,
            # for which mask_type returns None and Name would fail.
            if cur_type not in name_categories:
                continue

            # Match on the trimmed text so stray whitespace around the
            # captured content (e.g. ' Godfried') neither weakens the match
            # nor ends up stored on the record. The pair keeps the raw content.
            text = content.strip()

            for name in names:
                if name.is_same(text, cur_type) > 0:
                    mask_pairs[i] = (change_index(mask, name.index), content)
                    name.fill(text, cur_type)
                    break
            else:
                # No known person matched: register a new one.
                new_name = Name()
                new_name.fill(text, cur_type)
                new_name.index = next_index
                next_index += 1
                names.append(new_name)

    return mask_pairs

In [263]:
# Re-index the masks of the small example; rectify also updates `p` in place.
# NOTE(review): the saved output of this cell contains debug lines ("SAME",
# score dumps) that no print call in the visible definitions produces, so it
# was likely saved from an earlier version - re-run to refresh.
rectify(p)

jerry cain Ethan Wang
0 0
False False
SAME
ethan wang Ethan Wang
1 1
False False
SAME
jerry cain Ethan Wang
0 0
False False
jerry cain Jerry Cain
1 1
False False
SAME
SAME


[('[[FULL_NAME_1]]', 'Ethan Wang'),
 ('[[FULL_NAME_2]]', 'Jerry Cain'),
 ('[[LAST_NAME_1]]', 'Wang'),
 ('[[DOB]]', 'July 12, 1996')]

In [264]:
import pandas as pd

# Load the labelled examples from a CSV in the working directory; the next cell
# reads its `Unmasked` and `Masked` columns.
df = pd.read_csv('even_more_labels.csv')

In [265]:
# Pick one labelled example (positional row 5100) as the working pair.
# NOTE(review): `input` rebinds Python's builtin input() for the rest of the
# kernel session, and both names are overwritten by the hard-coded strings in
# the next cell.
input, masked = df.iloc[5100].Unmasked, df.iloc[5100].Masked

In [266]:
# Hard-coded test case that replaces the row loaded above.
# The mask indices do not line up with the people in the sentence (e.g. both
# 'Martindale' and 'Godfried' sit under LAST_NAME_2) - presumably on purpose,
# to give rectify something to fix.
# Note the double space before the second 'Godfried' in the raw sentence.
masked = "[[FIRST_NAME_1]], [[FIRST_NAME_2]] and the girls, [[FIRST_NAME_1]] [[FIRST_NAME_2]] was his name, [[FIRST_NAME_1]] born -- [[DOB]], and [[FULL_NAME_2]] ([[DOB]]). [[LAST_NAME_2]] and Mr. [[LAST_NAME_2]] the baker. [[LAST_NAME_2]] was not related to [[FULL_NAME_2]]"
input = 'Charlie, Marnie and the girls, Charlie Byun was his name, Byun born -- 12/21/2002, and Marnie Martindale (June 22, 1995). Martindale and Mr. Godfried the baker.  Godfried was not related to Marnie Martindale'

In [267]:
# Display the raw (unmasked) sentence.
input

'Charlie, Marnie and the girls, Charlie Byun was his name, Byun born -- 12/21/2002, and Marnie Martindale (June 22, 1995). Martindale and Mr. Godfried the baker.  Godfried was not related to Marnie Martindale'

In [268]:
# Display the masked template.
masked

'[[FIRST_NAME_1]], [[FIRST_NAME_2]] and the girls, [[FIRST_NAME_1]] [[FIRST_NAME_2]] was his name, [[FIRST_NAME_1]] born -- [[DOB]], and [[FULL_NAME_2]] ([[DOB]]). [[LAST_NAME_2]] and Mr. [[LAST_NAME_2]] the baker. [[LAST_NAME_2]] was not related to [[FULL_NAME_2]]'

In [269]:
# Pair every mask token in the template with the text it stands for in the raw
# sentence (helper from the external `extraction` module).
p = ex.extract_masked_info(input, masked)
p

[('[[FIRST_NAME_1]]', 'Charlie'),
 ('[[FIRST_NAME_2]]', 'Marnie'),
 ('[[FIRST_NAME_1]]', 'Charlie'),
 ('[[FIRST_NAME_2]]', 'Byun'),
 ('[[FIRST_NAME_1]]', 'Byun'),
 ('[[DOB]]', '12/21/2002'),
 ('[[FULL_NAME_2]]', 'Marnie Martindale'),
 ('[[DOB]]', 'June 22, 1995'),
 ('[[LAST_NAME_2]]', 'Martindale'),
 ('[[LAST_NAME_2]]', 'Godfried'),
 ('[[LAST_NAME_2]]', ' Godfried'),
 ('[[FULL_NAME_2]]', 'Marnie Martindale')]

In [270]:
# Re-index the masks so each person has one index. rectify returns the same
# list it was given, so the reassignment only keeps the name `p`.
p = rectify(p)
p

SAME
SAME
marnie martindale Charlie None
0 0
False False
marnie martindale Marnie None
1 0
False False
SAME
SAME
SAME
marnie martindale Charlie None
0 0
False False
marnie martindale Marnie Martindale
1 1
False False
SAME
SAME
SAME
SAME
SAME
SAME
marnie martindale Charlie None
0 0
False False
marnie martindale Marnie Martindale
1 1
False False
SAME
SAME
SAME
SAME
marnie martindale Charlie None
0 0
False False
marnie martindale Marnie Martindale
1 1
False False
SAME


[('[[FIRST_NAME_1]]', 'Charlie'),
 ('[[FIRST_NAME_2]]', 'Marnie'),
 ('[[FIRST_NAME_1]]', 'Charlie'),
 ('[[FIRST_NAME_3]]', 'Byun'),
 ('[[FIRST_NAME_3]]', 'Byun'),
 ('[[DOB]]', '12/21/2002'),
 ('[[FULL_NAME_2]]', 'Marnie Martindale'),
 ('[[DOB]]', 'June 22, 1995'),
 ('[[LAST_NAME_2]]', 'Martindale'),
 ('[[LAST_NAME_4]]', 'Godfried'),
 ('[[LAST_NAME_4]]', ' Godfried'),
 ('[[FULL_NAME_2]]', 'Marnie Martindale')]

In [271]:
# End-to-end round trip: extract pairs, rectify their indices, rewrite the
# template with the new masks, then rebuild the sentence and compare.
p = ex.extract_masked_info(input, masked)
p = rectify(p)
replacements = [pair[0] for pair in p]  # rectified mask tokens, in order
# NOTE(review): replace_masks / recover live in the external `extraction`
# module - presumably they swap the template's masks for `replacements` and
# substitute each pair's content back in; confirm against that module.
# This line overwrites `masked`, so re-running the cell starts from the
# already-rewritten template rather than the original one.
masked = ex.replace_masks(masked, replacements)
r = ex.recover(masked, p)
# True means the recovered text equals the original sentence.
r == input

SAME
SAME
marnie martindale Charlie None
0 0
False False
marnie martindale Marnie None
1 0
False False
SAME
SAME
SAME
marnie martindale Charlie None
0 0
False False
marnie martindale Marnie Martindale
1 1
False False
SAME
SAME
SAME
SAME
SAME
SAME
marnie martindale Charlie None
0 0
False False
marnie martindale Marnie Martindale
1 1
False False
SAME
SAME
SAME
SAME
marnie martindale Charlie None
0 0
False False
marnie martindale Marnie Martindale
1 1
False False
SAME


True

In [22]:
# Display the current pairs.
# NOTE(review): this cell's execution count (22) is lower than that of the
# cells above it, and its saved output differs from theirs (FULL_NAME_4 vs
# FULL_NAME_2) - it was probably run against an earlier version; re-run to
# confirm.
p

[('[[FIRST_NAME_1]]', 'Charlie'),
 ('[[FIRST_NAME_2]]', 'Marnie'),
 ('[[FIRST_NAME_1]]', 'Charlie'),
 ('[[FIRST_NAME_3]]', 'Byun'),
 ('[[FIRST_NAME_3]]', 'Byun'),
 ('[[DOB]]', '12/21/2002'),
 ('[[FULL_NAME_4]]', 'Marnie Martindale'),
 ('[[DOB]]', 'June 22, 1995'),
 ('[[LAST_NAME_4]]', 'Martindale'),
 ('[[LAST_NAME_5]]', 'Godfried'),
 ('[[LAST_NAME_5]]', ' Godfried'),
 ('[[FULL_NAME_4]]', 'Marnie Martindale')]

In [18]:
# Same extraction step as earlier in the notebook.
# NOTE(review): executed out of order (count 18); if run after the round-trip
# cell, `masked` already holds the rewritten template.
p = ex.extract_masked_info(input, masked)
p

[('[[FIRST_NAME_1]]', 'Charlie'),
 ('[[FIRST_NAME_2]]', 'Marnie'),
 ('[[FIRST_NAME_1]]', 'Charlie'),
 ('[[FIRST_NAME_2]]', 'Byun'),
 ('[[FIRST_NAME_1]]', 'Byun'),
 ('[[DOB]]', '12/21/2002'),
 ('[[FULL_NAME_2]]', 'Marnie Martindale'),
 ('[[DOB]]', 'June 22, 1995'),
 ('[[LAST_NAME_2]]', 'Martindale'),
 ('[[LAST_NAME_2]]', 'Godfried'),
 ('[[LAST_NAME_2]]', ' Godfried'),
 ('[[FULL_NAME_2]]', 'Marnie Martindale')]

In [19]:
# Same rectify step as earlier in the notebook; it also updates `p` in place.
# NOTE(review): executed out of order (count 19) and the saved output differs
# from the later run above - likely from an earlier version of rectify.
rectify(p)

[('[[FIRST_NAME_1]]', 'Charlie'),
 ('[[FIRST_NAME_2]]', 'Marnie'),
 ('[[FIRST_NAME_1]]', 'Charlie'),
 ('[[FIRST_NAME_3]]', 'Byun'),
 ('[[FIRST_NAME_3]]', 'Byun'),
 ('[[DOB]]', '12/21/2002'),
 ('[[FULL_NAME_4]]', 'Marnie Martindale'),
 ('[[DOB]]', 'June 22, 1995'),
 ('[[LAST_NAME_4]]', 'Martindale'),
 ('[[LAST_NAME_5]]', 'Godfried'),
 ('[[LAST_NAME_5]]', ' Godfried'),
 ('[[FULL_NAME_4]]', 'Marnie Martindale')]