In [None]:
import pandas as pd
import re, unicodedata
from evaluate import *
import spacy
from spacy.matcher import Matcher
from extractors.entity import Entity
import truecase

# Contractions (and common apostrophe-less spellings) expanded by preprocess().
_CONTRACTION_MAP = {
    'names': 'name is',
    'its': 'it is',
    "I'm": "I am",
    "i'm": "I am",
    "name's": "name is",
    "it's": "it is",
    "I've": "I have",
    "i've": "I have",
    "we've": 'We have',
}

# Whole-word, case-insensitive matcher for the keys above. The \b anchors stop
# keys from firing inside longer words ("fits" must not become "fit is").
_CONTRACTION_PATTERN = re.compile(
    r'\b(?:{})\b'.format('|'.join(
        re.escape(key) for key in sorted(_CONTRACTION_MAP, key=len, reverse=True))),
    flags=re.IGNORECASE,
)

# Pictographic symbols that are replaced by a space.
# The range used to include "\U000024C2-\U0001F251", which also covers CJK,
# fullwidth forms and mathematical alphanumerics; that blanked stylised letters
# that the NFKD step below can fold to plain ASCII.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002500-\U00002BFF"  # box drawing .. misc symbols & arrows (incl. dingbats)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+"
)

# Typographic quotes are mapped to ASCII before the ASCII fold; otherwise the
# fold deletes them and "I’m" degrades to "Im" instead of being expanded.
_QUOTE_TABLE = str.maketrans({
    '\u2018': "'", '\u2019': "'",
    '\u201c': '"', '\u201d': '"',
})


def _expand_contractions(text):
    """Expand the contractions in _CONTRACTION_MAP, then drop leftover apostrophes."""
    def expand(match):
        found = match.group(0)
        expansion = _CONTRACTION_MAP.get(found) or _CONTRACTION_MAP[found.lower()]
        # Keep the first character as written so the original casing survives.
        return found[0] + expansion[1:]

    return _CONTRACTION_PATTERN.sub(expand, text).replace("'", "")


def preprocess(text):
    """
        Preprocesses the text: expanding contractions, removing emojis and punctuation marks
    Args:
        text (str): the text to be preprocessed; any non-string value
            (e.g. the float NaN pandas uses for missing cells, or None) is
            treated as missing
    Returns:
        str: the text after being preprocessed ('' for missing input)
    """
    # return empty string if text is NaN / missing
    if not isinstance(text, str):
        return ''
    # remove emoji
    text = _EMOJI_PATTERN.sub(' ', text)
    text = text.replace('·', ' ')
    text = text.translate(_QUOTE_TABLE)
    # fold compatibility characters to ASCII and drop whatever cannot be folded
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # strip anything that looks like an HTML tag
    text = re.sub(r'<.*?>', ' ', text)
    text = _expand_contractions(text)
    # remaining punctuation becomes a space; single ! , . ? are kept
    text = re.sub(r'[\'·\"”#$%&’()*+/:;<=>@[\]^_`{|}~-]+', ' ', text)
    # runs of sentence punctuation ("!!!", "...") collapse to a space
    text = re.sub(r'[!,.?]{2,}\s?', ' ', text)
    text = re.sub(r'[\s]+', ' ', text)
    return truecase.get_true_case(text)

### Plan
1. mask
2. context+word vector -> knn

In [None]:
import torch
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [None]:
# One byte-level BPE tokenizer per model directory, built in the order v1, v2, v3.
# (model_max_length is deliberately not passed; truncation is configured separately.)
tokenizer, tokenizer_v2, tokenizer_v3 = (
    ByteLevelBPETokenizer(vocab_path, merges_path)
    for vocab_path, merges_path in (
        ("ht_bert/vocab.json", "ht_bert/merges.txt"),
        ("ht_bert_v2/vocab.json", "ht_bert_v2/merges.txt"),
        ("ht_bert_v3/vocab.json", "ht_bert_v3/merges.txt"),
    )
)

In [None]:
def _configure_special_tokens(tok):
    """Attach the </s> / <s> post-processor to `tok` and truncate encodings at 512 tokens."""
    sep = ("</s>", tok.token_to_id("</s>"))
    cls = ("<s>", tok.token_to_id("<s>"))
    tok._tokenizer.post_processor = BertProcessing(sep, cls)
    tok.enable_truncation(max_length=512)


_configure_special_tokens(tokenizer)
_configure_special_tokens(tokenizer_v2)
_configure_special_tokens(tokenizer_v3)

# print(tokenizer.encode("Hi im Emma, tonight my sexy friend Brielle will be joing the fun."))

# print(tokenizer.encode("Hi im Emma, tonight my sexy friend Brielle will be joing the fun.").tokens)

In [None]:
from transformers import pipeline

# One fill-mask pipeline per trained model; each returns its 15 best candidates.
fill_mask, fill_mask_v2, fill_mask_v3 = (
    pipeline("fill-mask", model=model_dir, tokenizer=model_dir, top_k=15)
    for model_dir in ("ht_bert", "ht_bert_v2", "ht_bert_v3")
)

In [None]:
# Smoke test: show the v3 model's top-15 candidates for a typical self-introduction.
fill_mask_v3("Hi my name is <mask>")

In [None]:
import pandas as pd
from extractors.name_extractor import NameExtractor

# Ads to analyse (the cells below read the title, description and True columns).
# Alternative input, kept for reference:
# df = pd.read_csv('data_m/CanadaMax80_results.tsv', sep='\t')
df = pd.read_csv('data/crowdsource_filtered_ne_roberta_2.tsv', sep='\t')

# Lower-cased, whitespace-trimmed name dictionary used by compute_ratio.
name_df = pd.read_csv('extractors/src/nameslist.csv')
name_set = {name.lower().strip() for name in name_df['Name']}

# Name extractor: dictionary lookup first, rule-based extraction as backoff.
ne = NameExtractor(primary=['dict'], backoff=['rule'])

In [None]:
# Number of rows loaded from the TSV.
len(df)

In [None]:
# Dump every ad as: title, description, blank line, separator rule.
separator = '=' * 100
for title, description in zip(df['title'], df['description']):
    print(title, description, '', separator, sep='\n')

In [None]:
# compute the ratio in the result
def compute_ratio(fill_mask_sim, ne_result, names=None):
    """
    Share of fill-mask predictions (other than the masked word itself) that are known names.

    Args:
        fill_mask_sim (list[dict]): fill-mask pipeline output; each item needs a 'token_str'
        ne_result (str): the lower-cased word that was masked; predictions equal to it are
            left out of both the numerator and the denominator
        names (set[str] | None): lower-cased name dictionary; defaults to the
            notebook-level `name_set`
    Returns:
        float: in-dictionary predictions / remaining predictions, or 0.0 when no
        prediction remains (empty input, or every prediction was the masked word)
    """
    if names is None:
        names = name_set

    # Depending on the transformers version, token_str carries either the BPE
    # word-start marker 'Ġ' or a decoded leading space, so both are removed.
    predictions = [r['token_str'].strip('Ġ').strip().lower() for r in fill_mask_sim]

    # Drop the masked word before counting. The earlier version decremented the
    # in-dictionary counter even when the word had never been counted, which
    # could drive the ratio negative, and divided by zero when nothing was left.
    others = [tok for tok in predictions if tok != ne_result]
    if not others:
        return 0.0
    return sum(tok in names for tok in others) / len(others)

In [None]:
def compute_sigma(fill_mask_result, word):
    """
    Population standard deviation of the prediction scores.

    Args:
        fill_mask_result (list[dict]): fill-mask pipeline output; each item needs a 'score'
        word (str): unused; kept so existing callers keep working
    Returns:
        float: standard deviation of the scores, or 0.0 for an empty result
    """
    # Imported here because the notebook never imports numpy itself; `np` was only
    # available if another module happened to leak it through a star import.
    import numpy as np

    scores = [result['score'] for result in fill_mask_result]
    if not scores:
        # np.std([]) would return NaN and emit a RuntimeWarning.
        return 0.0
    return np.std(np.array(scores))

def compute_range(fill_mask_result, word):
    """
    Spread between the highest and lowest prediction score.

    Args:
        fill_mask_result (list[dict]): fill-mask pipeline output; each item needs a 'score'
        word (str): unused; kept so existing callers keep working
    Returns:
        float: max(score) - min(score), or 0.0 for an empty result
    """
    scores = [result['score'] for result in fill_mask_result]
    if not scores:
        # max()/min() raise ValueError on an empty sequence.
        return 0.0
    return max(scores) - min(scores)

In [None]:
from spacy.lang.en import English

# NLP = English()
# sentencizer = NLP.create_pipe("sentencizer")
# NLP.add_pipe(sentencizer)

def disambiguate_layer(context, entities, fill_mask, window_size=5):
    '''
    Score each extracted entity by masking it in its context and asking the
    language model what it would put there.

    context: the sentence that include the query word
    entities: results from the extractor, a list of entities
    fill_mask: callable taking the masked window (str) and returning a list of
        predictions with 'token_str' and 'score' keys
    window_size: number of tokens kept before the word; the right edge is
        exclusive, so up to window_size - 1 tokens are kept after it
        (NOTE(review): possibly an off-by-one; left unchanged so scores stay comparable)

    return: the list of entities that could be located in the context, each with
        fill_mask_conf set to the name ratio and fill_mask_std set to
        (score std, score range). Entities whose text is not a single token of
        the cleaned context are skipped.
    '''
    # sanity check
    if not entities:
        return []

    # preprocess the context and tokenise once for all entities
    tokens = preprocess(context).lower().split()
    # preprocess() keeps single ! , . ? characters, so "emma," must still match "emma"
    bare_tokens = [tok.strip('!,.?') for tok in tokens]

    scored = []
    for ent in entities:
        word = ent.text.lower()
        if not word:
            continue
        try:
            word_idx = bare_tokens.index(word)
        except ValueError:
            # Skip only this entity. The earlier version returned {} here, which
            # dropped every other entity of the ad as well.
            continue

        # select the context window for the word
        start = max(0, word_idx - window_size)
        end = min(len(tokens), max(word_idx + window_size, word_idx + 1))
        window_tokens = tokens[start:end]
        # Mask the located token itself; a substring replace on the joined window
        # could hit an earlier word that merely contains the name.
        window_tokens[word_idx - start] = tokens[word_idx].replace(word, '<mask>', 1)
        window = ' '.join(window_tokens)

        fill_mask_sim = fill_mask(window)
        ent.fill_mask_conf = compute_ratio(fill_mask_sim, word)
        ent.fill_mask_std = (compute_sigma(fill_mask_sim, ent.text),
                             compute_range(fill_mask_sim, ent.text))
        scored.append(ent)

    return scored

### Experiment 1
Goal: check whether the standard deviation of the fill-mask scores differs between true names and non-names (distribution and average).
Attributes: score, std, max, (min)
Steps:
1. Split the dataset randomly into 80/20.
2. Compute the sigma for each candidate extracted by the extractor and store it in the std field.
3. Group the results by whether each candidate is a true name.

In [None]:
from ast import literal_eval
# Score the extractor's candidates for the first 100 ads with the first fill-mask model.
disambiguated_entities = []
for description, title in zip(df['description'][:100], df['title'][:100]):
    ad_text = description + ' ' + title
    candidates = ne.extract(ad_text)
    disambiguated_entities.append(disambiguate_layer(ad_text, candidates, fill_mask))

In [None]:
# Summary of the loaded frame, used here to check which columns are available.
df.info()

In [None]:
# Split each ad's scored candidates by whether they appear in the gold 'True' list.
correct = []    # per ad: candidates that are true names
incorrect = []  # per ad: candidates that are not

for idx in range(len(df[:100])):
    true_names = literal_eval(df['True'][idx])
    hits, misses = [], []
    for ent in disambiguated_entities[idx]:
        bucket = hits if ent.text.lower() in true_names else misses
        bucket.append(ent)
    correct.append(hits)
    incorrect.append(misses)


In [None]:
# fill_mask_std values of every candidate that matched a true name, flattened across ads.
print([ent.fill_mask_std for ents in correct for ent in ents])

In [None]:
# fill_mask_std values of every candidate that did not match a true name, flattened across ads.
print([ent.fill_mask_std for ents in incorrect for ent in ents])

In [None]:
import plotly.express as px
# Flatten the per-ad lists once, then build the plotting columns from the flat list.
correct_flat = [ent for ents in correct for ent in ents]
correct_data = {
    'text': [ent.text for ent in correct_flat],
    'std': [ent.fill_mask_std[0] for ent in correct_flat],
    'range': [ent.fill_mask_std[1] for ent in correct_flat],
    'score': [ent.fill_mask_conf for ent in correct_flat],
}

# Score against score-std for the true names only.
fig = px.scatter(correct_data, x="std", y='score', hover_data=['text', 'std', 'score'])
fig.show()

In [None]:
# Same columns as correct_data, for the candidates that are not true names.
incorrect_flat = [ent for ents in incorrect for ent in ents]
incorrect_data = {
    'text': [ent.text for ent in incorrect_flat],
    'std': [ent.fill_mask_std[0] for ent in incorrect_flat],
    'range': [ent.fill_mask_std[1] for ent in incorrect_flat],
    'score': [ent.fill_mask_conf for ent in incorrect_flat],
}
# fig = px.scatter(incorrect_data, x="std",y='score', hover_data=['text','std', 'score'])
# fig.show()

# Stack both groups and tag every row with its group for colouring.
total_data = {
    column: correct_data[column] + incorrect_data[column]
    for column in ('text', 'std', 'range', 'score')
}
total_data['label'] = (['correct'] * len(correct_data['text'])
                       + ['incorrect'] * len(incorrect_data['text']))

fig = px.scatter_matrix(total_data, dimensions=["std", "range", "score"], color='label')
fig.show()

### Experiment 2
Check whether the word we masked is among the model's top returned predictions (the code below checks the top 3).

In [None]:
def check_top_pred(context, entities, fill_mask, window_size=5, top_n=3):
    '''
    Flag entities whose masked slot the language model can fill with the very same word.

    context: the sentence that include the query word
    entities: results from the extractor, a list of entities
    fill_mask: callable taking the masked window (str) and returning a list of
        predictions with 'token_str' and 'score' keys
    window_size: number of tokens kept before the word; the right edge is
        exclusive, so up to window_size - 1 tokens are kept after it
    top_n: how many of the leading predictions are compared with the masked word

    return: the list of entities that could be located in the context, each with
        fill_mask_conf set to 0 (word is among the top_n predictions) or 1
        (it is not), and fill_mask_std set to (score std, score range).
        Entities whose text is not a single token of the cleaned context are skipped.
    '''
    # sanity check
    if not entities:
        return []

    # preprocess the context and tokenise once for all entities
    tokens = preprocess(context).lower().split()
    # preprocess() keeps single ! , . ? characters, so "emma," must still match "emma"
    bare_tokens = [tok.strip('!,.?') for tok in tokens]

    checked = []
    for ent in entities:
        word = ent.text.lower()
        if not word:
            continue
        try:
            word_idx = bare_tokens.index(word)
        except ValueError:
            # Skip only this entity instead of returning {} for the whole ad.
            continue

        # select the context window for the word
        start = max(0, word_idx - window_size)
        end = min(len(tokens), max(word_idx + window_size, word_idx + 1))
        window_tokens = tokens[start:end]
        # Mask the located token itself rather than the first matching substring.
        window_tokens[word_idx - start] = tokens[word_idx].replace(word, '<mask>', 1)
        window = ' '.join(window_tokens)

        fill_mask_sim = fill_mask(window)
        # Slicing tolerates pipelines that return fewer than top_n predictions.
        top_predictions = [pred['token_str'].strip('Ġ').strip().lower()
                           for pred in fill_mask_sim[:top_n]]

        # confidence score is set to be if the masked word is the same as the predicted word
        # 0: unlikely to be a name since the context is highly possible for a particular word
        # The comparison uses the lower-cased word: predictions are lower-cased, so
        # comparing the original-case entity text never matched capitalised names.
        if word in top_predictions:
            conf = 0
            print(window)
        else:
            conf = 1

        ent.fill_mask_conf = conf
        ent.fill_mask_std = (compute_sigma(fill_mask_sim, ent.text),
                             compute_range(fill_mask_sim, ent.text))
        checked.append(ent)

    return checked

In [None]:
from ast import literal_eval
# Run the top-prediction check with the v3 model over every ad in the frame.
disambiguated_entities = []
for description, title in zip(df['description'], df['title']):
    ad_text = description + ' ' + title
    candidates = ne.extract(ad_text)
    disambiguated_entities.append(check_top_pred(ad_text, candidates, fill_mask_v3))

In [None]:
# Print every candidate with confidence 0, followed by the full ad text it came from.
for idx, ents in enumerate(disambiguated_entities[:len(df)]):
    for ent in ents:
        if ent.fill_mask_conf != 0:
            continue
        ad_text = df['description'][idx] + ' ' + df['title'][idx]
        print('=' * 100, ent.text, ad_text, ent.fill_mask_conf, sep='\n')
        

In [None]:
# Probe the v3 model with a punctuation-heavy context around the mask.
fill_mask_v3('.! hi my name is <mask> . . im sexy')

In [None]:
# Split the candidates of the first 100 ads by whether they appear in the gold 'True' list.
correct = []    # per ad: candidates that are true names
incorrect = []  # per ad: candidates that are not

for idx in range(len(df[:100])):
    true_names = literal_eval(df['True'][idx])
    hits, misses = [], []
    for ent in disambiguated_entities[idx]:
        bucket = hits if ent.text.lower() in true_names else misses
        bucket.append(ent)
    correct.append(hits)
    incorrect.append(misses)

# Confidence flags of the non-name candidates, flattened across ads.
print([ent.fill_mask_conf for ents in incorrect for ent in ents])

## Current options for disambiguation
- ratio
- if top prediction == masked word
- std / range (doesn't perform well)
- vector similarity (still exploring)

### old code

In [None]:
from ast import literal_eval
def _names_kept_by(fill_mask_pipeline, n_rows=100):
    """
    For each of the first `n_rows` ads, list the extracted candidates (lower-cased)
    whose name ratio under `fill_mask_pipeline` is above zero.

    disambiguate_layer returns entity objects, so the ratio is read from
    ent.fill_mask_conf and the word from ent.text; the earlier loops indexed
    result['ratio'] / result['word'] as if dicts were returned.
    """
    kept_per_ad = []
    for idx in range(len(df[:n_rows])):
        cnt = df['description'][idx] + ' ' + df['title'][idx]
        result_entity = ne.extract(cnt)
        results = disambiguate_layer(cnt, result_entity, fill_mask_pipeline)
        kept_per_ad.append([ent.text.lower() for ent in results if ent.fill_mask_conf > 0])
    return kept_per_ad


filtered_results_v1 = _names_kept_by(fill_mask)
filtered_results_v2 = _names_kept_by(fill_mask_v2)

In [None]:
# For ads where the second model kept at least one name, show both models' picks.
for idx, v2_names in enumerate(filtered_results_v2):
    if not v2_names:
        continue
    ad_text = df['description'][idx] + ' ' + df['title'][idx]
    print(ad_text)
    print('1st bert model:', filtered_results_v1[idx])
    print('2nd bert model:', v2_names)
    print()

In [None]:
# Compare the raw extractor output with what each model kept.
# The bound follows the result lists (built from df[:100]) instead of a
# hard-coded 100, so a frame with fewer rows no longer raises an IndexError.
for idx in range(min(len(filtered_results_v1), len(filtered_results_v2))):
    cnt = df['description'][idx]+' '+df['title'][idx]
    result_entity = ne.extract(cnt)
    ne_result = [ent.text for ent in result_entity]

    if ne_result:
        print(cnt)
        print('Results from dict&rule extracotr:', ne_result)
        print('1st bert model:', filtered_results_v1[idx])
        print('2nd bert model:', filtered_results_v2[idx])
        print()

In [None]:
# Manual probes of the v2 model; the trailing comment on a line names the word that was masked.
# Only the last, uncommented probe runs; its predictions are kept in `f` for the next cell.
# f = fill_mask_v2('$180/hr Worth every <mask> Multi hr Specials!')
# f = fill_mask_v2('I am doing duo with <mask> great reviews on') #lilia
f = fill_mask_v2('THANKS for view :Im <mask>,friendly and happy latina') #karla
print(f)

In [None]:
# Score spread of the predictions in `f`; compute_sigma does not use its word argument ('penny').
compute_sigma(f, 'penny')

# LM-KNN

In [None]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM

# Architecture description, kept for reference. It is not passed to
# from_pretrained() below, which reads the configuration stored with the checkpoint.
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

# from_pretrained is a classmethod. The earlier form first built a randomly
# initialised RobertaForMaskedLM(config) and then called from_pretrained on that
# instance, which allocated a full model only to discard it.
model = RobertaForMaskedLM.from_pretrained("ht_bert_v2")

In [None]:
import torch.nn as nn

class Identity(nn.Module):
    """Parameter-free pass-through layer: forward() hands back its input untouched."""

    def forward(self, x):
        # No computation and no copy: the same object goes straight through.
        return x

# Swap the language-modelling head for a pass-through, so whatever is fed to
# the head comes out of the model unchanged.
model.lm_head = Identity()

In [None]:
# Feature extractor built on the head-less model loaded above.
# The tokenizer must come from the same checkpoint as the model ("ht_bert_v2");
# pairing it with the "ht_bert_v3" tokenizer would feed the v2 embedding table
# token ids from a different vocabulary.
get_embeddings = pipeline(
    "feature-extraction",
    model=model,
    tokenizer="ht_bert_v2"
)

In [None]:
# Single example sentence that is actually embedded below.
sent = 'Hi im Emma, tonight my sexy friend Brielle will be joing the fun.'
# Two ads that differ only in the final name; defined here but not used in this cell.
sents = ["Amazing outcall with a blonde Latina bombshell 647-470-6038 Carmalita",
         "Amazing outcall with a blonde Latina bombshell 647-470-6038 AMBER"]
# toks = tokenizer.encode(sents).tokens
embeddings = get_embeddings(sent)

In [None]:
# Length of the innermost list for the first item; presumably the per-token
# vector size, assuming the output is nested [sequence][token][feature] (TODO confirm).
len(embeddings[0][0])