In [None]:
# default_exp token_emb
# all_data

In [None]:
# hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Token embeddings

## Imports

In [None]:
from flair.data import Sentence

In [None]:
from flair.embeddings import WordEmbeddings

In [None]:
from flair.embeddings import TransformerWordEmbeddings

In [None]:
from torch.nn import CosineSimilarity

### Load language model

In [None]:
MODEL = TransformerWordEmbeddings('roberta-base')

## Pre-processing

### Class-less version

In [None]:
LEX = 'bank'

In [None]:
sents_strings = [
    'He found money at the bank.', 
    'She works at a bank.', 
    "The boat reached the river's bank."
]

In [None]:
# export
def proc_sentence(sent_string):
    """Wrap a raw string in a flair `Sentence`.

    Args:
        sent_string: Text handed to the `Sentence` constructor.

    Returns:
        The `Sentence` object built from `sent_string`.
    """
    return Sentence(sent_string)

In [None]:
sent_string = "I can't do this."
sent = proc_sentence(sent_string)

In [None]:
assert len(sent) == 6

In [None]:
for tok in sent:
    print(tok, tok.embedding)

In [None]:
def embed_sentence(sent, MODEL):
    """Run `MODEL.embed` on `sent` and hand the same object back.

    Args:
        sent: Object to embed; passed unchanged to `MODEL.embed`.
        MODEL: Any object exposing an `embed` method.

    Returns:
        `sent` itself; whatever `MODEL.embed` returns is discarded.
    """
    MODEL.embed(sent)
    return sent

In [None]:
sent_emb = embed_sentence(sent, MODEL)

In [None]:
for tok in sent_emb:
    print(tok, tok.embedding.shape)

In [None]:
assert len(sent_emb) == 6

In [None]:
assert sent_emb[0].embedding.shape[0] == 768

In [None]:
def get_sent_toks(sent, lex):
    """Collect the tokens of `sent` whose text is exactly `lex`.

    Args:
        sent: Iterable of token objects exposing a `.text` attribute.
        lex: Word form to look for (exact, case-sensitive comparison).

    Returns:
        List of matching tokens in their original order; empty if none match.
    """
    return [tok for tok in sent if tok.text == lex]

In [None]:
lex = 'cell'
sent = proc_sentence('A cell is a cell.')
sent_emb = embed_sentence(sent, MODEL)
sent_toks = get_sent_toks(sent_emb, lex)

In [None]:
assert len(sent_toks) == 2

In [None]:
def get_sent_vec(sent_toks, lex):
    """Return the embedding of the first token in `sent_toks`.

    Args:
        sent_toks: Sequence of tokens exposing an `.embedding` attribute.
        lex: Accepted for signature symmetry; not used by the body.

    Returns:
        The `.embedding` of the first token.

    Raises:
        IndexError: If `sent_toks` is empty.
    """
    embeddings = []
    for target in sent_toks:
        embeddings.append(target.embedding)
    # Only the first occurrence is used; averaging over all target tokens
    # would be the alternative.
    return embeddings[0]

In [None]:
assert get_sent_vec(sent_toks, lex).shape[0] == 768

In [None]:
def get_cos_sim(vec_1, vec_2):
    """Cosine similarity of two vectors as a plain Python float.

    Args:
        vec_1: First tensor; compared along dimension 0.
        vec_2: Second tensor; compared along dimension 0.

    Returns:
        `float(...)` of the `CosineSimilarity(dim=0)` result.
    """
    similarity = CosineSimilarity(dim=0)(vec_1, vec_2)
    return float(similarity)

In [None]:
def get_sent_sim(sent_1, sent_2, lex):
    """Cosine similarity between the `lex` token vectors of two sentences.

    For each sentence the tokens whose text equals `lex` are selected with
    `get_sent_toks`, reduced to one vector with `get_sent_vec` (first match
    only), and the two vectors are compared with `get_cos_sim`.

    Args:
        sent_1: First sentence; iterable of tokens exposing `.text` and
            `.embedding`. Presumably already embedded (e.g. via
            `embed_sentence`) -- this function does not embed.
        sent_2: Second sentence, same expectations as `sent_1`.
        lex: Target word form to compare across the two sentences.

    Returns:
        The similarity as a Python float.

    Raises:
        IndexError: If `lex` does not occur in one of the sentences
            (propagated from `get_sent_vec`).
    """
    sent_vec_1 = get_sent_vec(get_sent_toks(sent_1, lex), lex)
    sent_vec_2 = get_sent_vec(get_sent_toks(sent_2, lex), lex)
    return get_cos_sim(sent_vec_1, sent_vec_2)

### Class version

#### Imports

In [None]:
from fastcore.basics import patch

#### Variables

In [None]:
lex = 'bank'

In [None]:
sents_strings = [
    'He found money at the bank.', 
    'She works at a bank.', 
    "The boat reached the river's bank."
]

In [None]:
sent_string = sents_strings[0]

#### Initialize `Sent` class

In [None]:
class Sent():
    """Pairs a raw sentence string with the embedding model to use on it."""

    def __init__(self, string, model):
        # string: the raw sentence text
        # model: object later used through its `embed` method
        self.string, self.model = string, model

In [None]:
sent = Sent(sent_string, MODEL)

In [None]:
sent.string

'He found money at the bank.'

#### Tokenize sentence

In [None]:
@patch(as_prop=True)
def sent(self: Sent):
    """Build a `Sentence` from `self.string`.

    Attached to `Sent` through `patch(as_prop=True)` and read in this
    notebook as `obj.sent`. A new `Sentence` is constructed on every
    access, so objects from separate accesses do not share state.

    NOTE(review): this `def` also rebinds the notebook-level name `sent`
    to whatever the decorator returns, so any `Sent` instance previously
    stored under `sent` has to be re-created before it is used again.
    """
    sentence = Sentence(self.string)
    return sentence

In [None]:
for tok in sent.sent:
    print(tok)

Token: 1 He
Token: 2 found
Token: 3 money
Token: 4 at
Token: 5 the
Token: 6 bank
Token: 7 .


In [None]:
assert len(sent.sent) == 7

#### Embed sentence

In [None]:
@patch
def get_sent_emb(self: Sent):
    """Embed this sentence with `self.model` and return the embedded object.

    Reads `self.sent` once, passes that object to `self.model.embed`, and
    returns the same object (not the return value of `embed`).
    """
    sentence = self.sent
    self.model.embed(sentence)
    return sentence

In [None]:
sent = Sent(sent_string, MODEL)

In [None]:
sent_emb = sent.get_sent_emb()

In [None]:
for tok in sent_emb:
    print(tok, tok.embedding.shape)

Token: 1 He torch.Size([768])
Token: 2 found torch.Size([768])
Token: 3 money torch.Size([768])
Token: 4 at torch.Size([768])
Token: 5 the torch.Size([768])
Token: 6 bank torch.Size([768])
Token: 7 . torch.Size([768])


In [None]:
assert len(sent_emb) == 7

In [None]:
assert sent_emb[0].embedding.shape[0] == 768

#### Get target tokens

In [None]:
def get_toks_tgt(sent, lex):
    """Pick out the target tokens of a sentence.

    Args:
        sent: Iterable of token objects exposing a `.text` attribute.
        lex: Target word form; compared to `tok.text` with `==`.

    Returns:
        List of the tokens whose text equals `lex`, in sentence order.
    """
    return list(filter(lambda tok: tok.text == lex, sent))

In [None]:
# Search the embedded sentence: `sent.sent` builds a brand-new `Sentence` on
# every access, so tokens taken from it carry no embedding.
sent_toks_tgt = get_toks_tgt(sent_emb, lex)

In [None]:
assert len(sent_toks_tgt) == 1

#### Tokens

In [None]:
sent_toks_tgt

[Token: 6 bank]

In [None]:
sent.string

'He found money at the bank.'

In [None]:
class Tok:
    """Token record: surface text, the sentence string it came from, and its embedding."""

    def __init__(self, Token, Sent):
        # `Token` supplies `.text` and `.embedding`; `Sent` supplies `.string`.
        # Inside this method the parameter `Sent` shadows the `Sent` class.
        self.text, self.context, self.embedding = (
            Token.text,
            Sent.string,
            Token.embedding,
        )

In [None]:
# `sent_emb` is the object returned by `sent.get_sent_emb()`, i.e. the embedded
# sentence itself; it has no `.sent` attribute, so iterate it directly.
for tok in sent_emb:
    tok_rec = Tok(tok, sent)
    print(tok_rec.text, tok_rec.context, tok_rec.embedding)

AttributeError: 'Sentence' object has no attribute 'sent'

In [None]:
# Iterate the target tokens collected in `sent_toks_tgt`; no variable named
# `toks` has been assigned at this point of the notebook.
for tok in sent_toks_tgt:
    print(tok, tok.embedding)

Token: 6 bank tensor([])


In [None]:
def get_sent_vec(sent_toks, lex):
    """Vector representing a sentence: the embedding of its first target token.

    Args:
        sent_toks: Sequence of target tokens exposing `.embedding`.
        lex: Not used by the body; kept in the signature.

    Returns:
        `.embedding` of the first element of `sent_toks`.

    Raises:
        IndexError: If `sent_toks` is empty.
    """
    vecs = list(map(lambda tok: tok.embedding, sent_toks))
    # First target token only; the mean over all target tokens would be
    # the alternative.
    return vecs[0]

In [None]:
# Take the target tokens from the embedded sentence of this section instead of
# `sent_toks`, which belongs to the class-less section above.
assert get_sent_vec(get_toks_tgt(sent_emb, lex), lex).shape[0] == 768

NameError: name 'sent_toks' is not defined

In [None]:
def get_cos_sim(vec_1, vec_2):
    """Compare two vectors with cosine similarity along dimension 0.

    Args:
        vec_1: First tensor.
        vec_2: Second tensor.

    Returns:
        The similarity converted with `float(...)`.
    """
    metric = CosineSimilarity(dim=0)
    return float(metric(vec_1, vec_2))

In [None]:
def get_sent_sim(sent_1, sent_2, lex):
    """Cosine similarity between the `lex` token vectors of two sentences.

    Target tokens are selected with `get_toks_tgt`, reduced to one vector per
    sentence with `get_sent_vec` (first match only), and compared with
    `get_cos_sim`.

    Args:
        sent_1: First sentence; iterable of tokens exposing `.text` and
            `.embedding`. Presumably already embedded (e.g. the result of
            `Sent.get_sent_emb()`) -- this function does not embed.
        sent_2: Second sentence, same expectations as `sent_1`.
        lex: Target word form to compare across the two sentences.

    Returns:
        The similarity as a Python float.

    Raises:
        IndexError: If `lex` does not occur in one of the sentences
            (propagated from `get_sent_vec`).
    """
    sent_vec_1 = get_sent_vec(get_toks_tgt(sent_1, lex), lex)
    sent_vec_2 = get_sent_vec(get_toks_tgt(sent_2, lex), lex)
    return get_cos_sim(sent_vec_1, sent_vec_2)

## Type-based embeddings

In [None]:
embedding = WordEmbeddings('glove')

In [None]:
sentence = Sentence('I can can a can.')
embedding.embed(sentence)

In [None]:
for token in sentence:
    print(token)
    print(token.embedding.shape)

## Token-based embeddings

### Toy example: 'bank'

In [None]:
# Build the sentences from the raw strings; `sents` does not exist yet here.
sents = [proc_sentence(sent_str) for sent_str in sents_strings]

In [None]:
# Embed each sentence, then take the vector of its `lex` token. Passing the
# whole sentence to `get_sent_vec` would return the first token's vector.
sent_vecs = [
    get_sent_vec(get_sent_toks(embed_sentence(sent, MODEL), lex), lex)
    for sent in sents
]

In [None]:
sent_vecs[0].shape

In [None]:
len(sent_vecs)

In [None]:
get_cos_sim(sent_vecs[0], sent_vecs[1])

In [None]:
# Pairwise similarities of the three 'bank' vectors, using the helper and the
# vector list defined in this notebook.
print(
    get_cos_sim(sent_vecs[0], sent_vecs[1]),
    get_cos_sim(sent_vecs[0], sent_vecs[2]),
    get_cos_sim(sent_vecs[1], sent_vecs[2]),
)

In [None]:
# The two financial-institution uses (sentences 0 and 1) should be closer to
# each other than either is to the river-bank use (sentence 2).
assert get_cos_sim(sent_vecs[0], sent_vecs[1]) > get_cos_sim(sent_vecs[0], sent_vecs[2])
assert get_cos_sim(sent_vecs[0], sent_vecs[1]) > get_cos_sim(sent_vecs[1], sent_vecs[2])

### 'Cell'

In [None]:
lex = 'cell'

In [None]:
comments = [
    'I bought a new cell phone yesterday.',
    'Stem cell research has found a cure for cancer.'
]

In [None]:
sents = [Sentence(comm) for comm in comments]

In [None]:
sents_embed = embedding.embed([sent for sent in sents])

In [None]:
# Gather the `lex` tokens from the sentences embedded in the previous cell;
# `toks` is not assigned in any earlier cell visible in this notebook.
toks = [tok for sent in sents for tok in get_sent_toks(sent, lex)]
vecs = [tok.embedding for tok in toks]

In [None]:
# Iterate the already-embedded sentence; a freshly constructed `Sentence` has
# not been passed through `embedding.embed`.
for tok in sents[0]:
    if tok.text == 'cell':
        print(tok, tok.embedding)

### 'Anglo-Saxon' 

In [None]:
from flair.data import Sentence

In [None]:
from flair.embeddings import TransformerWordEmbeddings

In [None]:
from random import choices

In [None]:
txt = df.body[0]

In [None]:
txt = "This is a sentence. This is another sentence. I love Berlin."

In [None]:
# `SegtokSentenceSplitter` is not imported in any cell visible in this notebook.
# NOTE(review): its module differs between flair releases -- confirm against the
# installed version.
try:
    from flair.splitter import SegtokSentenceSplitter
except ImportError:
    from flair.tokenization import SegtokSentenceSplitter

splitter = SegtokSentenceSplitter()

In [None]:
sents = splitter.split(txt)

print(len(sents), '\n')

for sent in sents:
    print(len(sent))

In [None]:
# `choices` is imported directly (`from random import choices`); the module
# name `random` itself is never bound.
choices(sents, k=3)

In [None]:
# `choices` is imported directly (`from random import choices`).
# FIXME(review): `df` is not defined in any cell visible in this notebook --
# load the comments frame before this cell.
choices(df['body'], k=3)

In [None]:
comments = [
    'At the same time, as the Anglo Saxons began settling in Roman territory, they began adopting vulgar Latin words into their vocabulary.',
    'Islamophobia here tends to refer to the racism from the WASP (white Anglo-Saxon Protestant)-types.'
]

In [None]:
import re

# Surface variants -> canonical form. Keys are lowercase and matching is
# case-sensitive, so apply `pattern` to lower-cased text.
rep = {
    "anglo saxons": "anglo-saxon",
    "anglo-saxons": "anglo-saxon",
    "anglo saxon": "anglo-saxon",
}
# Keys are escaped because the substitution callback looks matches up via
# `rep[re.escape(m.group(0))]`.
rep = {re.escape(k): v for k, v in rep.items()}
# Longest alternatives first: alternation takes the first branch that matches,
# so "anglo saxon" must not pre-empt "anglo saxons" and leave a stray "s".
pattern = re.compile("|".join(sorted(rep, key=len, reverse=True)))


In [None]:
[pattern.sub(lambda m: rep[re.escape(m.group(0))], comment) for comment in comments]

In [None]:
comments_sents = [splitter.split(comment) for comment in comments]

print(len(comments_sents), '\n')

for comment in comments_sents:
    print(len(comment))
    for sent in comment:
        print(sent)


In [None]:
sent.replace()

In [None]:
comments_low = [comm.lower() for comm in comments]

In [None]:
# Target lexeme and the surface patterns intended to be merged into it.
lex = 'Anglo-Saxon'
patterns = ['Anglo-Saxon', 'Anglo Sa']

# FIXME(review): `comments_tokmerge` is not assigned in any cell visible in this
# notebook, so this bare name raises NameError on a fresh run.
comments_tokmerge

In [None]:
# FIXME(review): `txts` is not assigned in any cell visible in this notebook
# (only the string `txt` is) -- confirm the intended input list.
sents = [Sentence(txt) for txt in txts]

In [None]:
sents_toks = []
for sent in sents:
    toks = 0
    for tok in sent:
        if tok.text == 'Anglo-Saxon':
            toks += 1
    if toks >= 1:
        print(sent)

In [None]:
# `choices` is imported directly (`from random import choices`); the module
# name `random` itself is never bound.
choices(sents, k=3)

In [None]:
sent = 'Anglo-Saxon Anglo Saxon people'
sent = Sentence(sent)

for tok in sent:
    if tok.text == 'Anglo-Saxon':
        print(tok)