## Bert - POC

In [8]:
import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

import os
import logging
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [9]:
class BertBaseMultilingualEmbeddingApi:
    
    def __init__(self, model_name="bert-base-multilingual-cased", cuda=True):
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name).to(self.device)
        self.model.eval()
        
        self.tokens_tensor = None
        self.segments_tensor = None
        
        self.encoded_layers_ = None
        self.token_embeddings_ = None
        
    def _tokenize_text(self, text):
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = self.tokenizer.tokenize(marked_text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(tokenized_text)
        
        self.tokens_tensor = torch.tensor([indexed_tokens]).to(self.device)
        self.segments_tensor = torch.tensor([segments_ids]).to(self.device)
    
    def _evaluate(self):
        with torch.no_grad():
            encoded_layers, _ = self.model(self.tokens_tensor, self.segments_tensor)
        self.encoded_layers_ = encoded_layers
    
    def _generate_token_embeddings(self, batch_i=0):
        """
        Convert the hidden state embeddings into single token vectors
        Holds the list of 12 layer embeddings for each token
        Will have the shape: [# tokens, # layers, # features]
        """
        token_embeddings = [] 
        # For each token in the sentence...
        for token_i in range(len(self.encoded_layers_[-1][batch_i])):
            # Holds 12 layers of hidden states for each token 
            hidden_layers = [] 
            # For each of the 12 layers...
            for layer_i in range(len(self.encoded_layers_)):
            # Lookup the vector for `token_i` in `layer_i`
                vec = self.encoded_layers_[layer_i][batch_i][token_i]
                hidden_layers.append(vec)
            token_embeddings.append(hidden_layers)
        self.token_embeddings_ = token_embeddings
    
    def feed_forward(self, sentence):
        self._tokenize_text(sentence)
        self._evaluate()
        self._generate_token_embeddings()
    
    def create_word_embedding_(self, how="cat_last_4"):
        if how == "cat_last_4":
            return [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in self.token_embeddings_]
        elif how == "sum_last_4":
            return [torch.sum(torch.stack(layer)[-4:], 0) for layer in self.token_embeddings_]
        else:
            print("Redefine `how` parameter")
        
    def create_sentence_embedding_(self, how="mean_last_layer"):
        if how == "mean_last_layer":
            return torch.mean(self.encoded_layers_[-1], 1).squeeze()
        elif how == "mean_cat_last_4_layers":
            return torch.mean(torch.cat((self.encoded_layers_[-1], self.encoded_layers_[-2], self.encoded_layers_[-3], self.encoded_layers_[-4]), 2), 1).squeeze()
            #return torch.mean(torch.stack(self.create_word_embedding_("cat_last_4")), 0)
        elif how == "mean_sum_last_4_layers": 
            return torch.mean(torch.sum(torch.stack(self.encoded_layers_[-4:]), 0), 1).squeeze()
        else:
            print("Redefine `how` parameter", how)
            
    def print_dimensions_(self):
        print ("Number of layers:", len(self.encoded_layers_))
        layer_i = 0
        print ("Number of batches:", len(self.encoded_layers_[layer_i]))
        batch_i = 0
        print ("Number of tokens:", len(self.encoded_layers_[layer_i][batch_i]))
        token_i = 0
        print ("Number of hidden units:", len(self.encoded_layers_[layer_i][batch_i][token_i]))
        
    def plot_embedding_hist(self, vec):
        sns.set_style("whitegrid")
        plt.figure(figsize=(12, 8))
        sns.distplot(vec)
        plt.show()

## Text Preprocessing

In [255]:
import codecs
from pprint import pprint
NUMBER_OF_FILES_TO_OPEN = 5013
CORPUS = "gazeta"

In [256]:
corpusNews = list()
for i in range(0,NUMBER_OF_FILES_TO_OPEN):
    try:
        f = codecs.open("doc/"+ CORPUS +"/doc"+str(i+1), "r", encoding = 'utf-8')
        lines = ""
        for line in f:
            lines += line
        corpusNews.append(lines)
        f.close()
    except FileNotFoundError:
        with open("doc/"+ CORPUS +"/doc"+str(i+1), "w", encoding = 'utf-8') as missing:
            pass
        corpusNews.append("")
        
pprint(corpusNews[0])
print(len(corpusNews))

('Tomasza <Entity name="Tomasz Sekielski" type="person" '
 'category="dziennikarze">Sekielskiego</Entity> oburzyła wypowiedź prof. '
 'Mariana Filara. Wybitny prawnik stwierdził, że nawet jeśli znalezione w celi '
 'Mariusza T. to prowokacja - postąpiono słusznie. - Nie wiem, co się stało '
 'profesorowi. Proponuje bardzo niebezpieczną drogę. Mariusz T. nigdy nie '
 'powinien wyjść na wolność. Ale nie można w tym celu łamać prawa - mówił '
 '<Entity name="Tomasz Sekielski" type="person" '
 'category="dziennikarze">Sekielski</Entity> w TOK FM. "Człowiek, który '
 'zdecydował o podrzuceniu Mariuszowi T. do celi kompromitujących materiałów, '
 'postawił na szali jedne wartości przeciw drugim. Postawił naruszenie prawa '
 'przeciw ludzkiemu bezpieczeństwu. W mojej ocenie wybrał słusznie" - tak w '
 'wywiadzie dla "Gazety Wyborczej" prof. <Entity name="Marian Filar" '
 'type="person" category="muzycy">Marian Filar</Entity> komentował informacje '
 'o znalezieniu w celi Mariusza T. materiałó

Below are functions from Piter, that I did not changed (but use it differently):

In [12]:
def find_annotations(document : str):
    '''
    Searches for all occurances of '<' and '>' in the document.
    Returns lists of indexes of occurances opening for '<' and closing for '>'
    '''
    i = 0
    opening = list()
    closing = list()
    while i != -1:
        i = document.find('<', i)
        opening.append(i)
        if i == -1:
            closing.append(-1)
            break
        i = document.find('>', i)
        closing.append(i)
    closing = [cl + 1 for cl in closing]
    closing[-1] = -1
    return(opening, closing)

def get_annotation_values(text : str): # jest typ nie type, bo type jest zarezerwowana nazwa, nie jestem uposledzony
    '''
    Returns a dict consisting annotation values {'name', 'typ', 'category'} for first occuring annotation
    in the text.
    '''
    name_start = text.find('name=') + len('name=\"')
    name_end = text.find('\"', name_start)
    typ_start = text.find('type=', name_end) + len('type=\"')
    typ_end = text.find('\"', typ_start)
    category_start = text.find('category', typ_end) + len('category=\"')
    category_end = text.find('\"', category_start)
    return({'name' : text[name_start:name_end], 'typ' : text[typ_start:typ_end],
               'category' : text[category_start:category_end]})

def exclude_vectors_nsize(text, nsize = 3):
    '''
    Parameters:
    text - document string
    nsize - size of window around person word 
    Returns a list of lists build as follows [k_words before person, person, k_words after person], person name]
    '''
    global people_dict
    opn, cls = find_annotations(text)
    ind = 0
    l = list()
    for i in range(0, len(opn) - 1, 2):
        left_sentence = last_n(text[ind:opn[i]].split(' '), nsize)
        left_sentence = repair_sentence(left_sentence, nsize, left = True)
        right_sentence = first_n(text[cls[i+1]:text.find('<', cls[i+1])].split(' ') , nsize)
        right_sentence = repair_sentence(right_sentence, nsize, left = False)
        annotation = get_annotation_values(text[ind:-1])
        l.append([flat_list([left_sentence, right_sentence]), annotation["name"]])
        try:
            people_dict[annotation["category"]].add(annotation["name"])
        except KeyError:
            people_dict[annotation["category"]] = {annotation["name"]}
        ind = cls[i+1]
    return l

def exclude_vectors_for_person(list_of_vectors, person):
    '''
    list_of_vectors - return value from exclude_sentence_vectors()
    person - name of person from people_dict
    '''
    l = list()
    for el in list_of_vectors:
        if el[1] == person:
            l.append(el[0])
    return l

More functions from Piter:

In [13]:
def reverse(text : str):
    '''
    Returns reversed text.
    '''
    return(text[::-1])

def last_n(_list, n):
    '''
    Returns last n elements of list. 
    Returns full list if n is greater than list length or empty string if list is empty
    '''
    if not _list:
        return('')
    if _list[len(_list) - 1] == '':
        _list.pop()
    return(_list[-n:])

def first_n(_list, n):
    '''
    Returns first n elements of list. 
    Returns full list if n is greater than list length or empty string if list is empty
    '''
    if not _list:
        return('')
    if _list[0] == '':
        _list.pop(0)
    return(_list[:n])


In [14]:
words_with_dot = ['m.in.', 'inż.', 'prof.', 'tzn.', 'np.', 'cd.', 'al.', 'cnd.', 
                  'itp.', 'itd.', 'lek.', 'lic.', 'pl.', 'p.o.', 'św.', 'tj.', 
                  'tzw.', 'ul.', 'zob.', 'ul.']

punctuation = ['"', ',', '.', ':', '(', ')', '?', '!']

In [15]:
def split_the_word(word = '"Pas.chanacz:lolo)u(marek,pies!?"'):
    '''
    Splits the word with elements in punctuation list. Words in words_with_dot are excluded from splitting.
    Returns a list of splitted word. Splitting characters are included in the list.
    '''
    global words_with_dot
    global punctuation
    if word in words_with_dot:
        return([word])
    else:
        l = list()
        index_start = 0
        index_end = 0
        for i, char in enumerate(word):
            if char in punctuation:
                if index_start != index_end:
                    l.append(word[index_start:index_end])
                l.append(char)
                index_start = i + 1
                index_end = i + 1
            else:
                index_end = i + 1
        if index_start != index_end:
            l.append(word[index_start:index_end])
        return(l)
    
def flat_list(_list): # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
    '''
    Create one list from list of lists.
    '''
    flat_list = []
    for sublist in _list:
        for item in sublist:
            flat_list.append(item)
    return(flat_list)
    
def repair_sentence(_list, nsize, left):
    '''
    Naprawia zdanie xD. Chodzi o to, zeby oddzielic znaki interpunkcyjne.
    '''
    global words_with_dot
    l = list()
    for el in _list:
        if el not in words_with_dot:
            l.append(split_the_word(el))
        else:
            l.append([el])
    if left:
        return(last_n(flat_list(l), nsize))
    else:
        return(first_n(flat_list(l), nsize))

In [16]:
people_dict = {}

def exclude_vectors_nsize(text, nsize = 3):
    '''
    Parameters:
    text - document string
    nsize - size of window around person word 
    Returns a list of lists build as follows [k_words before person, person, k_words after person], person name]
    '''
    global people_dict
    opn, cls = find_annotations(text)
    ind = 0
    l = list()
    for i in range(0, len(opn) - 1, 2):
        left_sentence = last_n(text[ind:opn[i]].split(' '), nsize)
        left_sentence = repair_sentence(left_sentence, nsize, left = True)
        right_sentence = first_n(text[cls[i+1]:text.find('<', cls[i+1])].split(' ') , nsize)
        right_sentence = repair_sentence(right_sentence, nsize, left = False)
        annotation = get_annotation_values(text[ind:-1])
        l.append([flat_list([left_sentence, right_sentence]), annotation["name"]])
        try:
            people_dict[annotation["category"]].add(annotation["name"])
        except KeyError:
            people_dict[annotation["category"]] = {annotation["name"]}
        ind = cls[i+1]
    return l, people_dict

And our function for sentences:

In [17]:
def exclude_sentence_vectors(text):
    people_dict = {}
    sentences = text.split(".")
    contexts = []
    for sentence in sentences:
        opn, cls = find_annotations(sentence)
        if opn[0] != -1:
            #print(opn, cls)
            annotation = get_annotation_values(sentence)
            sentence = sentence[0:opn[0]] + sentence[cls[-2]:]
            contexts.append([sentence, annotation["name"]])
            try:
                people_dict[annotation["category"]].add(annotation["name"])
            except KeyError:
                people_dict[annotation["category"]] = {annotation["name"]}
    return contexts, people_dict


## vectors, people_dict = exclude_sentence_vectors(corpusNews[0])
print(vectors)
print(people_dict)

In [18]:
vectors, people_dict = exclude_vectors_nsize(corpusNews[0])
print(vectors) 
print(people_dict)
print(exclude_vectors_for_person(vectors, "Tomasz Sekielski"))

[[['Tomasza', 'oburzyła', 'wypowiedź', 'prof.'], 'Tomasz Sekielski'], [['prawa', '-', 'mówił', 'w', 'TOK', 'FM'], 'Tomasz Sekielski'], [['Wyborczej', '"', 'prof.', 'komentował', 'informacje', 'o'], 'Marian Filar'], [['zaskoczyły', 'i', 'oburzyły', '.', 'Dziennikarz', 'TVP'], 'Tomasz Sekielski'], [['"', '.', 'Zdaniem', 'prawnik', 'z', 'UMK'], 'Tomasz Sekielski'], [['morderca-pedofil', '-', 'powiedział'], 'Tomasz Sekielski']]
{'dziennikarze': {'Tomasz Sekielski'}, 'muzycy': {'Marian Filar'}}
[['Tomasza', 'oburzyła', 'wypowiedź', 'prof.'], ['prawa', '-', 'mówił', 'w', 'TOK', 'FM'], ['zaskoczyły', 'i', 'oburzyły', '.', 'Dziennikarz', 'TVP'], ['"', '.', 'Zdaniem', 'prawnik', 'z', 'UMK'], ['morderca-pedofil', '-', 'powiedział']]


In [19]:
vectors = exclude_vectors_nsize(corpusNews[0], 4)
pprint(vectors)

([[['Tomasza', 'oburzyła', 'wypowiedź', 'prof.', 'Mariana'],
   'Tomasz Sekielski'],
  [['łamać', 'prawa', '-', 'mówił', 'w', 'TOK', 'FM', '.'], 'Tomasz Sekielski'],
  [['Gazety',
    'Wyborczej',
    '"',
    'prof.',
    'komentował',
    'informacje',
    'o',
    'znalezieniu'],
   'Marian Filar'],
  [['prawnika',
    'zaskoczyły',
    'i',
    'oburzyły',
    '.',
    'Dziennikarz',
    'TVP',
    'przyznał'],
   'Tomasz Sekielski'],
  [['FM', '"', '.', 'Zdaniem', 'prawnik', 'z', 'UMK', 'w'], 'Tomasz Sekielski'],
  [['tylko', 'morderca-pedofil', '-', 'powiedział'], 'Tomasz Sekielski']],
 {'dziennikarze': {'Tomasz Sekielski'}, 'muzycy': {'Marian Filar'}})


In [20]:
people_dict

{'dziennikarze': {'Tomasz Sekielski'}, 'muzycy': {'Marian Filar'}}

## Generating tsv files

In [98]:
NAME = "Name"
EMBEDDING = "embedding"
PROFESSION = "Profession"
TYPE = "mean_cat_last_4_layers"


In [249]:
def generate_embeddings_Piter(news_file):
    embeddings = []
    metadata = []
    vectors, people_dict = exclude_vectors_nsize(news_file, 4)
    for profession in people_dict:
        names = people_dict[profession]
        for name in names:
            person_contexts = exclude_vectors_for_person(vectors, name)
            for context in person_contexts:
                concatenated = " ".join(c for c in context)
                bert.feed_forward(concatenated)
                sent_embedding = bert.create_sentence_embedding_(how = TYPE)
                vec = pd.DataFrame(sent_embedding.cpu().numpy()).T
                embeddings.append(vec)
                metadata.append({NAME: name, PROFESSION: profession})
    return embeddings, metadata
        

In [248]:
def generate_embeddings(news_file):
    embeddings = []
    metadata = []
    vectors, people_dict = exclude_sentence_vectors(news_file)
    for profession in people_dict:
        names = people_dict[profession]
        for name in names:
            person_contexts = exclude_vectors_for_person(vectors, name)
            for context in person_contexts:
                bert.feed_forward(context)
                sent_embedding = bert.create_sentence_embedding_(how = TYPE)
                vec = pd.DataFrame(sent_embedding.cpu().numpy()).T
                embeddings.append(vec)
                metadata.append({NAME: name, PROFESSION: profession})
    return  embeddings, metadata
        

In [173]:
# def generate_tsv_files(filename, embeddings):
#     with open(filename + "_vectors.tsv",'w') as f:
#         for vector in embeddings:
#             for value in vector[EMBEDDING]:
#                 f.write("%e\t" % (value))
#             f.write("\n")
#     f.close()

#     with open(filename + "_metadata.tsv",'w') as f:
#         f.write("Name\tProfession\n")
#         for vector in embeddings:
#             f.write("%s\t%s\n" % (vector[NAME], vector[PROFESSION]))
#     f.close()

#generate_tsv_files("tsv_files/"+ TYPE +"/" + CORPUS +"/" + TYPE + "_doc"+str(i+1), embeddings)

In [42]:
import os

import pandas as pd
import numpy as np

from tqdm import tqdm

In [244]:
def create_vextors_meta_df(number_of_layers=4):
    vectors_meta_df = pd.DataFrame(columns=range(number_of_layers*768+3)).rename(columns=
        {
        number_of_layers*768: "Name",
        number_of_layers*768+1: "Profession",
        number_of_layers*768+2: "Document"
        })
    return vectors_meta_df

def generate_concat_vectors_meta_df(number_of_files):
    vectors_meta_df = create_vextors_meta_df()
    for i in tqdm(range(0, number_of_files)):
        #pprint(corpusNews[i])
        embeddings, metadata = generate_embeddings_Piter(corpusNews[i])
        vec = pd.DataFrame(embeddings)
        joined = vec.join(pd.DataFrame(metadata))
        joined["Document"] = "doc"+ str(i+1)
        vectors_meta_df = vectors_meta_df.append(joined)
    return vectors_meta_df


In [257]:
df = generate_concat_vectors_meta_df(NUMBER_OF_FILES_TO_OPEN)
df.to_csv("tsv_files/" + TYPE +"/" + CORPUS +"/results_words.tsv", sep="\t")

100%|██████████| 5013/5013 [07:24<00:00, 11.28it/s]


In [258]:
df= pd.read_csv("tsv_files/" + TYPE +"/" + CORPUS +"/results_words.tsv", sep="\t")
df.groupby(["Document", "Profession", "Name"]).mean().reset_index()

Unnamed: 0.1,Document,Profession,Name,Unnamed: 0,0,1,2,3,4,5,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
0,doc1,dziennikarze,Tomasz Sekielski,2.0,0.058421,0.058421,0.058421,0.058421,0.058421,0.058421,...,0.058421,0.058421,0.058421,0.058421,0.058421,0.058421,0.058421,0.058421,0.058421,0.058421
1,doc1,muzycy,Marian Filar,5.0,-0.185113,-0.185113,-0.185113,-0.185113,-0.185113,-0.185113,...,-0.185113,-0.185113,-0.185113,-0.185113,-0.185113,-0.185113,-0.185113,-0.185113,-0.185113,-0.185113
2,doc1002,duchowni,Wojciech Polak,0.0,0.148669,0.148669,0.148669,0.148669,0.148669,0.148669,...,0.148669,0.148669,0.148669,0.148669,0.148669,0.148669,0.148669,0.148669,0.148669,0.148669
3,doc1006,politycy,Janusz Korwin-Mikke,1.0,-0.050047,-0.050047,-0.050047,-0.050047,-0.050047,-0.050047,...,-0.050047,-0.050047,-0.050047,-0.050047,-0.050047,-0.050047,-0.050047,-0.050047,-0.050047,-0.050047
4,doc1006,politycy,Jarosław Gowin,0.0,-0.004990,-0.004990,-0.004990,-0.004990,-0.004990,-0.004990,...,-0.004990,-0.004990,-0.004990,-0.004990,-0.004990,-0.004990,-0.004990,-0.004990,-0.004990,-0.004990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4149,doc995,politycy,Bogdan Zdrojewski,1.5,-0.003693,-0.003693,-0.003693,-0.003693,-0.003693,-0.003693,...,-0.003693,-0.003693,-0.003693,-0.003693,-0.003693,-0.003693,-0.003693,-0.003693,-0.003693,-0.003693
4150,doc997,politycy,Antoni Macierewicz,7.0,-0.074773,-0.074773,-0.074773,-0.074773,-0.074773,-0.074773,...,-0.074773,-0.074773,-0.074773,-0.074773,-0.074773,-0.074773,-0.074773,-0.074773,-0.074773,-0.074773
4151,doc997,politycy,Donald Tusk,4.5,-0.038954,-0.038954,-0.038954,-0.038954,-0.038954,-0.038954,...,-0.038954,-0.038954,-0.038954,-0.038954,-0.038954,-0.038954,-0.038954,-0.038954,-0.038954,-0.038954
4152,doc997,politycy,Janusz Palikot,2.5,0.038415,0.038415,0.038415,0.038415,0.038415,0.038415,...,0.038415,0.038415,0.038415,0.038415,0.038415,0.038415,0.038415,0.038415,0.038415,0.038415


## Example usage

1. Create API object

In [None]:
bert = BertBaseMultilingualEmbeddingApi()

2. Create sentence and pass to feed_forward method

In [None]:
sentence = "Oszczędnością i pracą ludzie się bogacą."
bert.feed_forward(sentence)

3. Generate word/sentence embedding, specify `how`

In [None]:
words_embeddings = bert.create_word_embedding_(how = "sum_last_4")

In [None]:
sent_embedding = bert.create_sentence_embedding_()

4. Visualize tensor

In [None]:
print(sent_embedding)

In [None]:
bert.plot_embedding_hist(sent_embedding)

## Visualize embeddings in tensorboard

In [None]:
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [None]:
LOG_DIR = 'minimalsample'
NAME_TO_VISUALISE_VARIABLE = "example_embeddings"
path_for_mnist_metadata =  'metadata.tsv'

In [None]:
token_embeddings_np = torch.stack(concatenated_lats_4_layers).numpy()

In [None]:
embedding_var = tf.Variable(token_embeddings_np, name=NAME_TO_VISUALISE_VARIABLE)
summary_writer = tf.summary.FileWriter(LOG_DIR)

In [None]:
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = embedding_var.name

# Specify where you find the metadata
embedding.metadata_path = path_for_mnist_metadata #'metadata.tsv'

# Say that you want to visualise the embeddings
projector.visualize_embeddings(summary_writer, config)

In [None]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.save(sess, os.path.join(LOG_DIR, "model.ckpt"), 1)

In [None]:
with open(path_for_mnist_metadata,'w') as f:
    f.write("Index\tLabel\n")
    for index,label in enumerate(tokenized_text):
        f.write("%d\t %s\n" % (index,label.encode('utf-8')))