## Bert - POC

In [8]:
import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

import os
import logging
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [9]:
class BertBaseMultilingualEmbeddingApi:
    
    def __init__(self, model_name="bert-base-multilingual-cased", cuda=True):
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name).to(self.device)
        self.model.eval()
        
        self.tokens_tensor = None
        self.segments_tensor = None
        
        self.encoded_layers_ = None
        self.token_embeddings_ = None
        
    def _tokenize_text(self, text):
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = self.tokenizer.tokenize(marked_text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(tokenized_text)
        
        self.tokens_tensor = torch.tensor([indexed_tokens]).to(self.device)
        self.segments_tensor = torch.tensor([segments_ids]).to(self.device)
    
    def _evaluate(self):
        with torch.no_grad():
            encoded_layers, _ = self.model(self.tokens_tensor, self.segments_tensor)
        self.encoded_layers_ = encoded_layers
    
    def _generate_token_embeddings(self, batch_i=0):
        """
        Convert the hidden state embeddings into single token vectors
        Holds the list of 12 layer embeddings for each token
        Will have the shape: [# tokens, # layers, # features]
        """
        token_embeddings = [] 
        # For each token in the sentence...
        for token_i in range(len(self.encoded_layers_[-1][batch_i])):
            # Holds 12 layers of hidden states for each token 
            hidden_layers = [] 
            # For each of the 12 layers...
            for layer_i in range(len(self.encoded_layers_)):
            # Lookup the vector for `token_i` in `layer_i`
                vec = self.encoded_layers_[layer_i][batch_i][token_i]
                hidden_layers.append(vec)
            token_embeddings.append(hidden_layers)
        self.token_embeddings_ = token_embeddings
    
    def feed_forward(self, sentence):
        self._tokenize_text(sentence)
        self._evaluate()
        self._generate_token_embeddings()
    
    def create_word_embedding_(self, how="cat_last_4"):
        if how == "cat_last_4":
            return [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in self.token_embeddings_]
        elif how == "sum_last_4":
            return [torch.sum(torch.stack(layer)[-4:], 0) for layer in self.token_embeddings_]
        else:
            print("Redefine `how` parameter")
        
    def create_sentence_embedding_(self, how="mean_last_layer"):
        if how == "mean_last_layer":
            return torch.mean(self.encoded_layers_[-1], 1).squeeze()
        elif how == "mean_cat_last_4_layers":
            return torch.mean(torch.cat((self.encoded_layers_[-1], self.encoded_layers_[-2], self.encoded_layers_[-3], self.encoded_layers_[-4]), 2), 1).squeeze()
            #return torch.mean(torch.stack(self.create_word_embedding_("cat_last_4")), 0)
        elif how == "mean_sum_last_4_layers": 
            return torch.mean(torch.sum(torch.stack(self.encoded_layers_[-4:]), 0), 1).squeeze()
        else:
            print("Redefine `how` parameter", how)
            
    def print_dimensions_(self):
        print ("Number of layers:", len(self.encoded_layers_))
        layer_i = 0
        print ("Number of batches:", len(self.encoded_layers_[layer_i]))
        batch_i = 0
        print ("Number of tokens:", len(self.encoded_layers_[layer_i][batch_i]))
        token_i = 0
        print ("Number of hidden units:", len(self.encoded_layers_[layer_i][batch_i][token_i]))
        
    def plot_embedding_hist(self, vec):
        sns.set_style("whitegrid")
        plt.figure(figsize=(12, 8))
        sns.distplot(vec)
        plt.show()

## Text Preprocessing

In [42]:
import os

import pandas as pd
import numpy as np

from tqdm import tqdm

In [259]:
import codecs
from pprint import pprint
NUMBER_OF_FILES_TO_OPEN = 4998
CORPUS = "onet"

In [260]:
corpusNews = list()
for i in range(0,NUMBER_OF_FILES_TO_OPEN):
    try:
        f = codecs.open("doc/"+ CORPUS +"/doc"+str(i+1), "r", encoding = 'utf-8')
        lines = ""
        for line in f:
            lines += line
        corpusNews.append(lines)
        f.close()
    except FileNotFoundError:
        with open("doc/"+ CORPUS +"/doc"+str(i+1), "w", encoding = 'utf-8') as missing:
            pass
        corpusNews.append("")
        
pprint(corpusNews[0])
print(len(corpusNews))

('Według Loose tematem spotkania <Entity name="Jarosław Gowin" type="person" '
 'category="politycy">Gowina</Entity> z kierownictwem klubu PiS będą '
 'propozycje ustawowe ministerstwa sprawiedliwości dotyczące m.in. deregulacji '
 'zawodów oraz planowanych zmian w Kodeksie postępowania karnego. Za trzy '
 'tygodnie, 10 i 11 grudnia, ma odbyć się wysłuchanie publiczne w sprawie '
 'projektu częściowo lub całkowicie deregulującego dostęp do 50 zawodów - '
 'zdecydowała we wtorek na swym pierwszym merytorycznym posiedzeniu sejmowa '
 'komisja do spraw deregulacji. Wniosek w sprawie wysłuchania - zgłoszony '
 'przez przewodniczącego komisji <Entity name="Adam Szejnfeld" type="person" '
 'category="politycy">Adama Szejnfelda</Entity> (PO) - poparli wszyscy '
 'członkowie komisji. Podczas wysłuchania głos będą mogli zabrać '
 'przedstawiciele wszystkich zainteresowanych samorządów zawodowych i '
 'organizacji społecznych. Ich reprezentanci będą też uczestniczyć w '
 'posiedzeniach komisji d

Below are functions from Piter, that I did not changed (but use it differently):

In [12]:
def find_annotations(document : str):
    '''
    Searches for all occurances of '<' and '>' in the document.
    Returns lists of indexes of occurances opening for '<' and closing for '>'
    '''
    i = 0
    opening = list()
    closing = list()
    while i != -1:
        i = document.find('<', i)
        opening.append(i)
        if i == -1:
            closing.append(-1)
            break
        i = document.find('>', i)
        closing.append(i)
    closing = [cl + 1 for cl in closing]
    closing[-1] = -1
    return(opening, closing)

def get_annotation_values(text : str): # jest typ nie type, bo type jest zarezerwowana nazwa, nie jestem uposledzony
    '''
    Returns a dict consisting annotation values {'name', 'typ', 'category'} for first occuring annotation
    in the text.
    '''
    name_start = text.find('name=') + len('name=\"')
    name_end = text.find('\"', name_start)
    typ_start = text.find('type=', name_end) + len('type=\"')
    typ_end = text.find('\"', typ_start)
    category_start = text.find('category', typ_end) + len('category=\"')
    category_end = text.find('\"', category_start)
    return({'name' : text[name_start:name_end], 'typ' : text[typ_start:typ_end],
               'category' : text[category_start:category_end]})

def exclude_vectors_nsize(text, nsize = 3):
    '''
    Parameters:
    text - document string
    nsize - size of window around person word 
    Returns a list of lists build as follows [k_words before person, person, k_words after person], person name]
    '''
    global people_dict
    opn, cls = find_annotations(text)
    ind = 0
    l = list()
    for i in range(0, len(opn) - 1, 2):
        left_sentence = last_n(text[ind:opn[i]].split(' '), nsize)
        left_sentence = repair_sentence(left_sentence, nsize, left = True)
        right_sentence = first_n(text[cls[i+1]:text.find('<', cls[i+1])].split(' ') , nsize)
        right_sentence = repair_sentence(right_sentence, nsize, left = False)
        annotation = get_annotation_values(text[ind:-1])
        l.append([flat_list([left_sentence, right_sentence]), annotation["name"]])
        try:
            people_dict[annotation["category"]].add(annotation["name"])
        except KeyError:
            people_dict[annotation["category"]] = {annotation["name"]}
        ind = cls[i+1]
    return l

def exclude_vectors_for_person(list_of_vectors, person):
    '''
    list_of_vectors - return value from exclude_sentence_vectors()
    person - name of person from people_dict
    '''
    l = list()
    for el in list_of_vectors:
        if el[1] == person:
            l.append(el[0])
    return l

More functions from Piter:

In [13]:
def reverse(text : str):
    '''
    Returns reversed text.
    '''
    return(text[::-1])

def last_n(_list, n):
    '''
    Returns last n elements of list. 
    Returns full list if n is greater than list length or empty string if list is empty
    '''
    if not _list:
        return('')
    if _list[len(_list) - 1] == '':
        _list.pop()
    return(_list[-n:])

def first_n(_list, n):
    '''
    Returns first n elements of list. 
    Returns full list if n is greater than list length or empty string if list is empty
    '''
    if not _list:
        return('')
    if _list[0] == '':
        _list.pop(0)
    return(_list[:n])


In [14]:
words_with_dot = ['m.in.', 'inż.', 'prof.', 'tzn.', 'np.', 'cd.', 'al.', 'cnd.', 
                  'itp.', 'itd.', 'lek.', 'lic.', 'pl.', 'p.o.', 'św.', 'tj.', 
                  'tzw.', 'ul.', 'zob.', 'ul.']

punctuation = ['"', ',', '.', ':', '(', ')', '?', '!']

In [15]:
def split_the_word(word = '"Pas.chanacz:lolo)u(marek,pies!?"'):
    '''
    Splits the word with elements in punctuation list. Words in words_with_dot are excluded from splitting.
    Returns a list of splitted word. Splitting characters are included in the list.
    '''
    global words_with_dot
    global punctuation
    if word in words_with_dot:
        return([word])
    else:
        l = list()
        index_start = 0
        index_end = 0
        for i, char in enumerate(word):
            if char in punctuation:
                if index_start != index_end:
                    l.append(word[index_start:index_end])
                l.append(char)
                index_start = i + 1
                index_end = i + 1
            else:
                index_end = i + 1
        if index_start != index_end:
            l.append(word[index_start:index_end])
        return(l)
    
def flat_list(_list): # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
    '''
    Create one list from list of lists.
    '''
    flat_list = []
    for sublist in _list:
        for item in sublist:
            flat_list.append(item)
    return(flat_list)
    
def repair_sentence(_list, nsize, left):
    '''
    Naprawia zdanie xD. Chodzi o to, zeby oddzielic znaki interpunkcyjne.
    '''
    global words_with_dot
    l = list()
    for el in _list:
        if el not in words_with_dot:
            l.append(split_the_word(el))
        else:
            l.append([el])
    if left:
        return(last_n(flat_list(l), nsize))
    else:
        return(first_n(flat_list(l), nsize))

In [16]:
people_dict = {}

def exclude_vectors_nsize(text, nsize = 3):
    '''
    Parameters:
    text - document string
    nsize - size of window around person word 
    Returns a list of lists build as follows [k_words before person, person, k_words after person], person name]
    '''
    global people_dict
    opn, cls = find_annotations(text)
    ind = 0
    l = list()
    for i in range(0, len(opn) - 1, 2):
        left_sentence = last_n(text[ind:opn[i]].split(' '), nsize)
        left_sentence = repair_sentence(left_sentence, nsize, left = True)
        right_sentence = first_n(text[cls[i+1]:text.find('<', cls[i+1])].split(' ') , nsize)
        right_sentence = repair_sentence(right_sentence, nsize, left = False)
        annotation = get_annotation_values(text[ind:-1])
        l.append([flat_list([left_sentence, right_sentence]), annotation["name"]])
        try:
            people_dict[annotation["category"]].add(annotation["name"])
        except KeyError:
            people_dict[annotation["category"]] = {annotation["name"]}
        ind = cls[i+1]
    return l, people_dict

And our function for sentences:

In [17]:
def exclude_sentence_vectors(text):
    people_dict = {}
    sentences = text.split(".")
    contexts = []
    for sentence in sentences:
        opn, cls = find_annotations(sentence)
        if opn[0] != -1:
            #print(opn, cls)
            annotation = get_annotation_values(sentence)
            sentence = sentence[0:opn[0]] + sentence[cls[-2]:]
            contexts.append([sentence, annotation["name"]])
            try:
                people_dict[annotation["category"]].add(annotation["name"])
            except KeyError:
                people_dict[annotation["category"]] = {annotation["name"]}
    return contexts, people_dict


## vectors, people_dict = exclude_sentence_vectors(corpusNews[0])
print(vectors)
print(people_dict)

In [18]:
vectors, people_dict = exclude_vectors_nsize(corpusNews[0])
print(vectors) 
print(people_dict)
print(exclude_vectors_for_person(vectors, "Tomasz Sekielski"))

[[['Tomasza', 'oburzyła', 'wypowiedź', 'prof.'], 'Tomasz Sekielski'], [['prawa', '-', 'mówił', 'w', 'TOK', 'FM'], 'Tomasz Sekielski'], [['Wyborczej', '"', 'prof.', 'komentował', 'informacje', 'o'], 'Marian Filar'], [['zaskoczyły', 'i', 'oburzyły', '.', 'Dziennikarz', 'TVP'], 'Tomasz Sekielski'], [['"', '.', 'Zdaniem', 'prawnik', 'z', 'UMK'], 'Tomasz Sekielski'], [['morderca-pedofil', '-', 'powiedział'], 'Tomasz Sekielski']]
{'dziennikarze': {'Tomasz Sekielski'}, 'muzycy': {'Marian Filar'}}
[['Tomasza', 'oburzyła', 'wypowiedź', 'prof.'], ['prawa', '-', 'mówił', 'w', 'TOK', 'FM'], ['zaskoczyły', 'i', 'oburzyły', '.', 'Dziennikarz', 'TVP'], ['"', '.', 'Zdaniem', 'prawnik', 'z', 'UMK'], ['morderca-pedofil', '-', 'powiedział']]


In [19]:
vectors = exclude_vectors_nsize(corpusNews[0], 4)
pprint(vectors)

([[['Tomasza', 'oburzyła', 'wypowiedź', 'prof.', 'Mariana'],
   'Tomasz Sekielski'],
  [['łamać', 'prawa', '-', 'mówił', 'w', 'TOK', 'FM', '.'], 'Tomasz Sekielski'],
  [['Gazety',
    'Wyborczej',
    '"',
    'prof.',
    'komentował',
    'informacje',
    'o',
    'znalezieniu'],
   'Marian Filar'],
  [['prawnika',
    'zaskoczyły',
    'i',
    'oburzyły',
    '.',
    'Dziennikarz',
    'TVP',
    'przyznał'],
   'Tomasz Sekielski'],
  [['FM', '"', '.', 'Zdaniem', 'prawnik', 'z', 'UMK', 'w'], 'Tomasz Sekielski'],
  [['tylko', 'morderca-pedofil', '-', 'powiedział'], 'Tomasz Sekielski']],
 {'dziennikarze': {'Tomasz Sekielski'}, 'muzycy': {'Marian Filar'}})


In [20]:
people_dict

{'dziennikarze': {'Tomasz Sekielski'}, 'muzycy': {'Marian Filar'}}

## Generating tsv files

In [98]:
NAME = "Name"
EMBEDDING = "embedding"
PROFESSION = "Profession"
TYPE = "mean_cat_last_4_layers"


In [371]:
def generate_embeddings_words(news_file):
    embeddings = []
    metadata = []
    vectors, people_dict = exclude_vectors_nsize(news_file, 4)
    for profession in people_dict:
        names = people_dict[profession]
        for name in names:
            person_contexts = exclude_vectors_for_person(vectors, name)
            for context in person_contexts:
                concatenated = " ".join(c for c in context)
                bert.feed_forward(concatenated)
                sent_embedding = bert.create_sentence_embedding_(how = TYPE)
                embeddings.append(sent_embedding)
                metadata.append({NAME: name, PROFESSION: profession})
    return embeddings, metadata
        

In [370]:
def generate_embeddings_sentence(news_file):
    embeddings = []
    metadata = []
    vectors, people_dict = exclude_sentence_vectors(news_file)
    for profession in people_dict:
        names = people_dict[profession]
        for name in names:
            person_contexts = exclude_vectors_for_person(vectors, name)
            for context in person_contexts:
                bert.feed_forward(context)
                sent_embedding = bert.create_sentence_embedding_(how = TYPE)
                embeddings.append(sent_embedding)
                metadata.append({NAME: name, PROFESSION: profession})
    return  embeddings, metadata
        

In [396]:
def create_vextors_meta_df(number_of_layers=4):
    vectors_meta_df = pd.DataFrame(columns=range(number_of_layers*768+3)).rename(columns=
        {
        number_of_layers*768: "Name",
        number_of_layers*768+1: "Profession",
        number_of_layers*768+2: "Document"
        })
    return vectors_meta_df

def generate_concat_vectors_meta_df(number_of_files):
    vectors_meta_df = create_vextors_meta_df()
    for i in tqdm(range(0, number_of_files)):
        #pprint(corpusNews[i])
        embeddings, metadata = generate_embeddings_words(corpusNews[i])
        frames = []
        for e in embeddings:
            values = [t.item() for t in e]
            frames.append(pd.DataFrame(values).T)
        try:
            fr = pd.concat(frames)
            joined = fr.join(pd.DataFrame(metadata))
            joined["Document"] = "doc"+ str(i+1)
            vectors_meta_df = vectors_meta_df.append(joined)
        except ValueError:
            continue
            
    return vectors_meta_df


In [399]:
df = generate_concat_vectors_meta_df(NUMBER_OF_FILES_TO_OPEN)
df.to_csv("tsv_files/" + TYPE +"/" + CORPUS +"/results_words1.tsv", sep="\t")


  0%|          | 0/4998 [00:00<?, ?it/s][A
  0%|          | 1/4998 [00:00<12:23,  6.73it/s][A
  0%|          | 2/4998 [00:00<21:09,  3.93it/s][A
  0%|          | 7/4998 [00:00<15:32,  5.35it/s][A
  0%|          | 10/4998 [00:00<12:04,  6.89it/s][A
  0%|          | 12/4998 [00:01<11:26,  7.26it/s][A
  0%|          | 14/4998 [00:01<09:48,  8.47it/s][A
  0%|          | 16/4998 [00:01<11:20,  7.32it/s][A
  0%|          | 24/4998 [00:01<08:23,  9.89it/s][A
  1%|          | 27/4998 [00:02<10:43,  7.73it/s][A
  1%|          | 37/4998 [00:02<07:51, 10.52it/s][A
  1%|          | 41/4998 [00:03<09:49,  8.41it/s][A
  1%|          | 44/4998 [00:03<09:09,  9.01it/s][A
  1%|          | 47/4998 [00:04<12:16,  6.72it/s][A
  1%|          | 49/4998 [00:04<11:34,  7.13it/s][A
  1%|          | 51/4998 [00:05<16:19,  5.05it/s][A
  1%|          | 54/4998 [00:05<15:54,  5.18it/s][A
  1%|          | 59/4998 [00:06<13:09,  6.25it/s][A
  1%|          | 60/4998 [00:06<20:16,  4.06it/s][A
  1%

  8%|▊         | 420/4998 [00:50<11:24,  6.69it/s][A
  8%|▊         | 422/4998 [00:50<09:36,  7.94it/s][A
  8%|▊         | 424/4998 [00:51<16:47,  4.54it/s][A
  9%|▊         | 426/4998 [00:51<14:23,  5.30it/s][A
  9%|▊         | 427/4998 [00:52<17:27,  4.36it/s][A
  9%|▊         | 433/4998 [00:52<12:36,  6.03it/s][A
  9%|▊         | 436/4998 [00:52<10:37,  7.16it/s][A
  9%|▉         | 445/4998 [00:52<07:40,  9.88it/s][A
  9%|▉         | 449/4998 [00:53<08:28,  8.94it/s][A
  9%|▉         | 453/4998 [00:53<10:22,  7.31it/s][A
  9%|▉         | 456/4998 [00:54<08:04,  9.37it/s][A
  9%|▉         | 459/4998 [00:54<06:47, 11.15it/s][A
  9%|▉         | 462/4998 [00:54<08:35,  8.80it/s][A
  9%|▉         | 464/4998 [00:55<10:12,  7.41it/s][A
  9%|▉         | 468/4998 [00:55<07:59,  9.45it/s][A
  9%|▉         | 473/4998 [00:55<06:04, 12.43it/s][A
 10%|▉         | 476/4998 [00:55<09:27,  7.96it/s][A
 10%|▉         | 480/4998 [00:56<07:28, 10.08it/s][A
 10%|▉         | 483/4998 [0

 17%|█▋        | 865/4998 [01:34<08:50,  7.80it/s][A
 17%|█▋        | 867/4998 [01:35<09:41,  7.11it/s][A
 17%|█▋        | 869/4998 [01:35<12:52,  5.34it/s][A
 17%|█▋        | 871/4998 [01:35<10:31,  6.54it/s][A
 17%|█▋        | 873/4998 [01:36<09:23,  7.32it/s][A
 18%|█▊        | 875/4998 [01:36<09:01,  7.61it/s][A
 18%|█▊        | 878/4998 [01:36<08:30,  8.07it/s][A
 18%|█▊        | 879/4998 [01:36<12:40,  5.41it/s][A
 18%|█▊        | 882/4998 [01:37<10:31,  6.51it/s][A
 18%|█▊        | 884/4998 [01:37<09:55,  6.91it/s][A
 18%|█▊        | 887/4998 [01:37<07:41,  8.91it/s][A
 18%|█▊        | 889/4998 [01:37<08:51,  7.73it/s][A
 18%|█▊        | 891/4998 [01:38<09:06,  7.52it/s][A
 18%|█▊        | 893/4998 [01:38<11:13,  6.10it/s][A
 18%|█▊        | 894/4998 [01:38<09:55,  6.89it/s][A
 18%|█▊        | 895/4998 [01:39<12:44,  5.36it/s][A
 18%|█▊        | 898/4998 [01:39<09:42,  7.04it/s][A
 18%|█▊        | 902/4998 [01:39<08:03,  8.47it/s][A
 18%|█▊        | 904/4998 [0

 24%|██▍       | 1188/4998 [02:27<08:31,  7.44it/s][A
 24%|██▍       | 1190/4998 [02:27<07:57,  7.98it/s][A
 24%|██▍       | 1192/4998 [02:27<06:56,  9.14it/s][A
 24%|██▍       | 1194/4998 [02:28<06:37,  9.56it/s][A
 24%|██▍       | 1196/4998 [02:28<06:06, 10.36it/s][A
 24%|██▍       | 1199/4998 [02:28<06:03, 10.44it/s][A
 24%|██▍       | 1203/4998 [02:28<05:48, 10.90it/s][A
 24%|██▍       | 1205/4998 [02:29<06:21,  9.94it/s][A
 24%|██▍       | 1207/4998 [02:29<07:59,  7.90it/s][A
 24%|██▍       | 1208/4998 [02:29<10:04,  6.27it/s][A
 24%|██▍       | 1209/4998 [02:29<09:04,  6.96it/s][A
 24%|██▍       | 1210/4998 [02:30<14:41,  4.30it/s][A
 24%|██▍       | 1213/4998 [02:30<11:33,  5.46it/s][A
 24%|██▍       | 1215/4998 [02:30<11:06,  5.68it/s][A
 24%|██▍       | 1216/4998 [02:30<10:28,  6.01it/s][A
 24%|██▍       | 1220/4998 [02:31<08:15,  7.62it/s][A
 24%|██▍       | 1224/4998 [02:31<06:54,  9.10it/s][A
 25%|██▍       | 1226/4998 [02:31<06:35,  9.53it/s][A
 25%|██▍  

 31%|███▏      | 1568/4998 [03:07<02:37, 21.74it/s][A
 32%|███▏      | 1577/4998 [03:07<02:08, 26.55it/s][A
 32%|███▏      | 1581/4998 [03:08<03:11, 17.88it/s][A
 32%|███▏      | 1584/4998 [03:08<05:59,  9.51it/s][A
 32%|███▏      | 1587/4998 [03:09<06:42,  8.46it/s][A
 32%|███▏      | 1590/4998 [03:09<06:57,  8.16it/s][A
 32%|███▏      | 1592/4998 [03:09<06:07,  9.27it/s][A
 32%|███▏      | 1598/4998 [03:10<05:34, 10.15it/s][A
 32%|███▏      | 1600/4998 [03:10<05:55,  9.56it/s][A
 32%|███▏      | 1602/4998 [03:11<08:23,  6.75it/s][A
 32%|███▏      | 1608/4998 [03:11<06:33,  8.61it/s][A
 32%|███▏      | 1611/4998 [03:12<09:55,  5.68it/s][A
 32%|███▏      | 1613/4998 [03:12<09:37,  5.86it/s][A
 32%|███▏      | 1618/4998 [03:13<07:47,  7.22it/s][A
 32%|███▏      | 1620/4998 [03:13<06:19,  8.90it/s][A
 32%|███▏      | 1624/4998 [03:13<05:26, 10.32it/s][A
 33%|███▎      | 1627/4998 [03:14<09:32,  5.88it/s][A
 33%|███▎      | 1631/4998 [03:14<07:10,  7.82it/s][A
 33%|███▎ 

 39%|███▉      | 1939/4998 [03:57<06:32,  7.79it/s][A
 39%|███▉      | 1943/4998 [03:57<05:13,  9.73it/s][A
 39%|███▉      | 1945/4998 [03:58<09:33,  5.32it/s][A
 39%|███▉      | 1947/4998 [03:58<08:07,  6.26it/s][A
 39%|███▉      | 1949/4998 [03:58<07:25,  6.84it/s][A
 39%|███▉      | 1951/4998 [03:59<06:08,  8.27it/s][A
 39%|███▉      | 1955/4998 [03:59<06:17,  8.06it/s][A
 39%|███▉      | 1958/4998 [03:59<05:52,  8.62it/s][A
 39%|███▉      | 1963/4998 [04:00<04:44, 10.66it/s][A
 39%|███▉      | 1965/4998 [04:00<04:21, 11.61it/s][A
 39%|███▉      | 1967/4998 [04:00<03:51, 13.11it/s][A
 39%|███▉      | 1969/4998 [04:01<08:41,  5.80it/s][A
 39%|███▉      | 1971/4998 [04:01<09:33,  5.28it/s][A
 40%|███▉      | 1976/4998 [04:01<07:47,  6.47it/s][A
 40%|███▉      | 1978/4998 [04:02<08:38,  5.83it/s][A
 40%|███▉      | 1980/4998 [04:02<06:51,  7.34it/s][A
 40%|███▉      | 1983/4998 [04:02<05:26,  9.24it/s][A
 40%|███▉      | 1987/4998 [04:02<04:41, 10.68it/s][A
 40%|███▉ 

 47%|████▋     | 2351/4998 [04:41<06:06,  7.22it/s][A
 47%|████▋     | 2360/4998 [04:42<04:39,  9.42it/s][A
 47%|████▋     | 2363/4998 [04:42<03:55, 11.21it/s][A
 47%|████▋     | 2366/4998 [04:42<04:02, 10.88it/s][A
 47%|████▋     | 2370/4998 [04:43<04:53,  8.95it/s][A
 47%|████▋     | 2372/4998 [04:43<04:06, 10.63it/s][A
 47%|████▋     | 2374/4998 [04:43<03:40, 11.91it/s][A
 48%|████▊     | 2376/4998 [04:43<05:10,  8.45it/s][A
 48%|████▊     | 2380/4998 [04:44<04:02, 10.80it/s][A
 48%|████▊     | 2382/4998 [04:44<04:29,  9.70it/s][A
 48%|████▊     | 2385/4998 [04:44<04:43,  9.22it/s][A
 48%|████▊     | 2387/4998 [04:44<05:22,  8.09it/s][A
 48%|████▊     | 2389/4998 [04:45<05:33,  7.82it/s][A
 48%|████▊     | 2390/4998 [04:45<08:34,  5.07it/s][A
 48%|████▊     | 2404/4998 [04:45<06:07,  7.06it/s][A
 48%|████▊     | 2408/4998 [04:46<05:14,  8.23it/s][A
 48%|████▊     | 2411/4998 [04:46<04:36,  9.37it/s][A
 48%|████▊     | 2419/4998 [04:46<03:29, 12.34it/s][A
 49%|████▊

 56%|█████▋    | 2819/4998 [05:25<06:05,  5.96it/s][A
 56%|█████▋    | 2821/4998 [05:25<05:05,  7.14it/s][A
 57%|█████▋    | 2824/4998 [05:26<05:59,  6.04it/s][A
 57%|█████▋    | 2825/4998 [05:26<11:59,  3.02it/s][A
 57%|█████▋    | 2828/4998 [05:27<09:29,  3.81it/s][A
 57%|█████▋    | 2834/4998 [05:27<06:56,  5.20it/s][A
 57%|█████▋    | 2836/4998 [05:27<08:47,  4.10it/s][A
 57%|█████▋    | 2838/4998 [05:28<06:44,  5.34it/s][A
 57%|█████▋    | 2840/4998 [05:28<06:35,  5.46it/s][A
 57%|█████▋    | 2842/4998 [05:28<07:27,  4.81it/s][A
 57%|█████▋    | 2844/4998 [05:29<06:23,  5.61it/s][A
 57%|█████▋    | 2846/4998 [05:30<09:21,  3.84it/s][A
 57%|█████▋    | 2848/4998 [05:30<08:45,  4.09it/s][A
 57%|█████▋    | 2851/4998 [05:30<06:30,  5.49it/s][A
 57%|█████▋    | 2853/4998 [05:30<06:11,  5.78it/s][A
 57%|█████▋    | 2855/4998 [05:31<05:29,  6.50it/s][A
 57%|█████▋    | 2859/4998 [05:31<04:39,  7.65it/s][A
 57%|█████▋    | 2861/4998 [05:32<06:37,  5.38it/s][A
 57%|█████

 63%|██████▎   | 3162/4998 [06:12<11:44,  2.61it/s][A
 63%|██████▎   | 3163/4998 [06:12<10:02,  3.04it/s][A
 63%|██████▎   | 3165/4998 [06:13<08:07,  3.76it/s][A
 63%|██████▎   | 3167/4998 [06:13<07:02,  4.33it/s][A
 63%|██████▎   | 3168/4998 [06:13<06:32,  4.66it/s][A
 63%|██████▎   | 3169/4998 [06:13<06:35,  4.62it/s][A
 63%|██████▎   | 3170/4998 [06:13<06:38,  4.59it/s][A
 63%|██████▎   | 3172/4998 [06:14<05:50,  5.21it/s][A
 64%|██████▎   | 3174/4998 [06:14<04:47,  6.33it/s][A
 64%|██████▎   | 3175/4998 [06:14<04:22,  6.94it/s][A
 64%|██████▎   | 3176/4998 [06:14<04:05,  7.41it/s][A
 64%|██████▎   | 3178/4998 [06:14<03:28,  8.74it/s][A
 64%|██████▎   | 3180/4998 [06:14<03:07,  9.70it/s][A
 64%|██████▎   | 3183/4998 [06:15<02:51, 10.59it/s][A
 64%|██████▎   | 3186/4998 [06:15<03:56,  7.66it/s][A
 64%|██████▍   | 3187/4998 [06:15<04:08,  7.28it/s][A
 64%|██████▍   | 3188/4998 [06:16<08:21,  3.61it/s][A
 64%|██████▍   | 3189/4998 [06:17<11:05,  2.72it/s][A
 64%|█████

 71%|███████   | 3558/4998 [07:01<02:58,  8.09it/s][A
 71%|███████▏  | 3562/4998 [07:01<02:23,  9.99it/s][A
 71%|███████▏  | 3564/4998 [07:01<02:46,  8.62it/s][A
 71%|███████▏  | 3566/4998 [07:02<04:39,  5.13it/s][A
 71%|███████▏  | 3567/4998 [07:02<04:22,  5.46it/s][A
 72%|███████▏  | 3574/4998 [07:02<03:10,  7.46it/s][A
 72%|███████▏  | 3577/4998 [07:03<03:15,  7.26it/s][A
 72%|███████▏  | 3579/4998 [07:04<06:02,  3.91it/s][A
 72%|███████▏  | 3583/4998 [07:04<04:29,  5.24it/s][A
 72%|███████▏  | 3585/4998 [07:04<03:53,  6.05it/s][A
 72%|███████▏  | 3587/4998 [07:06<07:24,  3.17it/s][A
 72%|███████▏  | 3589/4998 [07:06<06:37,  3.55it/s][A
 72%|███████▏  | 3593/4998 [07:06<04:51,  4.82it/s][A
 72%|███████▏  | 3596/4998 [07:07<04:30,  5.19it/s][A
 72%|███████▏  | 3599/4998 [07:07<03:41,  6.30it/s][A
 72%|███████▏  | 3601/4998 [07:07<03:26,  6.77it/s][A
 72%|███████▏  | 3603/4998 [07:08<04:55,  4.72it/s][A
 72%|███████▏  | 3606/4998 [07:08<03:49,  6.06it/s][A
 72%|█████

 79%|███████▉  | 3954/4998 [07:49<01:57,  8.88it/s][A
 79%|███████▉  | 3956/4998 [07:49<02:12,  7.88it/s][A
 79%|███████▉  | 3958/4998 [07:49<01:50,  9.43it/s][A
 79%|███████▉  | 3962/4998 [07:50<02:01,  8.51it/s][A
 79%|███████▉  | 3969/4998 [07:50<01:29, 11.44it/s][A
 79%|███████▉  | 3972/4998 [07:50<01:23, 12.26it/s][A
 80%|███████▉  | 3975/4998 [07:50<01:42,  9.97it/s][A
 80%|███████▉  | 3977/4998 [07:51<02:00,  8.48it/s][A
 80%|███████▉  | 3980/4998 [07:51<02:00,  8.43it/s][A
 80%|███████▉  | 3982/4998 [07:51<01:52,  9.01it/s][A
 80%|███████▉  | 3985/4998 [07:52<01:53,  8.94it/s][A
 80%|███████▉  | 3989/4998 [07:52<01:29, 11.25it/s][A
 80%|███████▉  | 3991/4998 [07:53<02:58,  5.65it/s][A
 80%|███████▉  | 3994/4998 [07:53<02:20,  7.14it/s][A
 80%|████████  | 3999/4998 [07:53<01:45,  9.50it/s][A
 80%|████████  | 4002/4998 [07:53<02:13,  7.44it/s][A
 80%|████████  | 4004/4998 [07:54<01:52,  8.85it/s][A
 80%|████████  | 4007/4998 [07:54<01:50,  8.98it/s][A
 80%|█████

 88%|████████▊ | 4390/4998 [08:34<01:51,  5.46it/s][A
 88%|████████▊ | 4393/4998 [08:35<01:49,  5.50it/s][A
 88%|████████▊ | 4395/4998 [08:35<01:56,  5.16it/s][A
 88%|████████▊ | 4399/4998 [08:35<01:26,  6.93it/s][A
 88%|████████▊ | 4403/4998 [08:36<01:15,  7.89it/s][A
 88%|████████▊ | 4405/4998 [08:36<01:06,  8.90it/s][A
 88%|████████▊ | 4409/4998 [08:36<00:56, 10.48it/s][A
 88%|████████▊ | 4415/4998 [08:36<00:48, 11.98it/s][A
 88%|████████▊ | 4418/4998 [08:37<01:02,  9.26it/s][A
 88%|████████▊ | 4420/4998 [08:37<00:54, 10.57it/s][A
 88%|████████▊ | 4422/4998 [08:37<01:03,  9.01it/s][A
 89%|████████▊ | 4424/4998 [08:38<01:13,  7.80it/s][A
 89%|████████▊ | 4426/4998 [08:38<01:06,  8.65it/s][A
 89%|████████▊ | 4428/4998 [08:38<00:59,  9.53it/s][A
 89%|████████▊ | 4430/4998 [08:38<01:19,  7.11it/s][A
 89%|████████▊ | 4433/4998 [08:39<01:10,  7.98it/s][A
 89%|████████▊ | 4435/4998 [08:39<01:18,  7.19it/s][A
 89%|████████▉ | 4438/4998 [08:39<01:02,  8.94it/s][A
 89%|█████

 94%|█████████▎| 4675/4998 [09:23<00:47,  6.74it/s][A
 94%|█████████▎| 4677/4998 [09:23<00:41,  7.70it/s][A
 94%|█████████▎| 4679/4998 [09:23<00:47,  6.74it/s][A
 94%|█████████▎| 4680/4998 [09:23<00:52,  6.11it/s][A
 94%|█████████▎| 4681/4998 [09:24<00:47,  6.67it/s][A
 94%|█████████▎| 4682/4998 [09:24<01:04,  4.88it/s][A
 94%|█████████▎| 4683/4998 [09:24<00:58,  5.37it/s][A
 94%|█████████▎| 4685/4998 [09:24<00:47,  6.59it/s][A
 94%|█████████▍| 4688/4998 [09:24<00:37,  8.25it/s][A
 94%|█████████▍| 4692/4998 [09:25<00:30,  9.94it/s][A
 94%|█████████▍| 4694/4998 [09:25<00:31,  9.75it/s][A
 94%|█████████▍| 4697/4998 [09:25<00:28, 10.55it/s][A
 94%|█████████▍| 4699/4998 [09:26<00:54,  5.52it/s][A
 94%|█████████▍| 4701/4998 [09:26<00:44,  6.71it/s][A
 94%|█████████▍| 4705/4998 [09:26<00:42,  6.92it/s][A
 94%|█████████▍| 4706/4998 [09:27<00:40,  7.28it/s][A
 94%|█████████▍| 4707/4998 [09:27<00:57,  5.10it/s][A
 94%|█████████▍| 4708/4998 [09:27<01:15,  3.84it/s][A
 94%|█████

In [398]:
df= pd.read_csv("tsv_files/" + TYPE +"/" + CORPUS +"/results_words1.tsv", sep="\t")
df.groupby(["Document", "Profession", "Name"]).mean().reset_index()

Unnamed: 0.1,Document,Profession,Name,Unnamed: 0,0,1,2,3,4,5,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
0,doc1,politycy,Jarosław Gowin,0,-0.077040,-0.280966,0.888300,0.159213,0.245626,0.056108,...,0.080211,-0.552011,-0.368075,-0.269050,0.388620,-0.389159,-0.556412,-0.038339,0.084515,0.260660
1,doc10,politycy,Maciej Rataj,0,-0.020054,-0.284341,0.463016,0.084573,0.338564,0.162313,...,0.159499,-0.388542,-0.749628,-0.477634,0.385759,-0.376551,-0.421735,0.052833,0.489614,0.217435
2,doc101,politycy,Irena Lipowicz,0,-0.105039,-0.296177,0.631779,0.111769,0.215479,0.265545,...,0.289515,-0.605164,-0.632973,-0.576649,0.332326,-0.460350,-0.496762,0.123326,0.203858,-0.101886
3,doc102,politycy,Donald Tusk,0,0.140507,-0.169071,0.364256,0.080722,0.384109,0.361359,...,0.289154,-0.500774,-0.549308,-0.341060,0.213128,-0.360049,-0.388345,0.332273,0.487607,0.087700
4,doc103,politycy,Joachim Brudziński,0,-0.025080,-0.288531,0.529188,0.138812,0.329439,0.141213,...,0.123808,-0.153015,-0.557275,-0.366970,0.220043,-0.469725,-0.580212,0.030807,0.273209,0.257715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,doc94,politycy,Waldemar Pawlak,0,-0.007887,-0.293649,0.437457,0.156701,0.340602,0.091654,...,0.087135,-0.139974,-0.659284,-0.553864,0.254653,-0.489999,-0.823888,-0.305623,-0.055890,0.362783
164,doc95,politycy,Włodzimierz Cimoszewicz,0,-0.067523,-0.330465,0.511320,0.000298,0.286162,0.271232,...,0.118256,-0.672239,-0.390406,-0.665017,0.473673,-0.410814,-0.529500,0.054179,0.165175,-0.106471
165,doc96,dziennikarze,Jerzy Giedroyc,0,0.036858,-0.314028,0.372681,0.051369,0.335927,0.161099,...,0.201822,-0.215329,-0.532317,-0.441958,0.221541,-0.465711,-0.434263,0.139180,0.407487,0.157164
166,doc98,politycy,Stanisław Żelichowski,0,0.005060,-0.243434,0.417410,0.121783,0.260104,0.215375,...,0.174754,-0.513138,-0.605559,-0.462636,0.372224,-0.375303,-0.466680,0.130805,0.329049,0.057914


## Example usage

1. Create API object

In [None]:
bert = BertBaseMultilingualEmbeddingApi()

2. Create sentence and pass to feed_forward method

In [None]:
sentence = "Oszczędnością i pracą ludzie się bogacą."
bert.feed_forward(sentence)

3. Generate word/sentence embedding, specify `how`

In [None]:
words_embeddings = bert.create_word_embedding_(how = "sum_last_4")

In [None]:
sent_embedding = bert.create_sentence_embedding_()

4. Visualize tensor

In [None]:
print(sent_embedding)

In [None]:
bert.plot_embedding_hist(sent_embedding)

## Visualize embeddings in tensorboard

In [None]:
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [None]:
LOG_DIR = 'minimalsample'
NAME_TO_VISUALISE_VARIABLE = "example_embeddings"
path_for_mnist_metadata =  'metadata.tsv'

In [None]:
token_embeddings_np = torch.stack(concatenated_lats_4_layers).numpy()

In [None]:
embedding_var = tf.Variable(token_embeddings_np, name=NAME_TO_VISUALISE_VARIABLE)
summary_writer = tf.summary.FileWriter(LOG_DIR)

In [None]:
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = embedding_var.name

# Specify where you find the metadata
embedding.metadata_path = path_for_mnist_metadata #'metadata.tsv'

# Say that you want to visualise the embeddings
projector.visualize_embeddings(summary_writer, config)

In [None]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.save(sess, os.path.join(LOG_DIR, "model.ckpt"), 1)

In [None]:
with open(path_for_mnist_metadata,'w') as f:
    f.write("Index\tLabel\n")
    for index,label in enumerate(tokenized_text):
        f.write("%d\t %s\n" % (index,label.encode('utf-8')))