## Prerequisites

In [None]:
# Mount Google Drive at /content/gdrive (the google.colab module is only
# available when running inside Google Colab).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os
from os import listdir
from os.path import join
import random
import re
import numpy as np
import json
from operator import itemgetter
import string

In [None]:
# Root directory of all data files used by this notebook.
main_path = 'gdrive/MyDrive/bot/data'

# fastText binary model, loaded with fasttext.load_model().
ft_path = os.path.join(main_path, 'fasttext/cc.pl.100.bin')
# NOTE(review): presumably the file passed as `vector_file` to
# similarity_by_embeddings_vectors_version — confirm against the caller.
vectors_path = os.path.join(main_path, 'open_subtitles/1MB_vectors')
# JSON file read by read_indexes_dict().
index_dict_path = os.path.join(main_path, 'sentence_similarity/index_dict2.json')
# Text corpus read by load_corpus_line_offset() and load_lines_from_corpus().
corpus_with_populars_path = os.path.join(main_path, 'sentence_similarity/corpus_with_populars2.txt')

### FastText

In [None]:
!pip install fasttext

In [None]:
import fasttext

# Load the fastText model stored at ft_path; `ft` is used below for word
# vectors (get_word_vector) and nearest neighbours (get_nearest_neighbors).
ft = fasttext.load_model(ft_path)

### Bert

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
from transformers import BertForMaskedLM
from transformers import BertTokenizer
from transformers import pipeline

### Morfeusz

In [None]:
!pip install morfeusz2

In [None]:
import morfeusz2
# Shared Morfeusz instance; used below through morf.analyse() and morf.generate().
morf = morfeusz2.Morfeusz()

## MAIN CODE

In [None]:
# Cache of fastText vectors keyed by cleaned token, shared across calls.
word_vectors_dict = {}


def get_sentence_vector(sentence):
    """Return the mean fastText vector of the tokens in ``sentence``.

    The sentence is split on whitespace and every token is reduced to its
    letters and digits. Tokens that consisted only of punctuation become
    empty and are dropped, so they no longer inflate the divisor of the mean.

    Args:
        sentence: Text to embed.

    Returns:
        A 1-D numpy array: the arithmetic mean of the token vectors, or an
        all-zero vector of the model's dimension when no usable token is left
        (previously this case produced a NaN scalar).
    """
    # leave only letters and numbers
    cleaned = (re.sub(r'[\W_]+', '', raw) for raw in sentence.split())
    tokens = [token for token in cleaned if token]

    if not tokens:
        # Keep the return value a vector so callers can always take dot products.
        return np.zeros(ft.get_dimension())

    vectors = []
    for token in tokens:
        if token not in word_vectors_dict:
            word_vectors_dict[token] = ft.get_word_vector(token)
        vectors.append(np.asarray(word_vectors_dict[token]))

    return np.add.reduce(vectors) / len(vectors)

# get_sentence_vector("Jaki wektor ma to zdanie.")

In [None]:
def read_indexes_dict(path=None):
  """Load the index dictionary from a UTF-8 encoded JSON file.

  Args:
    path: JSON file to read. Defaults to the module-level ``index_dict_path``
      (resolved at call time), so existing zero-argument calls keep working.

  Returns:
    The decoded JSON object. Elsewhere in this file it is used as a mapping
    from a word to the corpus line numbers associated with that word.
  """
  if path is None:
    path = index_dict_path
  with open(path, "r", encoding="utf-8") as indexes:
    return json.load(indexes)

In [None]:
# Module-level index loaded once; similarity_by_embeddings looks words up in it.
index_dict = read_indexes_dict()

In [None]:
def load_corpus_line_offset(path=None):
  """Build the list of byte offsets at which each line of the corpus starts.

  The file is read once in binary mode, so the offsets are byte positions:
  entry ``i`` is where line ``i`` begins.

  Args:
    path: Corpus file to scan. Defaults to the module-level
      ``corpus_with_populars_path`` (resolved at call time), so existing
      zero-argument calls keep working.

  Returns:
    A list of ints, one per line in the file (empty for an empty file).
  """
  if path is None:
    path = corpus_with_populars_path
  line_offset = []
  offset = 0
  with open(path, "rb") as file:
    for line in file:
      line_offset.append(offset)
      # len() of a bytes line is its size on disk, including the newline.
      offset += len(line)
  return line_offset

In [None]:
# Byte offset of every corpus line, computed once; load_lines_from_corpus seeks with it.
line_offset = load_corpus_line_offset()

In [None]:
def find_cosine_similarity(A, B):
    """Return the cosine similarity of two vectors.

    Args:
        A: First vector (numpy array or plain sequence of numbers).
        B: Second vector, same length as ``A``.

    Returns:
        ``dot(A, B) / (|A| * |B|)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of the NaN that
        the raw 0/0 division would produce, so results stay comparable and
        sortable.
    """
    a = np.asarray(A)
    b = np.asarray(B)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [None]:


def similarity_by_embeddings_vectors_version(sentence_in, how_many, vector_file):
    """Rank precomputed sentence vectors by similarity to ``sentence_in``.

    ``vector_file`` is read line by line; each line is parsed as a JSON
    object whose first item is taken as a ``sentence: vector`` pair. Only
    sentences with cosine similarity above 0.5 are kept.

    Args:
        sentence_in: Query sentence.
        how_many: Maximum number of results to return.
        vector_file: Path of the JSON-lines file with sentence vectors.

    Returns:
        A dict mapping sentence to similarity, holding at most ``how_many``
        entries ordered from most to least similar.
    """
    query_vector = get_sentence_vector(sentence_in)
    scores = {}
    with open(vector_file, 'r', encoding='utf8') as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            candidate, candidate_vector = list(record.items())[0]
            score = find_cosine_similarity(query_vector, candidate_vector)
            if score > 0.5:
                scores[candidate] = score

    ranked = sorted(scores.items(), key=itemgetter(1), reverse=True)
    return dict(ranked[:how_many])

def get_synonyms(words):
  """Collect the fastText nearest neighbours of every word in ``words``.

  For each word the 5 nearest neighbours are requested from ``ft``; ASCII
  punctuation (``string.punctuation``) is stripped from every neighbour.

  Args:
    words: Iterable of query words.

  Returns:
    A set with the cleaned neighbour strings of all words. The input words
    themselves are not added.
  """
  punctuation_table = str.maketrans('', '', string.punctuation)
  collected = set()
  for word in words:
    neighbours = ft.get_nearest_neighbors(word, k=5)
    # Each neighbour entry is indexed as (score, word); keep only the word.
    collected.update(entry[1].translate(punctuation_table) for entry in neighbours)
  return collected

# newest version 29.08
def similarity_by_embeddings(input_message, words, how_many=1):
  """Return the corpus sentence most similar to ``input_message``.

  The nearest neighbours of ``words`` (via ``get_synonyms``) are looked up in
  ``index_dict`` to select candidate corpus lines; every candidate is then
  scored by cosine similarity between its sentence vector and the vector of
  ``input_message``.

  Args:
    input_message: Sentence to find a match for.
    words: Key words of the message used to select candidate lines.
    how_many: Accepted for backward compatibility; currently unused, a
      single best sentence is always returned.

  Returns:
    The candidate sentence with the highest similarity. On ties the
    candidate encountered first wins.

  Raises:
    IndexError: If no candidate with a finite similarity score exists.
  """
  mini_index_dict = {}
  for word in get_synonyms(words):
    if word in index_dict:
      mini_index_dict[word] = set(index_dict[word])

  corpus_sentences = load_lines_from_corpus(mini_index_dict)
  input_message_vector = get_sentence_vector(input_message)

  best_answer = None
  best_score = None
  for sentence in corpus_sentences:
    score = find_cosine_similarity(input_message_vector, get_sentence_vector(sentence))
    # NaN (or otherwise non-finite) scores cannot be ordered; skip them so
    # they cannot corrupt the ranking.
    if not np.all(np.isfinite(score)):
      continue
    # Single pass instead of sorting every candidate just to take the first.
    if best_score is None or score > best_score:
      best_answer, best_score = sentence, score

  if best_answer is None:
    raise IndexError("no comparable corpus sentence found for the given words")
  return best_answer

def load_lines_from_corpus(mini_index_dict, corpus_path=None, offsets=None):
  """Read the corpus lines referenced by ``mini_index_dict``.

  The corpus is opened in binary mode because the offsets are byte
  positions; seeking a text-mode file to arbitrary byte positions is not
  guaranteed to work. Each line is decoded as UTF-8 on its own, so one
  undecodable line is skipped without affecting its neighbours.

  Args:
    mini_index_dict: Mapping whose values are iterables of line numbers.
    corpus_path: Corpus file to read. Defaults to the module-level
      ``corpus_with_populars_path``.
    offsets: Sequence giving the byte offset of each line. Defaults to the
      module-level ``line_offset``.

  Returns:
    A set of the decoded lines, each including its trailing newline.

  Raises:
    IndexError: If a line number is outside ``offsets``.
  """
  if corpus_path is None:
    corpus_path = corpus_with_populars_path
  if offsets is None:
    offsets = line_offset

  # Several words often point at the same line; read each line only once.
  wanted = set()
  for indexes in mini_index_dict.values():
    wanted.update(indexes)

  corpus_sentences = set()
  with open(corpus_path, "rb") as file:
    # Ascending order keeps the seeks moving forward through the file.
    for i in sorted(wanted):
      file.seek(offsets[i])
      try:
        sentence = file.readline().decode("utf-8")
      except UnicodeDecodeError:
        continue
      # Text mode used to translate "\r\n" to "\n"; keep returning that form.
      if sentence.endswith("\r\n"):
        sentence = sentence[:-2] + "\n"
      corpus_sentences.add(sentence)
  return corpus_sentences

In [None]:
#similarity_by_embeddings("Bardzo lubię czytać czasopisma", ['lubię', 'czytać', 'czasopisma'])

## Word generator

In [None]:
def get_pred_model():
    """Build the fill-mask pipeline used for word generation.

    Both the masked-language model and its tokenizer are loaded from the
    "allegro/herbert-large-cased" checkpoint.

    Returns:
        A transformers "fill-mask" pipeline wrapping that model and tokenizer.
    """
    checkpoint = "allegro/herbert-large-cased"
    return pipeline(
        "fill-mask",
        model=BertForMaskedLM.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )

In [None]:
# Shared fill-mask pipeline; generate() calls it and reads model.tokenizer.mask_token.
model = get_pred_model()

In [None]:
def find_all_subjects_and_verbs(sentence):
  """Split the Morfeusz analysis of ``sentence`` into nouns and verbs.

  Every interpretation returned by ``morf.analyse`` is classified by the
  first ':'-separated component of its tag (element 2 of the
  interpretation). 'subst' interpretations contribute element 0, verb-like
  interpretations contribute element 1.
  NOTE(review): elements 0 and 1 are presumably the surface form and the
  lemma — confirm against the Morfeusz documentation.

  Args:
    sentence: Text to analyse.

  Returns:
    A ``(subjects, verbs)`` tuple of lists, one entry per matching
    interpretation (a word with several interpretations may appear more
    than once).
  """
  verb_like_tags = ('verb', 'refl', 'nonrefl', 'perf', 'imperf', 'imperf.perf', 'praet', 'inf', 'fin')
  noun_entries = []
  verb_entries = []
  for _start, _end, interpretation in morf.analyse(sentence):
    leading_tag = interpretation[2].split(':')[0]
    if leading_tag == 'subst':
      noun_entries.append(interpretation[0])
      continue
    if leading_tag in verb_like_tags:
      verb_entries.append(interpretation[1])
  return noun_entries, verb_entries

In [None]:
def get_first_person(verb):
  """Return a first-person singular form of ``verb``.

  All forms produced by ``morf.generate(verb)`` are scanned; a form
  qualifies when its tag string contains both "sg" and "pri". When several
  forms qualify, the last one wins.

  Args:
    verb: Verb passed to ``morf.generate`` (callers in this file pass the
      values collected by ``find_all_subjects_and_verbs``).

  Returns:
    The matching form, or an empty string when no form qualifies.
  """
  first = ""
  for elem in morf.generate(verb):
    try:
      form, _lemma, tag, _name, _labels = elem
      matches = "sg" in tag and "pri" in tag
    except (TypeError, ValueError):
      # Malformed entry (unexpected length or tag type): skip it, but let
      # unrelated errors such as KeyboardInterrupt propagate.
      continue
    if matches:
      first = form
  return first

In [None]:
def get_synonym(word):
  """Pick one of the 5 fastText nearest neighbours of ``word`` at random.

  Each neighbour entry is indexed as ``(score, word)``; the scores are used
  as the sampling weights, so neighbours with higher scores are more likely
  to be chosen.

  Args:
    word: Word to find a replacement for.

  Returns:
    One neighbour string.
  """
  nearest = ft.get_nearest_neighbors(word, k=5)
  candidates = [entry[1] for entry in nearest]
  scores = [entry[0] for entry in nearest]
  picked = random.choices(candidates, weights=scores)
  return picked[0]

In [None]:
def get_noun_and_verb(sentence):
  """Choose a random noun synonym and a first-person verb for ``sentence``.

  Nouns and verbs come from ``find_all_subjects_and_verbs``. Each verb is
  converted with ``get_first_person`` and each noun is replaced by a random
  neighbour from ``get_synonym``.

  Args:
    sentence: Input sentence to analyse.

  Returns:
    A ``(noun, verb)`` tuple of strings. The noun falls back to '.' when the
    sentence has no nouns; the verb falls back to 'jestem' when no usable
    first-person form was found.
  """
  nouns, verbs = find_all_subjects_and_verbs(sentence)
  # get_first_person returns "" when it finds no form; drop those so an
  # empty string is never picked in place of the 'jestem' fallback.
  verbs_first = [form for form in map(get_first_person, verbs) if form]
  nouns_synonyms = [get_synonym(noun) for noun in nouns]

  random_noun = random.choice(nouns_synonyms) if nouns_synonyms else '.'
  random_verb = random.choice(verbs_first) if verbs_first else 'jestem'
  return random_noun, random_verb

In [None]:
# Sanity check of the fill-mask pipeline on a simple sentence with one masked token.
model(f"Stolicą Polski jest {model.tokenizer.mask_token}.")

In [None]:
def generate(sentence):
  """Generate a short reply to ``sentence`` with the fill-mask pipeline.

  A noun and a verb are chosen by ``get_noun_and_verb`` and printed. The
  pipeline then fills a mask placed between them (result at index 0 is
  kept), and a second mask placed in front of that text (result at index 1
  is kept).

  Args:
    sentence: Input sentence to respond to.

  Returns:
    The 'sequence' string of the second fill-mask call.
  """
  noun, verb = get_noun_and_verb(sentence)
  print(verb, noun)
  mask = model.tokenizer.mask_token
  middle_filled = model(f"{verb} {mask} {noun}")[0]['sequence']
  prefix_candidates = model(f"{mask} {middle_filled}")
  return prefix_candidates[1]['sequence']

In [None]:
generate("Czy lubisz jeść czekoladę?")

In [None]:
generate("Czy chciałbyś umieć latać?")

In [None]:
generate("Gdzie jedziesz na wakacje?")

In [None]:
generate("O której się budzisz?")

In [None]:
generate("Jakie masz ulubione danie?")