# **Импорт библиотек**



In [None]:
import spacy
# Shared spaCy pipeline loaded from the "en_core_web_sm" package; lemma() and
# the create_dependency_graph* functions below call it to parse their input.
nlp = spacy.load("en_core_web_sm")

In [None]:
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex
import re

In [None]:
import re
from transformers import BertTokenizer, BertModel
import torch
from sklearn.decomposition import PCA

In [None]:
from sklearn.decomposition import TruncatedSVD

import torch
from transformers import BertTokenizer, BertModel

# Module-level BERT tokenizer and model used by get_bert_vectors().
# output_hidden_states=True makes the forward pass also return the per-layer
# hidden states, which get_bert_vectors() reads from outputs[2].
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

In [None]:
import networkx as nx

In [None]:
import matplotlib.pyplot as plt

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# **Инициализация функций**

In [None]:
# Lemmatization
def lemma(word):
    """Return the lemma of the first token spaCy finds in ``word``.

    The text is run through the module-level ``nlp`` pipeline; only the
    first token of the parsed result is considered, so any further tokens
    in ``word`` are ignored.
    """
    first_token = nlp(word)[0]
    return first_token.lemma_

In [None]:
# Removal of isolated vertices
def remove_isolated_nodes(graph):
    """Delete every node of ``graph`` whose degree is zero (in place).

    The nodes to drop are collected first and removed afterwards, so the
    graph is never modified while it is being iterated. Returns ``None``.
    """
    lonely = []
    for vertex in graph.nodes():
        if graph.degree(vertex) == 0:
            lonely.append(vertex)
    graph.remove_nodes_from(lonely)

In [None]:
# Look up the embedding that belongs to a word
def find_embedding(word, bert_text):
    """Return the vector paired with ``word`` in ``bert_text``.

    Args:
        word: The word to look up.
        bert_text: Iterable of ``(token, vector)`` pairs, e.g. the result of
            ``get_bert_vectors``.

    Returns:
        The vector of the first pair whose token equals ``word`` exactly.
        If there is no exact match, the vector of the first pair whose token
        equals ``word.lower()``; ``None`` when neither is found.

    The lower-case fallback exists because this file loads the
    ``bert-base-uncased`` tokenizer, which lower-cases its tokens, while the
    callers pass the original-case text of spaCy tokens. Without it every
    capitalised word would be reported as missing.
    """
    lowered = word.lower() if isinstance(word, str) else word
    fallback = None
    have_fallback = False

    for elem in bert_text:
        token = elem[0]
        if token == word:
            # An exact match always wins, even after a fallback was seen.
            return elem[1]
        if not have_fallback and token == lowered:
            fallback = elem[1]
            have_fallback = True

    return fallback

In [None]:
# Compute one BERT embedding per whole word of a sentence
def get_bert_vectors(text):
    """Return ``(word, vector)`` pairs for the words of ``text``.

    The text is wrapped in ``[CLS]`` / ``[SEP]``, tokenized with the
    module-level ``tokenizer`` and passed through the module-level ``model``.
    Each token vector is the sum of the model's last four hidden layers;
    WordPiece continuation tokens (prefixed with ``##``) are merged back into
    the preceding word by summing their vectors and joining their text.

    Args:
        text: A single sentence as a plain string.

    Returns:
        A list of ``(word, numpy_vector)`` tuples in sentence order, with the
        entries for ``[CLS]`` and ``[SEP]`` removed.

    NOTE(review): no truncation is applied, so inputs longer than the
    model's maximum sequence length are expected to fail - confirm.
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])

    # The second positional parameter of BertModel.forward is
    # ``attention_mask``, so the all-ones tensor this function has always
    # passed there acts as an attention mask (attend to every token), not as
    # segment ids. Pass it by keyword under its real name.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = model(input_ids=tokens_tensor, attention_mask=attention_mask)
    # Index 2 holds the per-layer hidden states (the model is loaded with
    # output_hidden_states=True); each has shape (1, n_tokens, hidden).
    hidden_states = outputs[2]

    # Sum the last four layers for every token in one tensor operation and
    # drop the batch dimension -> (n_tokens, hidden).
    token_vecs_sum = torch.stack(hidden_states[-4:], dim=0).sum(dim=0)[0]

    word_vectors = []
    current_word = ""
    current_vector = None

    for token_str, token_vec in zip(tokenized_text, token_vecs_sum):
        if token_str.startswith("##") and current_vector is not None:
            # Continuation piece: extend the word being assembled. The
            # out-of-place add keeps the per-token tensor rows untouched.
            current_word += token_str[2:]
            current_vector = current_vector + token_vec
            continue

        # A new word starts here; flush the previous one first.
        if current_vector is not None:
            word_vectors.append((current_word, current_vector.numpy()))
        current_word = token_str
        current_vector = token_vec

    if current_vector is not None:
        word_vectors.append((current_word, current_vector.numpy()))

    # Strip the [CLS] and [SEP] entries.
    return word_vectors[1:-1]

In [None]:
# Build the unigram graph
def create_dependency_graph(sentence):
    """Build a directed graph linking nouns that depend on the same verb.

    The text is parsed with the module-level ``nlp`` pipeline. For every
    token tagged ``VERB``, its direct children tagged ``NOUN`` (alphabetic
    text only) are collected. Each pair of nouns under the same verb is
    joined by an edge that points from the noun occurring earlier in the
    document to the later one and carries the verb's text as ``label``.

    Args:
        sentence: Text to analyse; it may contain several sentences.

    Returns:
        ``(G, embeddings)`` where ``G`` is an ``nx.DiGraph`` whose nodes are
        ``(text, token_index)`` tuples and ``embeddings`` maps each such noun
        to the value returned by ``find_embedding`` for it (``None`` when the
        word is not found among the sentence's BERT vectors).
    """
    doc = nlp(sentence)

    G = nx.DiGraph()
    verb_noun_dict = {}
    embeddings = {}

    for sent in doc.sents:
        embs = get_bert_vectors(sent.text)

        for token in sent:
            if token.pos_ != 'VERB':
                continue
            nouns = verb_noun_dict.setdefault((token.text, token.i), set())

            for child in token.children:
                if child.pos_ == 'NOUN' and child.text.isalpha():
                    noun = (child.text, child.i)
                    embeddings[noun] = find_embedding(noun[0], embs)
                    nouns.add(noun)

    for verb, nouns in verb_noun_dict.items():
        # Sorting by token index makes node insertion order deterministic
        # (set iteration order is not) and lets each pair be visited once:
        # the earlier noun is always the edge source.
        ordered = sorted(nouns, key=lambda item: item[1])
        G.add_nodes_from(ordered)
        for position, earlier in enumerate(ordered):
            for later in ordered[position + 1:]:
                G.add_edge(earlier, later, label=verb[0])

    return G, embeddings

In [None]:
# Build the bigram graph
def create_dependency_graph_bi(sentence):
    """Build a directed graph of (noun, dependent) bigrams sharing a verb.

    For every ``VERB`` token, each ``NOUN`` child (alphabetic, longer than
    one character) is combined with each of its own children that is
    alphabetic, longer than one character and not a verb, conjunction,
    determiner or adposition. The two words, ordered by token index, form a
    bigram node. Bigrams under the same verb that share no word are linked
    by an edge from the bigram whose first word occurs earlier to the other,
    labelled with the verb's text.

    Args:
        sentence: Text to analyse; it may contain several sentences.

    Returns:
        ``(G, embeddings)`` where ``G`` is an ``nx.DiGraph`` whose nodes are
        tuples of two ``(text, token_index)`` pairs and ``embeddings`` maps
        each bigram to the sum of its two word vectors. Words for which
        ``find_embedding`` returns ``None`` produce no bigram.
    """
    doc = nlp(sentence)

    G = nx.DiGraph()
    verb_bigram_dict = {}
    embeddings = {}
    # spaCy's English pipelines tag coordinating conjunctions as 'CCONJ';
    # 'CONJ' is kept so the filter stays a superset of the previous one.
    excluded_pos = {'VERB', 'CONJ', 'CCONJ', 'DET', 'ADP'}

    for sent in doc.sents:
        embs = get_bert_vectors(sent.text)

        for token in sent:
            if token.pos_ != 'VERB':
                continue
            bigrams = verb_bigram_dict.setdefault((token.text, token.i), set())

            for child in token.children:
                if not (child.pos_ == 'NOUN' and child.text.isalpha() and len(child.text) > 1):
                    continue
                noun = (child.text, child.i)
                embeddings_noun = find_embedding(noun[0], embs)
                if embeddings_noun is None:
                    # No vector for the noun: adding None below would raise.
                    continue

                for grandchild in child.children:
                    if not grandchild.text.isalpha() or len(grandchild.text) <= 1:
                        continue
                    if grandchild.pos_ in excluded_pos:
                        continue
                    dependent_word = (grandchild.text, grandchild.i)
                    embeddings_dependent_word = find_embedding(dependent_word[0], embs)
                    if embeddings_dependent_word is None:
                        continue

                    bigram = tuple(sorted([noun, dependent_word], key=lambda x: x[1]))
                    embeddings[bigram] = embeddings_noun + embeddings_dependent_word
                    bigrams.add(bigram)

    for verb, bigrams in verb_bigram_dict.items():
        # Order by token indices so node insertion is deterministic and the
        # earlier bigram of every pair is the edge source.
        ordered = sorted(bigrams, key=lambda gram: tuple(item[1] for item in gram))
        G.add_nodes_from(ordered)
        for position, earlier in enumerate(ordered):
            for later in ordered[position + 1:]:
                # Only link bigrams that have no word in common.
                if set(earlier).isdisjoint(later):
                    G.add_edge(earlier, later, label=verb[0])

    return G, embeddings

In [None]:
# Build the trigram graph
def create_dependency_graph_tri(sentence):
    """Build a directed graph of three-word dependency chains sharing a verb.

    For every ``VERB`` token, a chain noun -> child -> grandchild is
    collected: the noun is a ``NOUN`` child of the verb, the second word is
    a child of the noun that is not a verb or determiner, and the third is
    a child of the second word that is not a verb, determiner or
    adposition. All three must be alphabetic and longer than one character.
    The words, ordered by token index, form a trigram node. Trigrams under
    the same verb that share no word are linked by an edge from the trigram
    whose first word occurs earlier to the other, labelled with the verb's
    text.

    Args:
        sentence: Text to analyse; it may contain several sentences.

    Returns:
        ``(G, embeddings)`` where ``G`` is an ``nx.DiGraph`` whose nodes are
        tuples of three ``(text, token_index)`` pairs and ``embeddings``
        maps each trigram to the sum of its three word vectors. Words for
        which ``find_embedding`` returns ``None`` produce no trigram.
    """
    doc = nlp(sentence)

    G = nx.DiGraph()
    verb_trigram_dict = {}
    embeddings = {}
    # These are compared against ``pos_`` (the string tag). The integer
    # ``pos`` attribute never equals a string, so filtering on it is a no-op.
    excluded_second = {'VERB', 'DET'}
    excluded_third = {'VERB', 'DET', 'ADP'}

    for sent in doc.sents:
        embs = get_bert_vectors(sent.text)

        for token in sent:
            if token.pos_ != 'VERB':
                continue
            trigrams = verb_trigram_dict.setdefault((token.text, token.i), set())

            for child in token.children:
                if not (child.pos_ == 'NOUN' and child.text.isalpha() and len(child.text) > 1):
                    continue
                noun = (child.text, child.i)
                embeddings_noun = find_embedding(noun[0], embs)
                if embeddings_noun is None:
                    # No vector for the noun: adding None below would raise.
                    continue

                for grandchild in child.children:
                    if not grandchild.text.isalpha() or len(grandchild.text) <= 1:
                        continue
                    if grandchild.pos_ in excluded_second:
                        continue
                    dependent_word = (grandchild.text, grandchild.i)
                    embeddings_dependent_word = find_embedding(dependent_word[0], embs)
                    if embeddings_dependent_word is None:
                        continue

                    for grandgrandchild in grandchild.children:
                        if not grandgrandchild.text.isalpha() or len(grandgrandchild.text) <= 1:
                            continue
                        if grandgrandchild.pos_ in excluded_third:
                            continue
                        last_word = (grandgrandchild.text, grandgrandchild.i)
                        embeddings_last_word = find_embedding(last_word[0], embs)
                        if embeddings_last_word is None:
                            continue

                        trigram = tuple(sorted([noun, dependent_word, last_word], key=lambda x: x[1]))
                        embeddings[trigram] = embeddings_noun + embeddings_dependent_word + embeddings_last_word
                        trigrams.add(trigram)

    for verb, trigrams in verb_trigram_dict.items():
        # Order by token indices so node insertion is deterministic and the
        # earlier trigram of every pair is the edge source.
        ordered = sorted(trigrams, key=lambda gram: tuple(item[1] for item in gram))
        G.add_nodes_from(ordered)
        for position, earlier in enumerate(ordered):
            for later in ordered[position + 1:]:
                # Only link trigrams that have no word in common.
                if set(earlier).isdisjoint(later):
                    G.add_edge(earlier, later, label=verb[0])

    return G, embeddings

In [None]:
# Merge semantically close vertices
def merge_similar_vertices(graph, embeddings, similarity_threshold):
    """Contract pairs of vertices whose embeddings are nearly parallel.

    Args:
        graph: The networkx graph to simplify.
        embeddings: Mapping from vertex to its embedding vector. Vertices
            that are missing from the mapping, or mapped to ``None``, are
            left in the graph untouched and never merged.
        similarity_threshold: Two vertices are merged when the cosine
            similarity of their embeddings is strictly greater than this.

    Returns:
        The resulting graph. ``nx.contracted_nodes`` is called without
        ``copy=False``, so the result of a merge is a new graph object; when
        nothing is merged the input graph itself is returned.
    """
    # Keep only vertices that can actually be compared. Indexing
    # ``embeddings`` for every node would raise KeyError on a missing vertex,
    # and a ``None`` entry cannot be stacked into the similarity matrix.
    vertices = [node for node in graph.nodes() if embeddings.get(node) is not None]
    if len(vertices) < 2:
        # Nothing to compare; also avoids calling cosine_similarity on an
        # empty array, which it rejects.
        return graph

    vertex_embeddings = np.array([embeddings[node] for node in vertices])
    similarities = cosine_similarity(vertex_embeddings)

    for i, kept in enumerate(vertices):
        if kept not in graph:
            # Already absorbed into an earlier vertex.
            continue
        for j in range(i + 1, len(vertices)):
            absorbed = vertices[j]
            if absorbed in graph and similarities[i, j] > similarity_threshold:
                graph = nx.contracted_nodes(graph, kept, absorbed, self_loops=False)

    return graph