# NLP

## 1. Parsing

In [4]:
import requests
from bs4 import BeautifulSoup

def parse(url):
    """Download a news page and extract its headline and body text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        dict with two keys: ``"title"`` (text of the first ``span`` with
        class ``topic-body__title``, or ``""`` when the page has none) and
        ``"text"`` (the text of every ``p`` with class
        ``topic-body__content-text``, each followed by a single space).

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP status.
    """
    # Without a timeout, requests can wait indefinitely on a stalled server.
    page = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")

    # find() returns None when the element is absent; fall back to an empty
    # title rather than raising AttributeError.
    title_node = soup.find('span', 'topic-body__title')
    title = title_node.text if title_node is not None else ''

    paragraphs = soup.find_all('p', 'topic-body__content-text')
    page_text = ''.join(paragraph.text + " " for paragraph in paragraphs)
    return {"title": title, "text": page_text}

## 2. Translation

In [5]:
!pip install googletrans==4.0.0-rc1

from googletrans import Translator

def translate(text):
  """Translate Russian text to English with googletrans.

  Args:
    text: Source string (treated as Russian).

  Returns:
    The English translation. Empty or whitespace-only input is returned
    unchanged without contacting the translation service.
  """
  # Nothing to translate: skip the network round-trip entirely.
  if not text or not text.strip():
    return text
  trans = Translator()
  return trans.translate(text, src='ru', dest='en').text



## 3. NER

In [6]:
!pip install gliner
!pip install fuzzywuzzy

Collecting gliner
  Using cached gliner-0.2.6-py3-none-any.whl (43 kB)
Collecting seqeval (from gliner)
  Using cached seqeval-1.2.2.tar.gz (43 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting onnxruntime (from gliner)
  Downloading onnxruntime-1.18.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.0.0->gliner)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.0.0->gliner)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.0.0->gliner)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2.0.0-

In [7]:
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import fuzz
import numpy as np
import re
from gliner import GLiNER

# Load the GLiNER checkpoint once for the whole notebook; get_all_companies()
# reads this module-level `model` on every call.
model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")

def get_all_companies(text):
  """Run the module-level GLiNER ``model`` over ``text`` for the "ORG" label.

  Returns whatever ``model.predict_entities`` produces for that single label.
  """
  return model.predict_entities(text, ["ORG"])

def get_unique_companies(companies):
  """Return the distinct entity texts, in order of first appearance.

  Args:
    companies: Iterable of entity dicts, each with a ``'text'`` key.

  Returns:
    List of unique ``'text'`` values. The order is deterministic
    (first-seen), unlike ``list(set(...))`` whose order can change between
    interpreter runs because string hashing is randomized.
  """
  # dict.fromkeys de-duplicates while preserving insertion order.
  return list(dict.fromkeys(company['text'] for company in companies))

def jaro_winkler_similarity_matrix(strings):
    """Build a symmetric matrix of pairwise similarity scores for ``strings``.

    Note: despite the function's name, each score is computed with
    ``fuzz.WRatio``.

    Returns:
        A ``len(strings)`` x ``len(strings)`` numpy array where entry (i, j)
        is the score between ``strings[i]`` and ``strings[j]``.
    """
    count = len(strings)
    scores = np.zeros((count, count))
    for row in range(count):
        left = strings[row]
        for col in range(row, count):
            # Score each unordered pair once and mirror it across the diagonal.
            scores[row, col] = scores[col, row] = fuzz.WRatio(left, strings[col])
    return scores

def clasterize_companies(companies):
  """Collapse near-duplicate company names to one representative per cluster.

  Names are grouped by ``clasterize`` and the shortest name of each group is
  kept.

  Args:
    companies: List of company-name strings.

  Returns:
    List with one name per cluster, in order of each cluster's first
    appearance. An empty input yields an empty list.
  """
  # Clustering an empty list would fail inside DBSCAN; there is nothing to do.
  if len(companies) == 0:
    return []
  return [min(group, key=len) for group in clasterize(companies).values()]


def clasterize(companies, eps=30):
  """Group similar company names with DBSCAN over fuzzy-match distances.

  The distance between two names is ``100 - similarity``, where the
  similarity comes from ``jaro_winkler_similarity_matrix``.

  Args:
    companies: List of company-name strings.
    eps: DBSCAN neighbourhood radius, in the same units as the distance
      (defaults to 30, the value previously hard-coded).

  Returns:
    dict mapping each DBSCAN cluster label to the list of names assigned to
    it. An empty input yields an empty dict.
  """
  # DBSCAN rejects an empty distance matrix, so short-circuit.
  if len(companies) == 0:
    return {}

  similarity_matrix = jaro_winkler_similarity_matrix(companies)

  # min_samples=1 makes every name a core point, so none is labelled as noise.
  db = DBSCAN(eps=eps, min_samples=1, metric="precomputed")
  labels = db.fit_predict(100 - similarity_matrix)

  clusters = {}
  for label, company in zip(labels, companies):
    clusters.setdefault(label, []).append(company)
  return clusters

def get_companies(text):
  """Extract a de-duplicated list of company names from ``text``.

  Chains ``get_all_companies`` -> ``get_unique_companies`` ->
  ``clasterize_companies``. Extraction is best-effort: if any step raises,
  a warning describing the failure is emitted and an empty list is returned.

  Args:
    text: Text to search for organisation names.

  Returns:
    List of representative company names, or ``[]`` on failure.
  """
  import warnings

  try:
    return clasterize_companies(get_unique_companies(get_all_companies(text)))
  except Exception as exc:
    # `except Exception` (not a bare except) lets KeyboardInterrupt and
    # SystemExit propagate; the warning keeps the failure visible.
    warnings.warn(f"get_companies failed, returning no companies: {exc!r}")
    return []



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

gliner_multitask_performance.png:   0%|          | 0.00/76.8k [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.76G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [8]:
import re


def extract_context(text, aspect):
    """Return the sentences of ``text`` surrounding each mention of ``aspect``.

    For every sentence that contains ``aspect`` (case-sensitive substring
    match), that sentence and its immediate neighbours (one before, one
    after) are kept. Overlapping windows are merged, so each sentence appears
    at most once and the original order is preserved.

    Args:
        text: Full text to search.
        aspect: Substring to look for.

    Returns:
        The selected sentences joined by single spaces, or ``""`` when
        ``aspect`` does not occur.
    """
    # Split on the whitespace that follows sentence-ending punctuation, so
    # sentences carry no leading spaces and "3.5" is not split in half.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

    # Collect the indices of every sentence inside some context window; a set
    # prevents duplicates when windows overlap.
    selected = set()
    for index, sentence in enumerate(sentences):
        if aspect in sentence:
            start_index = max(0, index - 1)
            end_index = min(len(sentences), index + 2)
            selected.update(range(start_index, end_index))

    # Join the chosen sentences back into one text, in document order.
    return ' '.join(sentences[i] for i in sorted(selected))

## 4. ABSA

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load the ABSA model and tokenizer (both from the same checkpoint name)
absa_model_name = "yangheng/deberta-v3-base-absa-v1.1"
absa_tokenizer = AutoTokenizer.from_pretrained(absa_model_name)
absa_model = AutoModelForSequenceClassification.from_pretrained(absa_model_name)

# Wrap model + tokenizer in a text-classification pipeline; absa() calls this
# module-level `classifier`.
classifier = pipeline("text-classification", model=absa_model, tokenizer=absa_tokenizer)

# Alternative: to score only the sentences around the aspect, build the prompt
# from extract_context(text, aspect) instead of the full text.
def absa(text, aspect):
  """Classify the sentiment expressed towards ``aspect`` within ``text``.

  Both strings are packed into a single '[CLS] text [SEP] aspect [SEP]'
  prompt and passed to the module-level ``classifier`` pipeline, whose output
  is returned unchanged.
  """
  prompt = f'[CLS] {text} [SEP] {aspect} [SEP]'
  return classifier(prompt)

## 5. Finalize

In [11]:
def process_url(url):
  """Run the full pipeline on one article URL.

  Steps: parse the page, translate its body text, extract company names,
  then classify the sentiment towards each company.

  Returns:
    dict mapping each company name to the lower-cased label of the first
    classifier result for that company.
  """
  # 1. Parsing -> 2. Translation
  article_text = parse(url)['text']
  english_text = translate(article_text)

  # 3. NER -> 4. ABSA, one classification per detected company
  return {
      name: absa(english_text, name)[0]['label'].lower()
      for name in get_companies(english_text)
  }

In [12]:
process_url('https://lenta.ru/news/2024/07/01/tesla-falls/')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'Reuters': 'negative',
 'Ilona Mask': 'negative',
 'Tesla': 'negative',
 'Barclays': 'negative'}

# Metrics

In [17]:
import json

In [49]:
# Accepted spellings per sentiment class. label_hash() consults the negative
# and neutral lists; any label matching neither is mapped to 2.
Negative_vars = ['neg', 'negative', '-'] # -> 0
Neutral_vars = ['neutral', 'neu', '+-', '='] # -> 1
Positive_vars = ['pos', 'positive', '+'] # -> 2

def label_hash(label):
  """Map a sentiment label string to a class id.

  Matching is case-insensitive. Returns 0 when the label contains 'neg' or is
  listed in ``Negative_vars``, 1 when it contains 'neu' or is listed in
  ``Neutral_vars``, and 2 for everything else.
  """
  normalized = label.lower()
  if normalized in Negative_vars or 'neg' in normalized:
    return 0
  if normalized in Neutral_vars or 'neu' in normalized:
    return 1
  return 2

def get_metrics(predicted, y):
  """Pair predicted and reference sentiment classes for matched companies.

  For each document, reference and predicted company names are clustered
  together with ``clasterize``. Every cluster of more than one name that
  holds at least one reference name and one predicted name contributes one
  (prediction, reference) pair; if a cluster holds several names from one
  side, the last one in cluster order is used.

  Args:
    predicted: List of dicts ``{company name: sentiment label}``, one per
      document, aligned by index with ``y``.
    y: List of reference dicts in the same format.

  Returns:
    Tuple ``(pred, true)`` of equal-length lists of class ids as produced by
    ``label_hash``.
  """
  pred = []
  true = []
  for i, true_labels in enumerate(y):
    predicted_labels = predicted[i]
    # No pair can be formed when either side has no names; skipping also
    # avoids clustering an empty list.
    if not true_labels or not predicted_labels:
      continue

    clusters = clasterize(list(true_labels) + list(predicted_labels))
    for companies_group in clusters.values():
      if len(companies_group) == 1:
        continue
      true_company = ''
      pred_company = ''
      for company in companies_group:
        if company in true_labels:
          true_company = company
        if company in predicted_labels:
          pred_company = company
      # An empty string means "no name from that side in this cluster".
      if true_company and pred_company:
        true.append(label_hash(true_labels[true_company]))
        pred.append(label_hash(predicted_labels[pred_company]))
  return pred, true
def NER_metric(predicted, y):
  '''
  Fraction of reference companies that were matched by a prediction.

  For each document, reference and predicted names are clustered together
  with ``clasterize``; a reference company counts as found when its cluster
  contains at least one predicted name.

  predicted - list of dicts {company name: label}, aligned by index with y
  y - list of dicts {company name: label} (reference annotations)

  Returns matched / total over all reference companies, or 0.0 when there
  are no reference companies at all.
  '''
  matched = 0
  total = 0
  for i, true_labels in enumerate(y):
    true_companies = list(true_labels)
    total += len(true_companies)

    predicted_companies = list(predicted[i])
    # Nothing can match if either side is empty; skipping also avoids
    # clustering an empty list.
    if not true_companies or not predicted_companies:
      continue

    predicted_names = set(predicted_companies)
    clusters = clasterize(true_companies + predicted_companies)

    # Record, for every name, whether its cluster holds a predicted name.
    # Checking for a prediction (not just cluster size > 1) prevents two
    # similar reference names from counting each other as "found".
    cluster_has_prediction = {}
    for companies_group in clusters.values():
      has_prediction = any(name in predicted_names for name in companies_group)
      for name in companies_group:
        cluster_has_prediction[name] = has_prediction

    matched += sum(1 for name in true_companies if cluster_has_prediction[name])

  # Guard against division by zero when no reference companies exist.
  return matched / total if total else 0.0

In [39]:
 with open("/content/data.json") as f:
  data = json.load(f)

In [40]:
from tqdm import tqdm

In [41]:
# Run the pipeline over the evaluation records, keeping the reference labels
# and the predictions aligned by position.
# NOTE(review): the final record of `data` is skipped here - confirm that
# excluding it is intentional.
predicted, true_list = [], []
for record in tqdm(data[:-1]):
  true_list.append(record["companies"])
  predicted.append(process_url(record["url"]))

100%|██████████| 46/46 [10:23<00:00, 13.55s/it]


In [21]:
predicted

[{'Roskomnadzor': 'negative'},
 {'New York Post': 'negative'},
 {'State Duma': 'positive',
  'World Economic Forum': 'positive',
  'Council of Europe': 'positive',
  'PACE': 'positive',
  'TASS': 'positive'},
 {'The Servant of the People': 'neutral', 'Verkhovna Rada': 'neutral'},
 {'News.ru': 'neutral', 'Rosstat': 'neutral', 'EGN': 'neutral'},
 {'portal of legal information': 'neutral'},
 {'Institute of Engineering and Digital Technologies': 'positive',
  'BELGU': 'positive',
  'science TV': 'positive',
  'Belgorod State National Research University': 'positive'},
 {'Roskosmos': 'negative',
  'FRAGAT-made': 'negative',
  'Arianespace': 'negative',
  'OneWeb': 'negative',
  'Lavochka': 'negative'},
 {'PLA': 'neutral', 'Sinhua': 'neutral'},
 {'URA.RU': 'negative',
  'Ministry of Defense': 'negative',
  'Lenta.ru': 'negative'},
 {'Transpenseri International -Russia': 'neutral'},
 {'Verkhovna Rada': 'negative', 'Privatbank': 'negative'},
 {'Tiktok': 'neutral', 'Moscow Komsomolets': 'neutra

In [36]:
true_list

[{'Роскомнадзор': 'neu'},
 {'Lyst': 'pos'},
 {'Госдума': 'pos'},
 {'Страна.ua': 'neg', 'Верховная рада': 'neg'},
 {'News.ru': 'pos'},
 {'': 'neg'},
 {'Наука ТВ': 'pos'},
 {'Роскосмос': 'neg'},
 {'Синьхуа': 'pos'},
 {'URA.RU': 'pos'},
 {'Новая газета': 'neu'},
 {'Страна.ua': 'pos'},
 {'Московский комсомолец': 'neg'},
 {'Калуга': 'pos'},
 {'Ленты.ру': 'pos'},
 {'Тайга.Инфо': 'neu'},
 {'Ростелеком': 'pos', 'Нетрис': 'pos'},
 {'Единая Россия': 'neg'},
 {'РИА Новости': 'pos'},
 {'РИА Новости': 'pos'},
 {'Twitter': 'pos'},
 {'YouTube': 'neg'},
 {'НАТО': 'neg'},
 {'Ленте.ру': 'neg'},
 {'BBC News': 'neg'},
 {'Twitter': 'neu'},
 {'Звезда': 'neu'},
 {'Microsoft': 'neg'},
 {'РИА Новости': 'neg'},
 {'Ленты.ру': 'pos'},
 {'Anadolu': 'neu'},
 {'Страна.ua': 'neg'},
 {'The Guardian': 'neg'},
 {'': 'pos'},
 {'Ленты.ру': 'neu'},
 {'': 'neg'},
 {'Роспотребнадзор': 'pos'},
 {'': 'neg'},
 {'Metro': 'neu'},
 {'Страна.ua': 'neu'},
 {'Московский комсомолец': 'neu'},
 {'Интерфакс': 'neg', 'NielsenIQ': 'neg'},


In [42]:
# Translate the reference company names so they can be compared with the
# predictions, which are extracted from translated text.
new_list = []
for item in true_list:
  dictionary = {}
  for key, label in item.items():
    try:
      translated_key = translate(key)
    except Exception:
      # Best-effort: keep the original name when translation fails.
      # `except Exception` still lets KeyboardInterrupt stop the loop.
      translated_key = key
    dictionary[translated_key] = label
  new_list.append(dictionary)

In [50]:
pred, true = get_metrics(predicted, new_list)

In [44]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
confusion_matrix(true, pred)

array([[ 7,  8,  0],
       [ 0, 12,  0],
       [ 0,  1,  5]])

In [45]:
f1_score(true, pred, average='macro')

0.7575757575757577

In [46]:
accuracy_score(true, pred)

0.7272727272727273

In [47]:
NER_metric(predicted, new_list)

0.66