In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import copy
import re
from collections import defaultdict
from tqdm import tqdm
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer

# Preprocess texts

In [2]:
df = pd.read_csv("news.csv.zip", index_col=0)
df.head()

Unnamed: 0,url,text,company,cleaned_texts
0,https://www.finam.ru/publications/item/neftega...,Аналитики «Финама» подготовили стратегию по не...,gazprom,аналитик финама подготовить стратегия нефтегаз...
1,https://bonds.finam.ru/news/item/roznichnye-in...,"МОСКВА, 26 июл - РИА Новости/Прайм. Высокий ин...",gazprom,москва июл риа новости/прайм высокий интерес в...
2,https://www.finam.ru/publications/item/v-tsena...,"Цены на газ остаются высокими, хотя плавучие х...",gazprom,цена газ оставаться высокий хотя плавучий хран...
3,https://www.finam.ru/publications/item/gazprom...,«Газпром» в ходе аукциона выиграл крупные газо...,gazprom,газпром ход аукцион выиграть крупный газовый у...
4,https://www.finam.ru/publications/item/v-gazpr...,Оценивая перспективы инвестиции в акции «Газпр...,gazprom,оценивать перспектива инвестиция акция газпром...


In [3]:
RE_RUSSIAN_TEXT = re.compile("[а-яА-Я]+")
EXCLUDE_PATTERNS = re.compile("[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+")
RE_DERIVED = re.compile("\w+( -|- |-|! - )\w+")

stopwords_ru = stopwords.words("russian")
morph = MorphAnalyzer()

In [4]:
cleaned_text = []
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    report_page = RE_DERIVED.sub("", row["text"])
    report_page_lst = word_tokenize(report_page)
    tokens = []
    for word_ in filter(RE_RUSSIAN_TEXT.match, report_page_lst):
        if word_ and word_ not in stopwords_ru:
            word_ = word_.strip()
            word_ = morph.normal_forms(word_)[0]
            tokens.append(word_)
    cleaned_text.append(" ".join(tokens))

100%|██████████| 151/151 [00:09<00:00, 15.33it/s]


In [5]:
df["cleaned_texts"] = cleaned_text
df.to_csv("news.csv.zip")

# Cosine

In [6]:
e_topics = pd.read_csv("../topics/e_topic.csv.zip")
s_topics = pd.read_csv("../topics/s_topic.csv.zip")
g_topics = pd.read_csv("../topics/g_topic.csv.zip")
topics_words = pd.concat([e_topics, s_topics, g_topics])
topics_words.head()

Unnamed: 0,topic,word,type
0,Экология,аудит системы,1
1,Экология,взаимодействие,1
2,Экология,внедрение,1
3,Экология,воздействие,1
4,Экология,восстановительные мероприятия,1


In [7]:
re_dict = {}
sheets_dict = {}
base_dict = {}

for idx, row in topics_words.iterrows():
    if not sheets_dict.get(row["topic"], False):
        sheets_dict[row["topic"]] = {}
        base_dict[row["topic"]] = {}
    sheets_dict[row["topic"]][row["word"]] = row["type"]
    base_dict[row["topic"]][row["word"]] = 0

for key, val in sheets_dict.items():
    re_dict[key] = re.compile("(" + "(?!\w)|".join(val.keys()) + "(?!\w))")

In [8]:
df = pd.read_csv("news.csv.zip", index_col=0)
df.head()

Unnamed: 0,url,text,company,cleaned_texts
0,https://www.finam.ru/publications/item/neftega...,Аналитики «Финама» подготовили стратегию по не...,gazprom,аналитик финама подготовить стратегия нефтегаз...
1,https://bonds.finam.ru/news/item/roznichnye-in...,"МОСКВА, 26 июл - РИА Новости/Прайм. Высокий ин...",gazprom,москва июл риа новости/прайм высокий интерес в...
2,https://www.finam.ru/publications/item/v-tsena...,"Цены на газ остаются высокими, хотя плавучие х...",gazprom,цена газ оставаться высокий хотя плавучий хран...
3,https://www.finam.ru/publications/item/gazprom...,«Газпром» в ходе аукциона выиграл крупные газо...,gazprom,газпром ход аукцион выиграть крупный газовый у...
4,https://www.finam.ru/publications/item/v-gazpr...,Оценивая перспективы инвестиции в акции «Газпр...,gazprom,оценивать перспектива инвестиция акция газпром...


In [9]:
df["company"].value_counts()

gazprom    123
mmk         28
Name: company, dtype: int64

In [10]:
topics = []
for text in tqdm(df["cleaned_texts"].tolist()):
    paragraph_dict = copy.deepcopy(base_dict)
    found_words = False
    for key, regex in re_dict.items():
        words = regex.findall(text)
        for word in words:
            paragraph_dict[key][word] = 1
            found_words = True
    res = {}
    if not found_words:
        res = {key: 0 for key in sheets_dict}
        topics.append({**res, **{"max_score": None, "max_topic": None}})
        continue
    for key, vector in sheets_dict.items():
        sim = cosine_similarity(X=[list(vector.values())], Y=[list(paragraph_dict[key].values())])[0][0]
        res[key] = sim

    topics.append(
        {
            **res,
            **{
                "max_score": max(res.values()),
                "max_topic": max(res, key=lambda k: res[k]),
            },
        }
    )

100%|██████████| 151/151 [00:05<00:00, 25.97it/s]


In [11]:
df = df.join(pd.DataFrame(topics))
df.head()

Unnamed: 0,url,text,company,cleaned_texts,Экология,Климат,Энергия,Воздух,Вода,Отходы,...,Обучение и развитие,Оплата труда,Отношения потребителями,Отношения с потребителями,Отношения с работниками,Охрана здоровья,Профсоюзы и коллективные договоры,Трудовые отношения,max_score,max_topic
0,https://www.finam.ru/publications/item/neftega...,Аналитики «Финама» подготовили стратегию по не...,gazprom,аналитик финама подготовить стратегия нефтегаз...,0.190117,0.338062,0.177705,0.0,0.0,0.197386,...,0.2,0.30266,0.374166,0.271448,0.308607,0.244949,0.146647,0.233052,0.374166,Отношения потребителями
0,https://www.finam.ru/publications/item/priobre...,ММК завершила сделку по приобретению шахты им....,mmk,ммк завершить сделка приобретение шахта тихов ...,0.190117,0.338062,0.177705,0.0,0.0,0.197386,...,0.2,0.30266,0.374166,0.271448,0.308607,0.244949,0.146647,0.233052,0.374166,Отношения потребителями
1,https://bonds.finam.ru/news/item/roznichnye-in...,"МОСКВА, 26 июл - РИА Новости/Прайм. Высокий ин...",gazprom,москва июл риа новости/прайм высокий интерес в...,0.0,0.102869,0.0,0.0,0.0,0.0,...,0.0,0.195366,0.2,0.0,0.218218,0.141421,0.0,0.113047,0.248452,Налоги
1,https://www.finam.ru/publications/item/mmk-zav...,ММК завершила сделку по приобретению шахты име...,mmk,ммк завершить сделка приобретение шахта имя ти...,0.0,0.102869,0.0,0.0,0.0,0.0,...,0.0,0.195366,0.2,0.0,0.218218,0.141421,0.0,0.113047,0.248452,Налоги
2,https://www.finam.ru/publications/item/v-tsena...,"Цены на газ остаются высокими, хотя плавучие х...",gazprom,цена газ оставаться высокий хотя плавучий хран...,0.15523,0.154303,0.177705,0.154303,0.0,0.113961,...,0.0,0.0,0.2,0.102598,0.0,0.0,0.0,0.0,0.2,Отношения потребителями


# Expert

In [12]:
label2idx = {topic: i for i, topic in enumerate(sorted(topics_words["topic"].unique()))}
idx2label = {i: topic for i, topic in enumerate(sorted(topics_words["topic"].unique()))}

idx2word = {i: word for i, word in enumerate(topics_words["word"].tolist())}
word2idx = defaultdict(list)

for i, word in enumerate(topics_words["word"].tolist()):
    word2idx[word].append(i)

regex = re.compile("(" + "(?!\w)|".join(topics_words["word"].unique()) + "(?!\w))")

In [13]:
topics_vectors = np.zeros((topics_words["topic"].nunique(), topics_words.shape[0]), dtype=int)

for topic, topic_words in topics_words.groupby("topic")["word"].agg(list).items():
    for word in topic_words:
        for idx in word2idx[word]:
            topics_vectors[label2idx[topic], idx] = 1

In [14]:
matrix = np.zeros(topics_vectors.shape)
matrix.shape

(29, 4414)

In [15]:
def fit(vectors, labels):
    for vector, label in zip(vectors, labels):
        for i in range(matrix.shape[0]):
            if i == label2idx[label]:
                matrix[i] += vector
            else:
                matrix[i] -= vector


def test(vectors, labels):
    total = 0
    for vector, label in zip(vectors, labels):
        res = np.argmax(vector @ matrix.T)
        total += label == idx2label[res]
        if label != idx2label[res]:
            print("expected", label, "result", idx2label[res])
    print("accuracy", total / len(labels))

In [16]:
fit(topics_vectors, list(sorted(topics_words["topic"].unique())))
test(topics_vectors, list(sorted(topics_words["topic"].unique())))

accuracy 1.0


In [17]:
topics = []
for text in tqdm(df["cleaned_texts"].tolist()):
    vector = np.zeros(matrix.shape[1])
    words = regex.findall(text)
    if len(words) == 0:
        topics.append(None)
        continue
    for word in words:
        vector[word2idx[word]] = 1
    res = np.argmax(vector @ matrix.T)
    topics.append(idx2label[res])

100%|██████████| 151/151 [00:06<00:00, 22.91it/s]


In [18]:
df["topic_expert_system"] = topics

# Sentiment

## Dostoevsky

In [20]:
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel

tokenizer = RegexTokenizer()
model = FastTextSocialNetworkModel(tokenizer=tokenizer)



In [21]:
results = model.predict(df["cleaned_texts"].tolist())

In [22]:
df = df.join(pd.DataFrame(results))
df.head()

Unnamed: 0,url,text,company,cleaned_texts,Экология,Климат,Энергия,Воздух,Вода,Отходы,...,Профсоюзы и коллективные договоры,Трудовые отношения,max_score,max_topic,topic_expert_system,neutral,negative,positive,skip,speech
0,https://www.finam.ru/publications/item/neftega...,Аналитики «Финама» подготовили стратегию по не...,gazprom,аналитик финама подготовить стратегия нефтегаз...,0.190117,0.338062,0.177705,0.0,0.0,0.197386,...,0.146647,0.233052,0.374166,Отношения потребителями,Инвестиции и капитальные вложения,0.607673,0.217348,0.103759,0.098089,0.007356
0,https://www.finam.ru/publications/item/priobre...,ММК завершила сделку по приобретению шахты им....,mmk,ммк завершить сделка приобретение шахта тихов ...,0.190117,0.338062,0.177705,0.0,0.0,0.197386,...,0.146647,0.233052,0.374166,Отношения потребителями,Эффективность и производительность,0.607673,0.217348,0.103759,0.098089,0.007356
1,https://bonds.finam.ru/news/item/roznichnye-in...,"МОСКВА, 26 июл - РИА Новости/Прайм. Высокий ин...",gazprom,москва июл риа новости/прайм высокий интерес в...,0.0,0.102869,0.0,0.0,0.0,0.0,...,0.0,0.113047,0.248452,Налоги,Дивиденды и акционеры,0.294225,0.140346,0.27514,0.168867,0.012831
1,https://www.finam.ru/publications/item/mmk-zav...,ММК завершила сделку по приобретению шахты име...,mmk,ммк завершить сделка приобретение шахта имя ти...,0.0,0.102869,0.0,0.0,0.0,0.0,...,0.0,0.113047,0.248452,Налоги,Инвестиции и капитальные вложения,0.294225,0.140346,0.27514,0.168867,0.012831
2,https://www.finam.ru/publications/item/v-tsena...,"Цены на газ остаются высокими, хотя плавучие х...",gazprom,цена газ оставаться высокий хотя плавучий хран...,0.15523,0.154303,0.177705,0.154303,0.0,0.113961,...,0.0,0.0,0.2,Отношения потребителями,Безопасность и охрана труда,0.835494,0.187143,0.087574,0.039649,0.00408


## BERT

In [23]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("blanchefort/rubert-base-cased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("blanchefort/rubert-base-cased-sentiment", return_dict=True)


@torch.no_grad()
def predict(text):
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    predicted = torch.nn.functional.softmax(outputs.logits, dim=1)
    # predicted = torch.argmax(predicted, dim=1).numpy()
    return predicted

2023-08-08 16:39:50.710654: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-08 16:39:50.854632: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [29]:
rubert = []
for text in tqdm(df["cleaned_texts"].tolist()):
    rubert.extend(predict(text).tolist())

100%|██████████| 151/151 [01:09<00:00,  2.18it/s]


In [31]:
rubert = pd.DataFrame(rubert, columns=["rubert_neutral", "rubert_positive", "rubert_negative"])

In [32]:
df = df.join(rubert)
df.head()

Unnamed: 0,url,text,company,cleaned_texts,Экология,Климат,Энергия,Воздух,Вода,Отходы,...,max_topic,topic_expert_system,neutral,negative,positive,skip,speech,rubert_neutral,rubert_positive,rubert_negative
0,https://www.finam.ru/publications/item/neftega...,Аналитики «Финама» подготовили стратегию по не...,gazprom,аналитик финама подготовить стратегия нефтегаз...,0.190117,0.338062,0.177705,0.0,0.0,0.197386,...,Отношения потребителями,Инвестиции и капитальные вложения,0.607673,0.217348,0.103759,0.098089,0.007356,0.815578,0.151188,0.033233
0,https://www.finam.ru/publications/item/priobre...,ММК завершила сделку по приобретению шахты им....,mmk,ммк завершить сделка приобретение шахта тихов ...,0.190117,0.338062,0.177705,0.0,0.0,0.197386,...,Отношения потребителями,Эффективность и производительность,0.607673,0.217348,0.103759,0.098089,0.007356,0.815578,0.151188,0.033233
1,https://bonds.finam.ru/news/item/roznichnye-in...,"МОСКВА, 26 июл - РИА Новости/Прайм. Высокий ин...",gazprom,москва июл риа новости/прайм высокий интерес в...,0.0,0.102869,0.0,0.0,0.0,0.0,...,Налоги,Дивиденды и акционеры,0.294225,0.140346,0.27514,0.168867,0.012831,0.82018,0.147689,0.032131
1,https://www.finam.ru/publications/item/mmk-zav...,ММК завершила сделку по приобретению шахты име...,mmk,ммк завершить сделка приобретение шахта имя ти...,0.0,0.102869,0.0,0.0,0.0,0.0,...,Налоги,Инвестиции и капитальные вложения,0.294225,0.140346,0.27514,0.168867,0.012831,0.82018,0.147689,0.032131
2,https://www.finam.ru/publications/item/v-tsena...,"Цены на газ остаются высокими, хотя плавучие х...",gazprom,цена газ оставаться высокий хотя плавучий хран...,0.15523,0.154303,0.177705,0.154303,0.0,0.113961,...,Отношения потребителями,Безопасность и охрана труда,0.835494,0.187143,0.087574,0.039649,0.00408,0.180675,0.067849,0.751475


In [33]:
df.to_csv("precessed_news.csv.zip")