## Initialisation (must be run at the start of every session)

###### ❗Install packages

In [1]:
!pip install openpyxl --upgrade --ignore-installed openpyxl
import pandas as pd
import numpy as np
from google.colab import drive
import torch

# import zipfile

# for text preprocessing
import re
from IPython.core.display import clear_output

# for formatting
from datetime import datetime, timedelta

# for visualization
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openpyxl
  Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 4.5 MB/s 
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10


In [4]:
# Press-release table; the 'Unnamed: 0' column (presumably a saved index — confirm) is discarded.
previews = pd.read_excel('cbr_previews.xlsx').drop(columns=['Unnamed: 0'])

# RBC news table: discard 'Unnamed: 0', remove rows containing any missing value,
# then renumber the remaining rows from 0 so positional and label indexing agree.
X_rbc = (
    pd.read_excel('only_rbc_news_upd16042022.xlsx')
    .drop(columns=['Unnamed: 0'])
    .dropna()
    .reset_index(drop=True)
)

### Press release text preprocessing

In [6]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources this notebook relies on: the stop-word corpus and
# the 'punkt' data (presumably required by word_tokenize — confirm).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# NLTK's Russian stop-word list extended with two extra words ("также" = "also", "это" = "this").
russian_stopwords = stopwords.words("russian") + ['также', 'это']

# Hide the download log from the cell output.
clear_output()

def delete_stop_words(text):
    """Remove stop words from ``text``.

    The text is split with ``word_tokenize``; every token that appears in the
    module-level ``russian_stopwords`` list is dropped, and the remaining tokens
    are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in russian_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [7]:
def _extract_release_body(text, start_marker="годовых", end_marker="Следующее ", start_skip=9):
    """Return the part of a press release between ``start_marker`` and ``end_marker``.

    The slice starts ``start_skip`` characters after the beginning of the first
    occurrence of ``start_marker`` ("годовых" = "per annum") and stops right before
    the first occurrence of ``end_marker`` ("Следующее " = "Next ").
    NOTE(review): presumably this skips the opening sentence that announces the rate
    and cuts the closing notice about the next meeting — confirm against the data.

    ``str.find`` returns -1 when a marker is absent. Previously that value was used
    directly, so a missing start marker silently began the slice at character 8 and
    a missing end marker silently dropped the last character. Now a missing start
    marker keeps the text from its beginning and a missing end marker keeps it to
    its end.
    """
    start_pos = text.find(start_marker)
    start = start_pos + start_skip if start_pos != -1 else 0

    end = text.find(end_marker)
    if end == -1:
        end = len(text)

    return text[start:end]


# Normalise each press release: keep only the body, lowercase it, strip punctuation
# and digits, trim surrounding whitespace, and finally remove stop words.
previews['preprocessed'] = [
    delete_stop_words(
        re.sub(r'[^\w\s]+|[\d]+', r'', _extract_release_body(release_text).lower()).strip()
    )
    for release_text in previews['text']
]

### News text preprocessing

In [9]:
# Normalise every news text the same way: lowercase, remove punctuation and digits,
# trim surrounding whitespace, then drop stop words.
X_rbc['preprocessed'] = [
    delete_stop_words(re.sub(r'[^\w\s]+|[\d]+', r'', news_text.lower()).strip())
    for news_text in X_rbc['text']
]

### Sbert embeddings

In [10]:
!pip install transformers
from transformers import AutoTokenizer, AutoModel
import torch
import time


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, ignoring masked-out positions.

    Args:
        model_output: indexable model result whose element 0 holds the token embeddings.
        attention_mask: mask tensor; it gets a trailing dimension added and is expanded
            to the token-embedding shape, so positions with 0 contribute nothing.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask sum; the divisor is
        clamped to at least 1e-9 so a fully masked row does not divide by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

# Load the tokenizer and model from the Hugging Face model repository
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
clear_output()

# Tokenize the press-release texts
# NOTE(review): max_length=24 truncates every text to at most 24 tokens — confirm this is intended.
start_time = time.time()
encoded_input_previews = tokenizer(previews['preprocessed'].tolist(), padding=True, truncation=True, max_length=24, return_tensors='pt')

# Compute token embeddings (inference only, so gradients are disabled)
with torch.no_grad():
    model_output_previews = model(**encoded_input_previews)

# Pool token embeddings into one vector per text (attention-mask-aware mean)
previews_embeddings = mean_pooling(model_output_previews, encoded_input_previews['attention_mask'])
print(f'Time to transform previews: {(time.time() - start_time)} sec')


# The same for news
start_time = time.time()
encoded_news = tokenizer(X_rbc['preprocessed'].tolist(), padding=True, truncation=True, max_length=24, return_tensors='pt')
with torch.no_grad():
    model_output_news = model(**encoded_news)
news_embeddings = mean_pooling(model_output_news, encoded_news['attention_mask'])
print(f'Time to transform news: {(time.time() - start_time)} sec')

# One embedding tensor per news row
X_rbc['sbert'] = list(news_embeddings)

# Averaging all news for each press release.
# Embeddings are first grouped by 'preview_index' and stacked once per group, instead of
# growing a tensor with torch.cat on every row.
news_by_preview = {}
for preview_idx, embedding in zip(X_rbc['preview_index'], news_embeddings):
    news_by_preview.setdefault(preview_idx, []).append(embedding)

# Every index present in the data is averaged; the count is no longer hard-coded to 68.
dictionary = {idx: torch.stack(embs).mean(dim=0) for idx, embs in news_by_preview.items()}

# Fail with a clear message if some press release has no news attached to it.
missing = [i for i in range(len(previews)) if i not in dictionary]
if missing:
    raise ValueError(f'No news embeddings found for preview indices: {missing}')

# adding the averaged news embeddings to the previews DataFrame - for easy access
avg_news = [dictionary[i] for i in range(len(previews))]
previews['avg_news_sbert'] = avg_news
news_embeddings_avg = torch.stack(avg_news, dim=0)

# saving the embeddings for further use
# NOTE(review): '1306022' looks like a typo for '13062022' (compare the news file name);
# left unchanged because other code may load the file under this exact name.
torch.save(previews_embeddings, 'cb_1306022_sbert.pt')
torch.save(news_embeddings_avg, 'news_rbc_13062022_sbert.pt')

Time to transform previews: 18.33488178253174 sec
Time to transform news: 535.8603291511536 sec


### DeepPavlov

In [12]:
# Load the tokenizer and model from the Hugging Face model repository
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
clear_output()

# Tokenize the press-release texts
# NOTE(review): max_length=24 truncates every text to at most 24 tokens — confirm this is intended.
start_time = time.time()
encoded_input_previews = tokenizer(previews['preprocessed'].tolist(), padding=True, truncation=True, max_length=24, return_tensors='pt')

# Compute token embeddings (inference only, so gradients are disabled)
with torch.no_grad():
    model_output_previews = model(**encoded_input_previews)

# Pool token embeddings into one vector per text (attention-mask-aware mean)
previews_embeddings = mean_pooling(model_output_previews, encoded_input_previews['attention_mask'])
print(f'Time to transform previews: {(time.time() - start_time)} sec')


# The same for news
start_time = time.time()
encoded_news = tokenizer(X_rbc['preprocessed'].tolist(), padding=True, truncation=True, max_length=24, return_tensors='pt')
with torch.no_grad():
    model_output_news = model(**encoded_news)
news_embeddings = mean_pooling(model_output_news, encoded_news['attention_mask'])
print(f'Time to transform news: {(time.time() - start_time)} sec')

# One embedding tensor per news row
X_rbc['deeppavlov'] = list(news_embeddings)

# Averaging all news for each press release.
# Embeddings are first grouped by 'preview_index' and stacked once per group, instead of
# growing a tensor with torch.cat on every row.
news_by_preview = {}
for preview_idx, embedding in zip(X_rbc['preview_index'], news_embeddings):
    news_by_preview.setdefault(preview_idx, []).append(embedding)

# Every index present in the data is averaged; the count is no longer hard-coded to 68.
dictionary = {idx: torch.stack(embs).mean(dim=0) for idx, embs in news_by_preview.items()}

# Fail with a clear message if some press release has no news attached to it.
missing = [i for i in range(len(previews)) if i not in dictionary]
if missing:
    raise ValueError(f'No news embeddings found for preview indices: {missing}')

# adding the averaged news embeddings to the previews DataFrame - for easy access
avg_news = [dictionary[i] for i in range(len(previews))]
previews['avg_news_deeppavlov'] = avg_news
news_embeddings_avg = torch.stack(avg_news, dim=0)

# saving the embeddings for further use
# NOTE(review): '1306022' looks like a typo for '13062022' (compare the news file name);
# left unchanged because other code may load the file under this exact name.
torch.save(previews_embeddings, 'cb_1306022_deeppavlov.pt')
torch.save(news_embeddings_avg, 'news_rbc_13062022_deeppavlov.pt')

Time to transform previews: 7.568722486495972 sec
Time to transform news: 157.73328828811646 sec
