In [52]:
import pandas as pd
import numpy as np
import torch
from time import time, sleep
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')
from scipy.spatial.distance import cdist

import spacy


In [25]:
# !pip install -U pip setuptools wheel
# !pip install -U 'spacy[apple]'
# !pip install --upgrade spacy
# !pip uninstall en_core_web_sm, en_core_web_sm, pt_core_news_sm, ru_core_news_sm
# sleep(1)
# clear_output()

In [26]:
UKR_CHANNELS = [
    'Труха⚡️Украина', 'Лачен пишет', 'Украинская правда. Главное',
    'Вы хотите как на Украине?', 'Борис Філатов', 'RAGNAROCK PRIVET',
    'УНИАН - новости Украины | война с Россией | новини України | війна з Росією',
    'Украина 24/7 Новости | Война | Новини', 'Быть Или',
    'Украина Сейчас: новости, война, Россия'
]

UKR_LETTERS = ['ї', 'є', 'ґ', 'і']

CHEAT_WORDS = [
    '03', '04', '05', '1378', '2022', '3801', '3806', '4149', '4276',
    '4279', '9521', '9842', 'akimapachev', 'amp', 'anna', 'com',
    'daily', 'diza', 'donbass', 'epoddubny', 'https', 'index', 'me',
    'news', 'opersvodki', 'pravda', 'rus', 'rvvoenkor', 'sashakots',
    'ua', 'wargonzo', 'www', 'www pravda', 'мид', 'труха', 'труха украина',
    'украина сейчас', 'pravda com', 'daily news', 'com ua', 'https www',
    'me rvvoenkor', 'rus news', 'ua rus', 'wargonzo наш'
]

In [27]:
def time_decorator(function):
    from time import time
    def inner(*args, **kwargs):
        start = time()
        result = function(*args, **kwargs)
        elapsed_time = round(time() - start, 2)
        output = f'{function.__name__} took {elapsed_time} seconds.'
        print(output)
        return result
    return inner

In [28]:
@time_decorator
def read_data(filename='random_msgs.csv', sep='¶∆',
                  header=None):
    """
    Reads the csv file into 4 columns:
    channel
    date of publication
    message
    ukrainian - 1 if ukrainian channel, 0 - otherwise.
    """
    data = pd.read_csv(filename, sep=sep, header=header)
    data.columns = ['channel', 'date', 'msg']
    data['ukrainian'] = data['channel'].\
        apply(lambda x: 1 if x in UKR_CHANNELS else 0)
    data['ukrainian'] = data['ukrainian'].astype('int8')
    data = data[data['channel'] != 'вечеряємо']
    return data

In [29]:
data = read_data('random_msgs.csv')
data.sample(5)

read_data took 1.16 seconds.


Unnamed: 0,channel,date,msg,ukrainian
7198,Операция Z: Военкоры Русской Весны,2022-04-18 21:52:53+00:00,‼️🇷🇺Бои глазами экипажа БТР морской пехоты в М...,0
30134,Труха⚡️Украина,2022-05-13 19:00:56+00:00,Военная аналитика от Тома Купера. В сегодняшн...,1
36511,Труха⚡️Украина,2022-04-03 19:30:21+00:00,🔞 Первые минуты после обстрела в одном из райо...,1
142021,КОРНИЛОВ,2022-03-10 08:36:31+00:00,"Боже, ЧТО ОНО НЕСЕТ!!!! https://t.me/skabeeva/...",0
108714,СОЛОВЬЁВ,2022-05-24 11:10:22+00:00,🟪 ЛАБИРИНТ КАРНАУХОВА | СОЛОВЬЁВ LIVE Сергей ...,0


In [30]:
@time_decorator
def preprocess(data, remove_ukr_msgs=True, cut_less_than=18):
    """
    This method:
    removes short messages (with less than 18 characters);
    removes messages with ukrainian letters.
    """
    if remove_ukr_msgs:
        for letter in UKR_LETTERS:
            data = data[data['msg'].str.lower().\
                                    str.contains(letter) == False]
    data = data[data['msg'].str.len() > cut_less_than]
    data = data.reset_index(drop=True)
    return data

In [31]:
data = preprocess(data)
data.shape

preprocess took 1.08 seconds.


(138059, 4)

# Russian embeddings

### Spacy

In [32]:
nlp = spacy.load("ru_core_news_md")
spacy.__version__

'3.5.3'

In [33]:
# !python3 -m spacy download ru_core_news_md
# sleep(1)
# clear_output()

In [34]:
nlp.pipe_names

['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [35]:
for pipe in nlp.pipe_names[1:]:
    nlp.remove_pipe(pipe)
nlp.pipe_names

['tok2vec']

In [36]:
doc = nlp('Я тебя люблю но и ненавижу одновременно')
doc.tensor.shape, doc.vector.shape

((7, 96), (300,))

**This cell runs approx 2-3 mins**

In [37]:
start = time()
# vectors = np.array([sent.vector for sent in data['msg'].map(nlp)])
# tensors = torch.from_numpy(vectors)
# torch.save(tensors, 'word_tensors.pt')
tensors = torch.load('word_tensors.pt')
f'{time() - start:.4f}'

'0.0739'

In [38]:
tensors[1234][:10]

tensor([ 4.8718e-02, -1.0175e-01,  3.7081e-02,  4.2260e-02,  5.9182e-02,
         2.4718e-02,  4.4870e-02, -3.5494e-02, -4.1296e-02,  1.6422e-05])

In [39]:
data.iloc[1234]

channel                                     Оперативные сводки
date                                 2022-04-18 05:07:24+00:00
msg          ⚡️ На этих уникальных кадрах очередная украинс...
ukrainian                                                    0
Name: 1234, dtype: object

In [40]:
data.to_csv('processed.csv', index=False)

In [41]:
data = pd.read_csv('processed.csv')
data.shape

(138059, 4)

In [42]:
data.sample(2)

Unnamed: 0,channel,date,msg,ukrainian
103082,СОЛОВЬЁВ,2022-05-13 16:43:03+00:00,📞🇷🇺🇺🇸 13 мая с.г по инициативе американской ст...,0
24425,Лачен пишет,2022-03-04 15:47:23+00:00,В военной администрации Кривого Рога провели б...,1


In [47]:
labels = torch.from_numpy(data['ukrainian'].values)
tensors.shape, labels.shape

(torch.Size([138059, 300]), torch.Size([138059]))

In [48]:
X_train, X_test = tensors[:110_000], tensors[110_000:]
y_train, y_test = labels[:110_000], labels[110_000:]

In [60]:
ukr_centre = X_train[y_train == 1].mean(0).view(1, -1)
rus_centre = X_train[y_train == 0].mean(0).view(1, -1)
ukr_centre[0, :5], rus_centre[0, :5]

(tensor([ 0.0182, -0.1146, -0.0090,  0.0096,  0.0541]),
 tensor([ 0.0204, -0.1096, -0.0073,  0.0082,  0.0596]))

In [62]:
dist_to_ukr = cdist(ukr_centre, X_test, metric='euclidean')[0]
dist_to_rus = cdist(rus_centre, X_test, metric='euclidean')[0]

In [63]:
ukr_pred = np.array([dist_to_ukr < dist_to_rus]).reshape((-1, 1))

In [65]:
ukr_test = np.array(y_test).astype(bool).reshape((-1, 1))
accuracy = round(100 * np.sum(ukr_pred == ukr_test) / len(ukr_test), 2)
accuracy

53.97