# Imports

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from scipy.spatial.distance import cdist
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from pymystem3 import Mystem
import plotly.express as px
from time import time
from IPython.display import clear_output
from sys import getsizeof

import pandas as pd
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, random_split

# Seed for the dedicated torch random generator created on the next line.
SEED = 1003
generator = torch.Generator().manual_seed(SEED)

import warnings
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# General

In [2]:
# Channel titles whose messages get the label ukrainian == 1 in read_data.
UKR_CHANNELS = [
    'Труха⚡️Украина',
    'Лачен пишет',
    'Украинская правда. Главное',
    'Вы хотите как на Украине?',
    'Борис Філатов',
    'RAGNAROCK PRIVET',
    'УНИАН - новости Украины | война с Россией | новини України | війна з Росією',
    'Украина 24/7 Новости | Война | Новини',
    'Быть Или',
    'Украина Сейчас: новости, война, Россия',
]

# Letters used by preprocess to detect (and drop) messages written in Ukrainian.
UKR_LETTERS = [
    'ї',
    'є',
    'ґ',
    'і',
]

# Default vocabulary entries removed by remove_cheat_words (method='manual').
CHEAT_WORDS = [
    # numeric tokens
    '03', '04', '05', '1378', '2022', '3801',
    '3806', '4149', '4276', '4279', '9521', '9842',
    # latin-script tokens
    'akimapachev', 'amp', 'anna', 'com', 'daily', 'diza',
    'donbass', 'epoddubny', 'https', 'index', 'me', 'news',
    'opersvodki', 'pravda', 'rus', 'rvvoenkor', 'sashakots', 'ua',
    'wargonzo', 'www', 'www pravda',
    # cyrillic tokens
    'мид', 'труха', 'труха украина', 'украина сейчас',
    # two-word entries
    'pravda com', 'daily news', 'com ua', 'https www',
    'me rvvoenkor', 'rus news', 'ua rus', 'wargonzo наш',
]

In [3]:
def time_decorator(function):
    """
    Wraps ``function`` so that every call prints how long it took.

    The wrapper forwards all positional and keyword arguments, returns the
    wrapped function's result unchanged and prints
    "<function name> took <seconds> seconds." with the elapsed time rounded
    to two decimals.  ``functools.wraps`` keeps the wrapped function's
    ``__name__`` and docstring on the decorated object.
    """
    from functools import wraps
    from time import time

    @wraps(function)
    def inner(*args, **kwargs):
        start = time()
        result = function(*args, **kwargs)
        elapsed_time = round(time() - start, 2)
        output = f'{function.__name__} took {elapsed_time} seconds.'
        print(output)
        return result
    return inner

# Data Reading

In [4]:
@time_decorator
def read_data(filename='random_msgs.csv', sep='¶∆',
                  header=None):
    """
    Reads the csv file into 4 columns:
    channel
    date of publication
    message
    ukrainian - 1 if ukrainian channel, 0 - otherwise.

    Rows whose channel is 'вечеряємо' are dropped from the returned frame.
    """
    # Separators longer than one character (other than '\s+') are only handled
    # by the python parser; naming it avoids pandas' silent engine fallback.
    needs_python_engine = len(sep) > 1 and sep != r'\s+'
    data = pd.read_csv(filename, sep=sep, header=header,
                       engine='python' if needs_python_engine else None)
    data.columns = ['channel', 'date', 'msg']
    # Vectorised membership test instead of a per-row lambda.
    data['ukrainian'] = data['channel'].isin(UKR_CHANNELS).astype('int8')
    data = data[data['channel'] != 'вечеряємо']
    return data

In [5]:
data = read_data('random_msgs.csv')
# Seeded so the preview shows the same rows on every run.
data.sample(5, random_state=SEED)

read_data took 0.64 seconds.


Unnamed: 0,channel,date,msg,ukrainian
18031,Zvezdanews,2022-03-18 09:35:46+00:00,⚡️Беседу Путина с Шольцем вряд ли можно назват...,0
48869,🇷🇺 МИГ 🌍,2022-02-24 16:48:23+00:00,"Надо называть вещи своими именами - это не ""не...",0
123905,Вы хотите как на Украине?,2022-03-30 14:43:40+00:00,"⚡️В Белом доме сообщили, что Байден проведет т...",1
128482,Kotsnews,2022-04-08 08:31:06+00:00,Хиты деревенских библиотек. “Украина. Хронолог...,0
136038,ZERGULIO🇷🇺,2022-05-05 05:04:36+00:00,"46 лет и такой идиот, трагедия просто https://...",0


In [6]:
# The mean of the 0/1 `ukrainian` flag is the share of Ukrainian-channel rows.
percent_ukr = data['ukrainian'].mean()
percent_rus = 1 - percent_ukr
print("Ukrainian media data percentage:",percent_ukr)
print("Russian media data percentage:",percent_rus)

Ukrainian media data percentage: 0.4295310285326924
Russian media data percentage: 0.5704689714673076


# Preprocessing (removal of Ukrainian language posts, removal of short posts)

In [7]:
@time_decorator
def preprocess(data, remove_ukr_msgs=True, cut_less_than=18):
    """
    Filters the messages and returns a re-indexed frame.

    This method:
    removes short messages (those with at most `cut_less_than` characters);
    removes messages containing any of UKR_LETTERS (case-insensitive) when
    `remove_ukr_msgs` is True.
    Rows whose message is missing are removed as well.
    """
    messages = data['msg']
    # A missing message has length NaN, and NaN > n is False, so it is dropped.
    keep = (messages.str.len() > cut_less_than).to_numpy(dtype=bool)
    if remove_ukr_msgs:
        # Lower-case once and build a single mask instead of re-filtering the
        # whole frame for every letter.
        lowered = messages.str.lower()
        for letter in UKR_LETTERS:
            has_letter = lowered.str.contains(letter, regex=False, na=False)
            keep &= ~has_letter.to_numpy(dtype=bool)
    return data[keep].reset_index(drop=True)

In [8]:
# Rebinds `data` to the filtered frame; the shape shows what is left.
data = preprocess(data)
data.shape

preprocess took 1.04 seconds.


(138059, 4)

In [9]:
# Class balance again, now measured on the preprocessed frame.
percent_ukr = data['ukrainian'].mean()
percent_rus = 1 - percent_ukr
print("Ukrainian media data percentage:",percent_ukr)
print("Russian media data percentage:",percent_rus)

Ukrainian media data percentage: 0.4014515533214061
Russian media data percentage: 0.598548446678594


# Lemmatizing

In [10]:
@time_decorator
def lemmatize(data, *sentences):
    """
    Lemmatizes text with Mystem. Two usages:

    internal - no extra arguments: every message in data['msg'] is replaced
    by its lemmatized form (the frame is modified in place) and the frame is
    returned. Runs about 2.5 minutes.
    outside - one or more sentences passed after `data`: returns the list of
    their lemmatized forms; `data` is not touched.
    """
    mystem = Mystem()

    def preprocess_text(text):
        # Lower-case, lemmatize, then glue the tokens back with single spaces.
        return " ".join(mystem.lemmatize(text.lower()))

    if sentences:
        return [preprocess_text(sentence) for sentence in sentences]

    data['msg'] = data['msg'].apply(preprocess_text)
    return data


In [None]:
# Lemmatizes every message (the lemmatize docstring quotes about 2.5 minutes).
data = lemmatize(data)

# Train, test split
**80, 10, 10**

In [None]:
def train_val_test_split(data, random_state=1, train_size=.8):
    """
    Splits messages, labels and channels into train / validation / test parts.

    scikit-learn's train_test_split is applied twice: first to cut off the
    training part (`train_size` of the rows), then to halve the remainder
    into validation and test.  Both calls use the same `random_state`.

    Returns a 9-tuple: the three message parts, the three `ukrainian` label
    parts as numpy arrays, and the three channel parts.
    """
    first_cut = train_test_split(
        data['msg'], data['ukrainian'], data['channel'],
        random_state=random_state, train_size=train_size
    )
    X_train, X_rest, ukr_train, ukr_rest, channel_train, channel_rest = first_cut

    second_cut = train_test_split(
        X_rest, ukr_rest, channel_rest,
        random_state=random_state, train_size=.5
    )
    X_val, X_test, ukr_val, ukr_test, channel_val, channel_test = second_cut

    messages = (X_train, X_val, X_test)
    labels = (ukr_train.values, ukr_val.values, ukr_test.values)
    channels = (channel_train, channel_val, channel_test)
    return messages + labels + channels

In [None]:
# Default split: train_size=.8, with the remainder halved into validation and test.
X_train, X_val, X_test, ukr_train, ukr_val, ukr_test, channel_train, channel_val, channel_test = train_val_test_split(data)

In [None]:
# Sizes of the train / validation / test message collections.
X_train.shape, X_val.shape, X_test.shape

In [None]:
# NOTE(review): looks like a leftover inspection cell; nothing downstream uses its value.
type(X_train)

# Vectorization

In [None]:
@time_decorator
def vectorize(X_train, ngram_range=(1,1), sublinear_tf=True, binary=False):
    """
    Fits a CountVectorizer -> TfidfTransformer pipeline on X_train.

    Returns the fitted pipeline together with its CountVectorizer step:
    pass the pipeline to tfidf_transform for TF-IDF features, or the
    CountVectorizer to count_transform for raw counts.
    """
    steps = [
        ('vect', CountVectorizer(binary=binary, ngram_range=ngram_range)),
        ('tfidf', TfidfTransformer(sublinear_tf=sublinear_tf)),
    ]
    tfidf = Pipeline(steps).fit(X_train)
    return tfidf, tfidf['vect']

In [None]:
# Unigrams and bigrams, binary term presence, sublinear tf scaling.
tfidf, vect = vectorize(X_train, ngram_range=(1,2), sublinear_tf=True, binary=True)

# Transform

In [None]:
@time_decorator
def tfidf_transform(tfidf, *X):
    """
    Transforms every collection in X with the fitted `tfidf` object.

    Each result is transposed before being returned.
    Returns a list with one matrix per input, in the same order.
    """
    transposed = []
    for documents in X:
        transposed.append(tfidf.transform(documents).T)
    return transposed


In [None]:
# TF-IDF matrices for the three splits (each one is transposed by tfidf_transform).
X_train_tfidf, X_val_tfidf, X_test_tfidf = tfidf_transform(tfidf, X_train, X_val, X_test)
X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape

In [None]:
@time_decorator
def count_transform(vect, *X):
    """
    Transforms every collection in X with the fitted `vect` object.

    Each result is transposed before being returned.
    Returns a list with one matrix per input, in the same order.
    """
    transposed = []
    for documents in X:
        transposed.append(vect.transform(documents).T)
    return transposed

In [None]:
# Count matrices for the three splits (each one is transposed by count_transform).
X_train_count, X_val_count, X_test_count = count_transform(vect, X_train, X_val, X_test)
X_train_count.shape, X_val_count.shape, X_test_count.shape

# Remove cheat words

Pass the TF-IDF matrices below, or the outputs of `count_transform()` if a CountVectorizer model is needed.

In [None]:
@time_decorator
def remove_cheat_words(X_train, X_val, X_test, channel_train, vectorizer, method='manual', freq_pivot=.5,
                           cheat_words=CHEAT_WORDS):
    """
    Removes cheat_words, like channel tags, social media links or
    authors names.

    The three matrices are indexed by term along axis 0; the same rows are
    removed from each of them.

    method == 'manual': removes the terms of `vectorizer` listed in
    `cheat_words`.
    any other method: removes every term that occurs in more than
    `freq_pivot` of the training documents of at least one channel
    (`channel_train` holds one channel per training document).

    Returns X_train, X_val, X_test without the removed rows, plus the boolean
    mask over terms (True = removed).
    """
    if method == 'manual':
        feature_names = np.asarray(vectorizer.get_feature_names_out())
        delete_mask = np.isin(feature_names, cheat_words)
    else:
        delete_mask = np.zeros(X_train.shape[0], dtype=bool)
        docs_by_terms = X_train.T
        # A plain array keeps the boolean row selection positional.
        channels = np.asarray(channel_train)
        for channel in pd.unique(channels):
            arr = docs_by_terms[channels == channel]
            # Share of this channel's documents that contain each term.
            doc_share = np.asarray((arr > 0).sum(axis=0)).ravel() / arr.shape[0]
            delete_mask |= doc_share > freq_pivot

    keep_mask = ~delete_mask
    X_train = X_train.T[:, keep_mask].T
    X_val = X_val.T[:, keep_mask].T
    X_test = X_test.T[:, keep_mask].T

    return X_train, X_val, X_test, delete_mask

In [None]:
# NOTE(review): this rebinds X_train / X_val / X_test from the raw messages to the
# filtered TF-IDF matrices, so the earlier vectorize/transform cells cannot be
# re-run after this one without redoing the split.
X_train, X_val, X_test, mask = remove_cheat_words(X_train_tfidf, X_val_tfidf, X_test_tfidf, channel_train, tfidf)

In [None]:
# Shapes after the cheat-word rows were removed.
X_train.shape, X_val.shape, X_test.shape

# LSA

Now, let's say we want to predict whether a sentence comes from Ukrainian or Russian social media.

In [25]:
# Example sentence to classify (the backslash continues the string literal on the next line).
sentence = 'Все пленные с "Азовстали" содержатся в ДНР, \
их будет судить трибунал на территории республики — глава ДНР Денис Пушилин.'

First, we need to lemmatize and transform our sentence.

In [26]:
# Sentence mode of lemmatize: no DataFrame is needed, hence None.
lemmatized = lemmatize(None, sentence)
vectorizer = tfidf
# Vectorize with the fitted pipeline, drop the columns flagged in `mask`, then transpose.
transformed = vectorizer.transform(lemmatized)[:, ~mask].asfptype().T
transformed.shape

lemmatize took 0.58 seconds.


(1433997, 1)

Be careful: this cell takes roughly 1 to 60 minutes to run, depending on k.

In [27]:
@time_decorator
def train_LSA(X_train, ukr_train, k=150):
    """
    Computes a rank-k truncated SVD of X_train and one mean vector per class.

    The columns of the third factor are split by label (`ukr_train` equal to
    1 or 0) and averaged, giving a (1, k) centre for each class.

    Returns Terms, S, Documents (the three factors from svds), followed by
    ukr_centre and rus_centre.
    """
    Terms, S, Documents = svds(X_train, k=k)
    doc_vectors = Documents.T
    is_ukr = ukr_train == 1
    is_rus = ukr_train == 0
    ukr_centre = doc_vectors[is_ukr].mean(axis=0, keepdims=True)
    rus_centre = doc_vectors[is_rus].mean(axis=0, keepdims=True)
    return Terms, S, Documents, ukr_centre, rus_centre

In [28]:
# Number of singular values/vectors to keep; Model also reads K for its input width.
K = 300
Terms, S, Documents, ukr_centre, rus_centre = train_LSA(X_train, ukr_train, k=K)

train_LSA took 127.5 seconds.


In [29]:
# Shapes of the three SVD factors.
Terms.shape, S.shape, Documents.shape

((1433997, 300), (300,), (300, 110447))

# Neural Net with PCA

In [30]:
class DataSet(Dataset):
    """
    Torch dataset of documents projected through the truncated-SVD factors.

    X is a term-by-document matrix and y the per-document labels.  Every
    document column x is mapped to diag(1 / S) @ Terms.T @ x, and the results
    are stored as the rows of `self.X`; `self.y` holds the labels as floats.

    `terms` and `singular_values` default to the notebook-level `Terms` and
    `S`; pass them explicitly to avoid relying on those globals.
    """

    @time_decorator
    def __init__(self, X, y, terms=None, singular_values=None):
        if terms is None:
            terms = Terms
        if singular_values is None:
            singular_values = S
        # (X.T @ terms) / s is the transpose of diag(1 / s) @ terms.T @ X, but
        # it never materialises the k-by-k diagonal nor the dense
        # (k x n_terms) product that the diagonal form needs.  Values can
        # differ from that form in the last floating-point digits.
        projected = np.asarray(X.T @ terms) / np.asarray(singular_values)
        self.X = torch.from_numpy(projected)
        self.y = torch.from_numpy(y).float()

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [31]:
# Project each split with the factors computed by train_LSA and pair it with its labels.
train_dataset = DataSet(X_train, ukr_train)
val_dataset = DataSet(X_val, ukr_val)
test_dataset = DataSet(X_test, ukr_test)

__init__ took 130.5 seconds.
__init__ took 178.94 seconds.
__init__ took 128.45 seconds.


In [32]:
# Only the training batches need shuffling; drawing the order from the seeded
# `generator` makes it reproducible.  Validation and test keep a fixed order so
# their batch-averaged losses do not change from run to run.
train = DataLoader(train_dataset, batch_size=1_000, shuffle=True, generator=generator)
val = DataLoader(val_dataset, batch_size=1_000, shuffle=False)
test = DataLoader(test_dataset, batch_size=1_000, shuffle=False)

In [33]:
class Model(nn.Module):
    """
    Three-layer fully connected binary classifier.

    Architecture: Linear(in_features, 64) -> ReLU -> BatchNorm -> Dropout ->
    Linear(64, 16) -> ReLU -> BatchNorm -> Dropout -> Linear(16, 1) -> sigmoid.

    dropout: drop probability shared by both dropout applications.
    in_features: width of the input vectors; defaults to the notebook-level
    constant `K` when left as None.
    """

    def __init__(self, dropout=.25, in_features=None):
        super().__init__()
        if in_features is None:
            in_features = K
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(in_features, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 16)
        self.bn2 = nn.BatchNorm1d(16)
        self.fc3 = nn.Linear(16, 1)

    def forward(self, x):
        """Returns a 1-D tensor with one probability per input row."""
        # Inputs are cast to float32 to match the layer weights.
        x = F.relu(self.fc1(x.float()))
        x = self.bn1(x)
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.bn2(x)
        x = self.dropout(x)
        # torch.sigmoid replaces the deprecated F.sigmoid alias.
        x = torch.sigmoid(self.fc3(x)).view(-1)
        return x

In [34]:
# Seed torch's global RNG before the weights are drawn so the initial model
# is the same on every run.
torch.manual_seed(SEED)
model = Model()
L = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), weight_decay=1e-5)

In [35]:
def train_one_epoch(epoch_index):
    """
    Runs one optimisation pass over the notebook-level `train` loader.

    Uses the module-level `model`, loss `L` and `optimizer`.  `epoch_index`
    is accepted for the caller's convenience and is not used.

    Returns the mean of the per-batch losses.
    Raises ValueError if the loader yields no batches (previously this
    surfaced as an unbound-variable error on the return line).
    """
    train_loss = 0.
    n_batches = 0

    for x, labels in train:
        optimizer.zero_grad()
        loss = L(model(x), labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        raise ValueError('train loader produced no batches')
    return train_loss / n_batches

In [36]:
EPOCHS = 50
# Report roughly ten times per run; never 0, so the modulo below is safe for EPOCHS < 10.
REPORT_EVERY = max(1, EPOCHS // 10)

best_vloss = 1_000_000.

for epoch in range(EPOCHS):

    model.train()
    train_loss = train_one_epoch(epoch)

    model.eval()
    val_loss = 0.0
    # No gradients are needed for evaluation; skipping them avoids building
    # autograd graphs for every validation batch.
    with torch.no_grad():
        for i, (x, labels) in enumerate(val):
            out = model(x)
            vloss = L(out, labels)
            val_loss += vloss.item()

        # Predictions for the whole validation set, used for the accuracy line.
        x, labels = val_dataset[:]
        out = model(x)

    # Lowest batch-averaged validation loss seen so far.
    best_vloss = min(best_vloss, val_loss / (i + 1))

    if epoch % REPORT_EVERY == 0:
        print('EPOCH {}:'.format(epoch + 1))
        print(f'{100 * (out.round() == labels).float().mean().item():.1f}%')
        print('LOSS train {:.3f} valid {:.3f}'.format(train_loss, val_loss / (i + 1)))

EPOCH 1:
82.7%
LOSS train 0.500 valid 0.403
EPOCH 6:
84.0%
LOSS train 0.353 valid 0.341
EPOCH 11:
84.0%
LOSS train 0.341 valid 0.345
EPOCH 16:
84.6%
LOSS train 0.336 valid 0.331
EPOCH 21:
84.8%
LOSS train 0.332 valid 0.331
EPOCH 26:
85.0%
LOSS train 0.328 valid 0.326
EPOCH 31:
85.1%
LOSS train 0.325 valid 0.325
EPOCH 36:
84.9%
LOSS train 0.325 valid 0.328
EPOCH 41:
85.0%
LOSS train 0.323 valid 0.328
EPOCH 46:
85.2%
LOSS train 0.322 valid 0.325


In [37]:
# Persist only the learned parameters (state_dict), not the Model object itself.
torch.save(model.state_dict(), 'DL_model_on_TF-IDF_PCA.pt')

In [None]:
# NOTE(review): the train / val / test DataLoaders were built from these datasets and
# presumably still reference them, so this may not release the memory - confirm.
del train_dataset, val_dataset, test_dataset