In [1]:
import re
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px


import nltk
import pymorphy2
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import sklearn.preprocessing as preprocessing

import warnings 
warnings.filterwarnings("ignore")

In [2]:
import plotly.io as pio
# Use the "notebook" renderer as the default for every Plotly figure shown below.
pio.renderers.default = "notebook"

# Intro

We explore our propaganda dataset in more depth by applying `TF-IDF` vectorization to the message texts.

In [2]:
# Path of the raw messages export, relative to the notebook.
PATH = r"data/data.csv"

# Load the export and discard the columns that are not used in this notebook.
data = pd.read_csv(PATH).drop(
    columns=["Unnamed: 0", "date", "reactions", "to_id", "msg_entity"]
)

In [3]:
# Work on a copy so the loaded `data` frame is left as it was read.
_data = data.copy()
_data["datetime"] = pd.to_datetime(_data["datetime"])

# Rows without message text are kept aside in `na_messages` and removed from `_data`.
na_messages = _data.loc[_data["message"].isna()]
_data = _data.drop(index=na_messages.index)

In [4]:
# Preview the first rows after the rows with missing message text were removed.
_data.head()

Unnamed: 0,id,views,fwd_from,message,type,duration,channel,frw_from_title,frw_from_name,datetime,message_len,reactions_dict,reactions_num,_from_id,_to_id,sensitive-topic,toxicity
0,189123.0,98413.0,,–§–¢–° –†–æ—Å—Å–∏–∏ –æ–∂–∏–¥–∞–µ—Ç —Ä–æ—Å—Ç–∞ —Ç–æ–≤–∞—Ä–æ–æ–±–æ—Ä–æ—Ç–∞ —Å –ö–∏—Ç–∞–µ...,text,,rian_ru,,,2022-12-19 09:56:04+00:00,205,[],0,,1101170442,politics,neutral
4,189119.0,118174.0,,–ë—É—ç–Ω–æ—Å-–ê–π—Ä–µ—Å –Ω–∞—É—Ç—Ä–æ –ø–æ—Å–ª–µ –ø—Ä–∞–∑–¥–Ω–∏–∫–∞,video,10.0,rian_ru,,,2022-12-19 09:51:57+00:00,35,[],0,,1101170442,none,neutral
6,189117.0,224975.0,,"–í –°–ö —Å–æ–æ–±—â–∏–ª–∏, —á—Ç–æ –∂–∏–∑–Ω–∏ —Ä–∞–±–æ—á–∏—Ö, –ø–æ—Å—Ç—Ä–∞–¥–∞–≤—à–∏—Ö...",photo,,rian_ru,,,2022-12-19 09:10:44+00:00,141,[],0,,1101170442,offline_crime,neutral
7,189116.0,226171.0,,–°–∞–º–æ–ª–µ—Ç —Å –ø–æ—Å—Ç—Ä–∞–¥–∞–≤—à–∏–º –ø—Ä–∏ –ø–æ–∫—É—à–µ–Ω–∏–∏ –≥–ª–∞–≤–æ–π –†—É...,video,30.0,rian_ru,,,2022-12-19 09:09:39+00:00,116,[],0,,1101170442,"offline_crime,politics",neutral
8,189115.0,256663.0,,–ù–æ—Ä–≤–µ–∂—Å–∫–∏–π –∫–æ—Ä–æ–ª—å –•–∞—Ä–∞–ª—å–¥ V (85 –ª–µ—Ç) –≥–æ—Å–ø–∏—Ç–∞–ª–∏...,text,,rian_ru,,,2022-12-19 08:50:57+00:00,116,[],0,,1101170442,health_shaming,neutral


In [5]:
# (rows, columns) of the frame after dropping rows with missing message text.
_data.shape

(7016265, 17)

In [6]:
# Only the timestamp and the text are needed for the text analysis below.
messages = _data.loc[:, ["datetime", "message"]]
messages.head()

Unnamed: 0,datetime,message
0,2022-12-19 09:56:04+00:00,–§–¢–° –†–æ—Å—Å–∏–∏ –æ–∂–∏–¥–∞–µ—Ç —Ä–æ—Å—Ç–∞ —Ç–æ–≤–∞—Ä–æ–æ–±–æ—Ä–æ—Ç–∞ —Å –ö–∏—Ç–∞–µ...
4,2022-12-19 09:51:57+00:00,–ë—É—ç–Ω–æ—Å-–ê–π—Ä–µ—Å –Ω–∞—É—Ç—Ä–æ –ø–æ—Å–ª–µ –ø—Ä–∞–∑–¥–Ω–∏–∫–∞
6,2022-12-19 09:10:44+00:00,"–í –°–ö —Å–æ–æ–±—â–∏–ª–∏, —á—Ç–æ –∂–∏–∑–Ω–∏ —Ä–∞–±–æ—á–∏—Ö, –ø–æ—Å—Ç—Ä–∞–¥–∞–≤—à–∏—Ö..."
7,2022-12-19 09:09:39+00:00,–°–∞–º–æ–ª–µ—Ç —Å –ø–æ—Å—Ç—Ä–∞–¥–∞–≤—à–∏–º –ø—Ä–∏ –ø–æ–∫—É—à–µ–Ω–∏–∏ –≥–ª–∞–≤–æ–π –†—É...
8,2022-12-19 08:50:57+00:00,–ù–æ—Ä–≤–µ–∂—Å–∫–∏–π –∫–æ—Ä–æ–ª—å –•–∞—Ä–∞–ª—å–¥ V (85 –ª–µ—Ç) –≥–æ—Å–ø–∏—Ç–∞–ª–∏...


In [7]:
def to_vector_preprocessing(text, stop_words=None):
    """Normalize one message into a space-separated string of filtered tokens.

    The text is lower-cased, every non-word character, whitespace character
    and digit is replaced by a space, the result is tokenized with
    ``word_tokenize``, and tokens that are shorter than three characters or
    contained in ``stop_words`` are dropped.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Tokens to discard. When omitted or empty, NLTK's English stop-word
        list is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string when no
        token survives).
    """
    if not stop_words:
        stop_words = stopwords.words("english")
    # A list/tuple makes every `in` test a linear scan; hash it once instead.
    if isinstance(stop_words, (list, tuple)):
        stop_words = frozenset(stop_words)
    # Raw string: "\W", "\s" and "\d" are regex classes, not Python escapes.
    cleaned = re.sub(r'[\W\s\d]', ' ', text.lower())
    return ' '.join(
        token for token in word_tokenize(cleaned)
        if len(token) > 2 and token not in stop_words
    )

In [8]:
def tfidf_vectorizer(_corpus):
    """Fit a fresh ``TfidfVectorizer`` on ``_corpus`` and return its weights.

    Parameters
    ----------
    _corpus : iterable of str
        Documents to vectorize.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term. The frame
        is dense, so its size grows with documents x vocabulary.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(_corpus)
    # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    # `toarray()` yields a plain ndarray rather than the legacy `numpy.matrix`.
    return pd.DataFrame(weights.toarray(), columns=terms)

In [9]:
stop_words = [
    "–∏", "–≤", "–≤–æ", "–Ω–µ", "—á—Ç–æ", "–æ–Ω", "–Ω–∞", "—è", "—Å", "—Å–æ", "–∫–∞–∫", "–∞", "—Ç–æ", "–≤—Å–µ", "–æ–Ω–∞", "—Ç–∞–∫", "–µ–≥–æ", 
    "–Ω–æ", "–¥–∞", "—Ç—ã", "–∫", "—É", "–∂–µ", "–≤—ã", "–∑–∞", "–±—ã", "–ø–æ", "—Ç–æ–ª—å–∫–æ", "–µ–µ", "–º–Ω–µ", "–±—ã–ª–æ", "–≤–æ—Ç", "–æ—Ç", 
    "–º–µ–Ω—è", "–µ—â–µ", "–Ω–µ—Ç", "–æ", "–∏–∑", "–µ–º—É", "—Ç–µ–ø–µ—Ä—å", "–∫–æ–≥–¥–∞", "–¥–∞–∂–µ", "–Ω—É", "–≤–¥—Ä—É–≥", "–ª–∏", "–µ—Å–ª–∏", "—É–∂–µ", 
    "–∏–ª–∏", "–Ω–∏", "–±—ã—Ç—å", "–±—ã–ª", "–Ω–µ–≥–æ", "–¥–æ", "–≤–∞—Å", "–Ω–∏–±—É–¥—å", "–æ–ø—è—Ç—å", "—É–∂", "–≤–∞–º", "–≤–µ–¥—å", "—Ç–∞–º", "–ø–æ—Ç–æ–º", 
    "—Å–µ–±—è", "–Ω–∏—á–µ–≥–æ", "–µ–π", "–º–æ–∂–µ—Ç", "–æ–Ω–∏", "—Ç—É—Ç", "–≥–¥–µ", "–µ—Å—Ç—å", "–Ω–∞–¥–æ", "–Ω–µ–π", "–¥–ª—è", "–º—ã", "—Ç–µ–±—è", "–∏—Ö", 
    "—á–µ–º", "–±—ã–ª–∞", "—Å–∞–º", "—á—Ç–æ–±", "–±–µ–∑", "–±—É–¥—Ç–æ", "—á–µ–≥–æ", "—Ä–∞–∑", "—Ç–æ–∂–µ", "—Å–µ–±–µ", "–ø–æ–¥", "–∂–∏–∑–Ω—å", "–≤–ø—Ä–æ—á–µ–º", 
    "—Ö–æ—Ä–æ—à–æ", "–≤—Å—é", "—ç—Ç–∏", "—Ç–æ–≥–¥–∞", "–±—ã–ª–∏", "—Ç–∞", "–±—ã–≤–∞–µ—Ç", "–ª—É—á—à–µ", "—ç—Ç–æ", "http"]
# Add NLTK's built-in stop words for every configured language to the manual list.
langs = ['russian']
for lang in langs:
    stop_words.extend(stopwords.words(lang))
# A set removes duplicates and gives constant-time membership tests.
stop_words = {word for word in stop_words}

Because the dataset contains a large amount of text, we first time the preprocessing on a small batch and then work with a sample
(`frac=0.1`) because of limited computational resources.

In [10]:
%%time
n_test = 1000 
processed_text = messages["message"][:n_test].map(lambda row: to_vector_preprocessing(row, stop_words))

Wall time: 204 ms


In [11]:
# NOTE: `map(len)` returns the length of each message string, so the total printed here
# is a character count even though the label says "words".
print(f"The number of words in first {n_test} messages:\t {sum(messages['message'][:n_test].map(len))}")

The number of words in first 1000 messages:	 199967


Estimated preprocessing time for the sample (sample length × 204 ms / (test-batch length × 60000 ms per minute)):

    287559504 x 204 / (199967 * 60000) ≈ 4.89 min

In [12]:
# Draw a 10% sample; the fixed seed makes the sample (and every result derived
# from it) reproducible after a kernel restart.
messages_10_per = messages.sample(frac=0.1, random_state=42)
messages_10_per.head()

Unnamed: 0,datetime,message
1449181,2022-07-26 16:47:16+00:00,–ü–æ–ª—å—Å–∫–∏–π —Ü–µ–Ω—Ç—Ä —è–∑—ã–∫–∞ –ø—Ä–æ–∫–æ–º–º–µ–Ω—Ç–∏—Ä–æ–≤–∞–ª –º–Ω–æ–≥–æ—á–∏—Å...
6691776,2022-09-22 14:54:28+00:00,"–ß–µ—Å—Ç–Ω–æ –≥–æ–≤–æ—Ä—è, –Ω–µ –æ–∂–∏–¥–∞–ª —Ç–∞–∫–æ–≥–æ —Ä–∞–∑–≤–æ—Ä–æ—Ç–∞ —Å–æ–±—ã..."
2519768,2018-07-16 15:24:45+00:00,"–¢—Ä–∞–º–ø: ""–Ø –ø—Ä–æ–¥–æ–ª–∂–∞—é —Ç—Ä–∞–¥–∏—Ü–∏–∏ –¥–∏–ø–ª–æ–º–∞—Ç–∏—á–µ—Å–∫–æ–≥–æ ..."
4316816,2021-02-16 09:03:52+00:00,üî´ –ü–æ–¥—Ä–æ—Å—Ç–∫–∏ —Ä–µ—à–∏–ª–∏ –ø–æ—Å—Ç—Ä–µ–ª—è—Ç—å –∏–∑ –ø–∏—Å—Ç–æ–ª–µ—Ç–∞ —Å—Ä–µ...
7203281,2020-03-05 05:41:47+00:00,"–ë–µ–∑–æ—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω–æ —Å–∞–º–æ–≥–æ —Å–∞–π—Ç–∞, –≤–µ—Å—å–º–∞ –ø–æ—É—á–∏—Ç–µ–ª—å..."


In [13]:
# (rows, columns) of the 10% sample.
messages_10_per.shape

(701626, 2)

In [14]:
# Put the sampled messages in chronological order.
messages_10_per_1 = messages_10_per.sort_values("datetime")
messages_10_per_1.head()

Unnamed: 0,datetime,message
5282279,2015-09-24 19:35:13+00:00,–ú–Ω–µ –Ω—É–∂–Ω–∞ –≤–∏–∑—É–∞–ª–∏–∑–∞—Ü–∏—è –≤–∞—à–µ–π –ª—é–±–≤–∏
1920194,2015-09-29 05:34:30+00:00,–ü—Ä–æ–±–ª–µ–º—ã Wi-Fi –≤ –º–æ—Å–∫–æ–≤—Å–∫–æ–º –º–µ—Ç—Ä–æ —Å—Ç–∞–ª–∏ —Ç–µ–º–æ–π ...
1920193,2015-09-29 16:09:16+00:00,–ü–µ—Ä–≤—ã–π –∏—Å–∫ –≤ –ú–æ—Å–≥–æ—Ä—Å—É–¥ –æ –ø–æ–∂–∏–∑–Ω–µ–Ω–Ω–æ–π –±–ª–æ–∫–∏—Ä–æ–≤–∫...
1920190,2015-09-30 05:33:00+00:00,–†–∞—Å—Å–º–æ—Ç—Ä–µ–Ω–∏–µ –¥–µ–ª –æ–± —ç–∫—Å—Ç—Ä–µ–º–∏–∑–º–µ –ø–æ–¥–Ω–∏–º—É—Ç –Ω–∞ –≤—ã...
1920188,2015-09-30 07:44:23+00:00,–ü—É—Ç–∏–Ω –ø–æ–ø—Ä–æ—Å–∏–ª –°–æ–≤—Ñ–µ–¥ —Ä–∞–∑—Ä–µ—à–∏—Ç—å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ ...


In [15]:
# NOTE: `map(len)` returns the length of each message string, so the total printed here
# is a character count even though the label says "words".
print(f"The number of words in sampled data:\t {sum(messages_10_per_1['message'].map(len))}")

The number of words in sampled data:	 287559504


In [16]:
%%time
processed_mes = messages_10_per_1["message"].map(lambda row: to_vector_preprocessing(row, stop_words))

Wall time: 3min 22s


In [17]:
# Turn the processed texts into a one-column frame indexed by message timestamp.
processed_mes_1 = processed_mes.to_frame().set_index(messages_10_per_1["datetime"])
processed_mes_1

Unnamed: 0_level_0,message
datetime,Unnamed: 1_level_1
2015-09-24 19:35:13+00:00,–Ω—É–∂–Ω–∞ –≤–∏–∑—É–∞–ª–∏–∑–∞—Ü–∏—è –≤–∞—à–µ–π –ª—é–±–≤–∏
2015-09-29 05:34:30+00:00,–ø—Ä–æ–±–ª–µ–º—ã –º–æ—Å–∫–æ–≤—Å–∫–æ–º –º–µ—Ç—Ä–æ —Å—Ç–∞–ª–∏ —Ç–µ–º–æ–π –∏–≥—Ä—ã izv...
2015-09-29 16:09:16+00:00,–ø–µ—Ä–≤—ã–π –∏—Å–∫ –º–æ—Å–≥–æ—Ä—Å—É–¥ –ø–æ–∂–∏–∑–Ω–µ–Ω–Ω–æ–π –±–ª–æ–∫–∏—Ä–æ–≤–∫–µ –ø–æ...
2015-09-30 05:33:00+00:00,—Ä–∞—Å—Å–º–æ—Ç—Ä–µ–Ω–∏–µ –¥–µ–ª —ç–∫—Å—Ç—Ä–µ–º–∏–∑–º–µ –ø–æ–¥–Ω–∏–º—É—Ç –≤—ã—Å—à–∏–π —É...
2015-09-30 07:44:23+00:00,–ø—É—Ç–∏–Ω –ø–æ–ø—Ä–æ—Å–∏–ª —Å–æ–≤—Ñ–µ–¥ —Ä–∞–∑—Ä–µ—à–∏—Ç—å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ ...
...,...
2022-12-26 08:06:48+00:00,–ø–æ–∑–¥—Ä–∞–≤–ª–µ–Ω–∏–µ –¥–µ–¥–∞ –º–æ—Ä–æ–∑–∞ —Å–Ω–µ–≥—É—Ä–æ—á–∫–∏ —Å—Ç–∞–ª–∏ –¥–æ—Ä–æ...
2022-12-26 08:09:12+00:00,–¥–µ–π—Å—Ç–≤–∏—è –∫–æ–º–∞–Ω–¥—É—é—â–µ–≥–æ –≥—Ä—É–ø–ø–∏—Ä–æ–≤–∫–æ–π —É–∫—Ä–∞–∏–Ω–µ —Å—É—Ä...
2022-12-26 09:07:38+00:00,—Å—à–∞ –±—É–¥—É—Ç –ø–æ–≤—ã—à–∞—Ç—å —Å—Ç–∞–≤–∫–∏ –ø—Ä–µ–¥–µ–ª–∞ –≤–æ–µ–Ω–Ω—ã–π —ç–∫—Å–ø...
2022-12-26 10:26:21+00:00,–º–∏–Ω–∏—Å—Ç–µ—Ä—Å—Ç–≤–æ –æ–±–æ—Ä–æ–Ω—ã —Ä–µ—Å–ø—É–±–ª–∏–∫–∏ –∫–æ—Ä–µ—è —Å–æ–æ–±—â–∞–µ—Ç...


We also apply a length threshold (`threshhold=50`) to drop uninformative messages that are too short after preprocessing (50 characters or fewer).

In [18]:
# Minimum length (in characters of the processed text) a message must exceed to be kept.
threshhold = 50
processed_mes_2 = processed_mes_1.loc[processed_mes_1["message"].str.len() > threshhold]
processed_mes_2

Unnamed: 0_level_0,message
datetime,Unnamed: 1_level_1
2015-09-29 05:34:30+00:00,–ø—Ä–æ–±–ª–µ–º—ã –º–æ—Å–∫–æ–≤—Å–∫–æ–º –º–µ—Ç—Ä–æ —Å—Ç–∞–ª–∏ —Ç–µ–º–æ–π –∏–≥—Ä—ã izv...
2015-09-29 16:09:16+00:00,–ø–µ—Ä–≤—ã–π –∏—Å–∫ –º–æ—Å–≥–æ—Ä—Å—É–¥ –ø–æ–∂–∏–∑–Ω–µ–Ω–Ω–æ–π –±–ª–æ–∫–∏—Ä–æ–≤–∫–µ –ø–æ...
2015-09-30 05:33:00+00:00,—Ä–∞—Å—Å–º–æ—Ç—Ä–µ–Ω–∏–µ –¥–µ–ª —ç–∫—Å—Ç—Ä–µ–º–∏–∑–º–µ –ø–æ–¥–Ω–∏–º—É—Ç –≤—ã—Å—à–∏–π —É...
2015-09-30 07:44:23+00:00,–ø—É—Ç–∏–Ω –ø–æ–ø—Ä–æ—Å–∏–ª —Å–æ–≤—Ñ–µ–¥ —Ä–∞–∑—Ä–µ—à–∏—Ç—å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ ...
2015-09-30 08:42:28+00:00,–∞–º–µ—Ä–∏–∫–∞–Ω—Å–∫–æ–º —à—Ç–∞—Ç–µ –¥–∂–æ—Ä–¥–∂–∏—è –≤–ø–µ—Ä–≤—ã–µ –ª–µ—Ç –∫–∞–∑–Ω–∏–ª...
...,...
2022-12-26 08:06:48+00:00,–ø–æ–∑–¥—Ä–∞–≤–ª–µ–Ω–∏–µ –¥–µ–¥–∞ –º–æ—Ä–æ–∑–∞ —Å–Ω–µ–≥—É—Ä–æ—á–∫–∏ —Å—Ç–∞–ª–∏ –¥–æ—Ä–æ...
2022-12-26 08:09:12+00:00,–¥–µ–π—Å—Ç–≤–∏—è –∫–æ–º–∞–Ω–¥—É—é—â–µ–≥–æ –≥—Ä—É–ø–ø–∏—Ä–æ–≤–∫–æ–π —É–∫—Ä–∞–∏–Ω–µ —Å—É—Ä...
2022-12-26 09:07:38+00:00,—Å—à–∞ –±—É–¥—É—Ç –ø–æ–≤—ã—à–∞—Ç—å —Å—Ç–∞–≤–∫–∏ –ø—Ä–µ–¥–µ–ª–∞ –≤–æ–µ–Ω–Ω—ã–π —ç–∫—Å–ø...
2022-12-26 10:26:21+00:00,–º–∏–Ω–∏—Å—Ç–µ—Ä—Å—Ç–≤–æ –æ–±–æ—Ä–æ–Ω—ã —Ä–µ—Å–ø—É–±–ª–∏–∫–∏ –∫–æ—Ä–µ—è —Å–æ–æ–±—â–∞–µ—Ç...


In [19]:
%%time

top_words_by_message = []
for idx in range(processed_mes_2.shape[0]):
    sm_idx = tfidf_vectorizer(processed_mes_2.iloc[idx].tolist())
    sm_idx.index = [processed_mes_2.index[idx]]
    top_words_by_message.append(sm_idx)

Wall time: 16min 43s


In [20]:
# Number of per-message weight frames that were built.
len(top_words_by_message)

615878

In addition, we aggregate the data into periods of roughly 3 months, which makes the results easier to interpret and keeps
memory usage within the RAM limit:

    slicing_threshhold_3M = number of messages / number of 3-month periods = 615878 / (7 x 4) ≈ 22000  
    
*Note:* the data are split into chunks containing the average number of messages per 3 months, so the actual
calendar length of each period can vary.

In [158]:
%%time

counter, slicing_threshhold = 0, 22000 # 3M period
top_words_by_3M = []
df_3M = defaultdict(float)
for row_idx in tqdm(range(len(top_words_by_message))):
    counter += 1
    current_message = top_words_by_message[row_idx].iloc[0].to_dict()
    for key, item in current_message.items():
        df_3M[key] += item
    if counter > slicing_threshhold:
        top_words_by_3M.append((top_words_by_message[row_idx - counter + 1].index, df_3M))
        df_3M = defaultdict(float)
        counter = 0

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 615878/615878 [04:45<00:00, 2155.99it/s]

Wall time: 4min 45s





In [159]:
%%time

top_30_words_3M = []
for row_idx in tqdm(range(len(top_words_by_3M))):
    top_30_words_3M.append((top_words_by_3M[row_idx][0], 
                            sorted(top_words_by_3M[row_idx][1].items(), key=lambda item: item[1], reverse=True)[:30]))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27/27 [00:05<00:00,  4.70it/s]

Wall time: 5.75 s





In [160]:
def plot_results_idx(top_words_by_period, idx):
    """Build the bar traces for the word scores of one period.

    ``top_words_by_period[idx][1]`` (word/score pairs) is turned into a
    two-column frame (``index`` holds the words, ``value`` the scores) and
    drawn with ``px.bar``. Only the figure's ``data`` (its traces) is
    returned, so the traces can be added to another figure.
    """
    scores = dict(top_words_by_period[idx][1])
    frame = pd.Series(scores).rename("value").reset_index()
    figure = px.bar(
        frame,
        x="index",
        y="value",
        color="value",
        color_continuous_scale='Agsunset',
    )
    return figure.data

In [161]:
from plotly.subplots import make_subplots

# Date of the first message of every period, used to label the subplots.
period_starts = [period[0].strftime("%Y-%m-%d").values[0] for period in top_30_words_3M]

# A period ends where the next one starts.
titles = [
    f"Top words from {start} to {end}"
    for start, end in zip(period_starts, period_starts[1:])
]
# The last period has no successor to bound it; title it anyway so that every
# subplot is labelled (previously the last subplot had no title).
if period_starts:
    titles.append(f"Top words from {period_starts[-1]} onwards")

fig = make_subplots(rows=len(top_30_words_3M), cols=1, subplot_titles=titles)

In [162]:
# One subplot row per period: copy the bar traces of each period into the shared figure.
n_periods = len(top_30_words_3M)
for subplot_row in range(1, n_periods + 1):
    for trace in plot_results_idx(top_30_words_3M, subplot_row - 1):
        fig.add_trace(trace, row=subplot_row, col=1)

fig.update_layout(showlegend=False, width=1000, height=8000)
fig.show()

The cumulative `TF-IDF` plots above show that Russia, Putin, the main Russian regions, and viral events remain significant throughout the entire period.<br>
Mentions of the USA have also become more significant from the start of the coronavirus pandemic until now.<br>
Since the beginning of the war in 2022-02, terms such as territories, Ukraine, and defense have become more important.