In [1]:
import regex as re
import calendar
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [4]:
# Forward slashes work on every OS and avoid the invalid "\c" / "\i" escape
# sequences that the backslash-separated literal contained.
isw_data = pd.read_csv("../clean_data/isw.csv")

In [5]:
# Forward slashes work on every OS and avoid the invalid "\c" / "\p" escape
# sequences that the backslash-separated literal contained.
tg_messages = pd.read_csv("../clean_data/preprocessed_tg.csv")

In [6]:
# Preview the first rows of the Telegram messages frame.
tg_messages.head()

Unnamed: 0,time,date,message
0,23:40:45,2023-01-25,ще декілька бпла на лінії фронту у запорізькій...
1,23:23:13,2023-01-25,збиваються цілі на півдні про кожну не пишемо
2,23:09:57,2023-01-25,дніпро робота ппо
3,22:58:42,2023-01-25,загрози ту22м3 на даний момент немає тільки бпла
4,22:41:15,2023-01-25,є збиття бпла по півдню


In [7]:
# Preview the first rows of the ISW reports frame.
isw_data.head()

Unnamed: 0,date,title,url,html,main_text
0,24-02-2022,Russia-Ukraine Warning Update: Initial Russian...,https://www.understandingwar.org/backgrounder/...,"<!DOCTYPE html>\r\n\r\n<html dir=""ltr"" lang=""e...","february 24, 3:00 pm est russian president vla..."
1,25-02-2022,Russia-Ukraine Warning Update: Russian Offensi...,https://www.understandingwar.org/backgrounder/...,"<!DOCTYPE html>\r\n\r\n<html dir=""ltr"" lang=""e...",russian forces carried out additional air and ...
2,26-02-2022,Russia-Ukraine Warning Update: Russian Offensi...,https://www.understandingwar.org/backgrounder/...,"<!DOCTYPE html>\r\n\r\n<html dir=""ltr"" lang=""e...",russia has surprisingly failed to gain air sup...
3,27-02-2022,Russia-Ukraine Warning Update: Russian Offensi...,https://www.understandingwar.org/backgrounder/...,"<!DOCTYPE html>\r\n\r\n<html dir=""ltr"" lang=""e...","february 27, 4pm est the russian military has ..."
4,28-02-2022,"Russian Offensive Campaign Assessment, Februar...",https://www.understandingwar.org/backgrounder/...,"<!DOCTYPE html>\r\n\r\n<html dir=""ltr"" lang=""e...","february 28, 3:30pm est the russian military i..."


In [8]:
# Keep only the columns needed for the text-vectorization experiment.
isw_text_data = isw_data.loc[:, ["date", "main_text"]]

In [9]:
# Preview the reduced ISW frame (date + article text).
isw_text_data.head()

Unnamed: 0,date,main_text
0,24-02-2022,"february 24, 3:00 pm est russian president vla..."
1,25-02-2022,russian forces carried out additional air and ...
2,26-02-2022,russia has surprisingly failed to gain air sup...
3,27-02-2022,"february 27, 4pm est the russian military has ..."
4,28-02-2022,"february 28, 3:30pm est the russian military i..."


In [10]:
# List the `main_text` entries that are missing, using one label-based lookup.
isw_text_data.loc[isw_text_data["main_text"].isnull(), "main_text"]

273    NaN
304    NaN
311    NaN
Name: main_text, dtype: object

In [55]:
# Replace every missing value in the frame with an empty string; the null check
# found missing entries in `main_text`, and preprocessing calls `.lower()` on each value.
isw_text_data = isw_text_data.fillna('')

Preprocessing removes:
1) Stop words.
2) Digits.
3) Punctuation and any other characters that are neither word characters nor whitespace.
4) Tokens of two characters or fewer.

In [12]:
def to_vector_preprocessing(text, stop_words=None):
    """Normalize raw text into a space-separated string of tokens for vectorization.

    The text is lower-cased, every non-word character, whitespace character and
    digit is replaced by a space, and the result is tokenized with NLTK's
    ``word_tokenize``. Tokens of two characters or fewer and tokens found in
    ``stop_words`` are dropped. No lemmatization is applied.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : iterable of str, optional
        Words to drop. When omitted or empty, NLTK's English stop-word list
        is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # `None` replaces the former mutable `[]` default; an empty value still
    # falls back to the English list exactly as before.
    if not stop_words:
        stop_words = stopwords.words("english")
    # A set gives constant-time membership tests; the list lookup was linear per token.
    stop_words = set(stop_words)
    # Raw string: "\W", "\s" and "\d" are regex classes, not Python string escapes.
    tokens = word_tokenize(re.sub(r'[\W\s\d]', ' ', text.lower()))
    return ' '.join(
        token for token in tokens
        if len(token) > 2 and token not in stop_words
    )

In [13]:
def tfidf_vectorizer(_corpus):
    """Fit a TF-IDF model on ``_corpus`` and return the weights as a dense DataFrame.

    Parameters
    ----------
    _corpus : iterable of str
        The documents to vectorize, one string per document.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term, holding the
        TF-IDF weights produced by a default ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(_corpus)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # fall back to it only on versions that predate get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # toarray() yields a plain ndarray instead of the legacy np.matrix from todense().
    return pd.DataFrame(tfidf.toarray(), columns=feature_names)

In [14]:
# Stop words: the token "russian", the lower-cased month names, and NLTK's
# stop-word list for each language in `langs`.
langs = ['english', 'russian']
month_names = [name.lower() for name in calendar.month_name][1:]  # skip the entry at index 0
stop_words = ['russian', *month_names]
for lang in langs:
    stop_words.extend(stopwords.words(lang))

In [15]:
# Clean every ISW article with the combined stop-word list built for this corpus.
processed_text = isw_text_data["main_text"].apply(to_vector_preprocessing, args=(stop_words,))

In [16]:
# Build the dense TF-IDF frame for the cleaned ISW articles (one row per article).
sm = tfidf_vectorizer(processed_text.tolist())

In [17]:
# Attach each article's date to its TF-IDF row; pandas aligns the values by index label.
# NOTE(review): if the vocabulary contains the token "date", this assignment overwrites
# that feature column. The `time_y` suffix produced by the later merge shows that the
# ISW vocabulary does collide with metadata column names — verify.
sm['date'] = isw_text_data['date']

In [18]:
def plot_top_by_doc(df, n=5, top_k=10, figsize=(6, 30)):
    """Draw one horizontal bar chart per document showing its highest-valued columns.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document. Each plotted row is sorted by value, so its
        columns must be mutually comparable (e.g. all numeric).
    n : int, default 5
        Number of leading rows of ``df`` to plot, one subplot each.
    top_k : int, default 10
        Number of highest-valued columns shown per document.
    figsize : tuple of float, default (6, 30)
        Size of the whole figure, passed to ``plt.subplots``.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib backend.
    """
    # squeeze=False keeps `axes` two-dimensional, so indexing also works when
    # n == 1 (plt.subplots would otherwise return a bare Axes object).
    fig, axes = plt.subplots(n, figsize=figsize, squeeze=False)
    for i in range(n):
        top_terms = df.iloc[i, :].sort_values(ascending=False).iloc[:top_k]
        # invert_yaxis() puts the largest value at the top of the chart.
        top_terms.plot.barh(
            ax=axes[i, 0],
            cmap="jet",
            title=f"Doc {i}").invert_yaxis()
    plt.subplots_adjust(hspace=0.4)

In [19]:
# Look at the Telegram frame again before vectorizing its `message` column.
tg_messages.head()

Unnamed: 0,time,date,message
0,23:40:45,2023-01-25,ще декілька бпла на лінії фронту у запорізькій...
1,23:23:13,2023-01-25,збиваються цілі на півдні про кожну не пишемо
2,23:09:57,2023-01-25,дніпро робота ппо
3,22:58:42,2023-01-25,загрози ту22м3 на даний момент немає тільки бпла
4,22:41:15,2023-01-25,є збиття бпла по півдню


In [115]:
# Clean the Telegram messages. No stop-word list is passed, so the function
# falls back to NLTK's English list.
# NOTE(review): the previewed messages are Ukrainian — confirm that English
# stop words are what is intended here.
tg_processed = tg_messages['message'].apply(to_vector_preprocessing)

In [116]:
# Build the dense TF-IDF frame for the cleaned Telegram messages (one row per message).
sm_tg = tfidf_vectorizer(tg_processed.tolist())

In [117]:
# Attach each message's date and time to its TF-IDF row; pandas aligns the values by index label.
# NOTE(review): any vocabulary term named "date" or "time" would be overwritten by
# this assignment — verify that neither token occurs in the Telegram vocabulary.
sm_tg[['date', 'time']] = tg_messages[['date', 'time']]

In [118]:
# Preview the Telegram TF-IDF frame with its date/time columns.
sm_tg.head()

Unnamed: 0,_bot,afp,agm,ahs,air,akinci,alexander,amev,anpsq,armed,...,їзди,їнки,їхала,їхньому,їхня,їхні,їхніх,ґрунтовних,date,time
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,23:40:45
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,23:23:13
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,23:09:57
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,22:58:42
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,22:41:15


In [52]:
# Inspect the rows ordered by date. The sorted frame is only displayed;
# `sm_tg` itself is not reassigned, so its row order is unchanged.
sm_tg.sort_values(by='date', ascending=True)

Unnamed: 0,_bot,afp,agm,ahs,air,akinci,alexander,amev,anpsq,armed,...,їзди,їнки,їхала,їхньому,їхня,їхні,їхніх,ґрунтовних,date,time
4877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-04-29,14:24:57
4876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-04-29,20:14:53
4874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-04-30,19:51:35
4875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-04-30,11:13:45
4873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-05-01,21:47:53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,11:09:16
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,10:55:46
31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,10:00:35
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,19:07:49


In [24]:
# Preview the ISW TF-IDF frame.
sm.head()

Unnamed: 0,abachev,abandon,abandoned,abandoning,abandonment,abbreviated,abc,abdollahian,abduct,abducted,...,дивизион,кедр,коридор,набор,окремі,переселения,программа,підрозділи,рбк,сухопутный
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.063386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Report (rows, columns) of both TF-IDF frames; the column counts include
# the date/time metadata columns attached earlier.
print("ISW processed shape: ", sm.shape)
print("Telegram processed shape: ", sm_tg.shape)

ISW processed shape:  (336, 15128)
Telegram processed shape:  (4878, 9300)


In [133]:
# Convert the date strings to datetimes so both frames can be merged on a common key.
sm["date"] = pd.to_datetime(sm["date"], format='%d-%m-%Y')
sm_tg["date"] = pd.to_datetime(sm_tg["date"], format='%Y-%m-%d')
# Keep only the hour of each message. The hour is parsed from the untouched
# `tg_messages["time"]` strings (the values `sm_tg["time"]` was copied from)
# rather than from `sm_tg["time"]` itself, so re-running this cell does not try
# to parse already-converted integers.
sm_tg["time"] = pd.to_datetime(tg_messages["time"], format='%H:%M:%S').dt.hour

In [132]:
# `time` already holds the integer hour (extracted when the column was parsed),
# so the `.dt` accessor would raise AttributeError here; display the column directly.
sm_tg["time"]

0       23
1       23
2       23
3       22
4       22
        ..
4873    21
4874    19
4875    11
4876    20
4877    14
Name: time, Length: 4878, dtype: int64

In [134]:
# Outer-join the ISW and Telegram TF-IDF frames on the calendar date, keeping
# dates that appear in only one of the two sources.
# Columns present in both frames (other than "date") receive pandas' default
# _x/_y suffixes, which is why the Telegram hour column is named `time_y` afterwards.
merged_sm = sm.merge(sm_tg, how='outer', on="date")

In [135]:
# Index the merged frame by (date, Telegram hour).
# NOTE: this moves both columns into the index, so re-running the cell without
# re-running the merge first raises a KeyError.
merged_sm = merged_sm.set_index(['date', 'time_y'])

In [136]:
# Fill missing feature values left by the outer join with 0.
# Only the columns are filled; missing values in the index levels are handled separately.
merged_sm = merged_sm.fillna(0)

In [140]:
# Pull the MultiIndex out as a regular frame so the hour level can be inspected.
sm_index = merged_sm.index.to_frame(index=False)
time_index = sm_index.loc[:, 'time_y']
# Mean of the hours that are actually present (missing entries are excluded).
time_index_mean = time_index.dropna().mean()

In [150]:
# Replace missing index values with the mean hour, then install the result as
# the frame's index with the levels renamed to (date, time).
filled_index = sm_index.fillna(time_index_mean)
merged_sm.index = pd.MultiIndex.from_frame(filled_index, names=['date', 'time'])

In [151]:
# Display the merged frame, indexed by (date, time).
merged_sm

Unnamed: 0_level_0,Unnamed: 1_level_0,abachev,abandon,abandoned,abandoning,abandonment,abbreviated,abc,abdollahian,abduct,abducted,...,їжа,їзд,їзди,їнки,їхала,їхньому,їхня,їхні,їхніх,ґрунтовних
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022-02-24,12.204592,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-02-25,12.204592,0.0,0.0,0.063386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-02-26,12.204592,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-02-27,12.204592,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-02-28,12.204592,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-25,12.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-01-25,12.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-01-25,11.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-01-25,10.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
