In [46]:
import regex as re
import calendar
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

In [3]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
isw_data = pd.read_csv("..\clean_data\isw.csv")

In [126]:
tg_messages = pd.read_csv("..\clean_data\preprocessed_tg.csv")

In [127]:
tg_messages.head()

Unnamed: 0,time,date,message
0,23:40:45,2023-01-25,ще декілька бпла на лінії фронту у запорізькій...
1,23:23:13,2023-01-25,збиваються цілі на півдні про кожну не пишемо
2,23:09:57,2023-01-25,дніпро робота ппо
3,22:58:42,2023-01-25,загрози ту22м3 на даний момент немає тільки бпла
4,22:41:15,2023-01-25,є збиття бпла по півдню


In [8]:
isw_data.head()

Unnamed: 0,date,title,url,html,main_text
0,24-02-2022,Russia-Ukraine Warning Update: Initial Russian...,https://www.understandingwar.org/backgrounder/...,"<!DOCTYPE html>\r\n\r\n<html dir=""ltr"" lang=""e...","february 24, 3:00 pm est russian president vla..."
1,25-02-2022,Russia-Ukraine Warning Update: Russian Offensi...,https://www.understandingwar.org/backgrounder/...,"<!DOCTYPE html>\r\n\r\n<html dir=""ltr"" lang=""e...",russian forces carried out additional air and ...
2,26-02-2022,Russia-Ukraine Warning Update: Russian Offensi...,https://www.understandingwar.org/backgrounder/...,"<!DOCTYPE html>\r\n\r\n<html dir=""ltr"" lang=""e...",russia has surprisingly failed to gain air sup...
3,27-02-2022,Russia-Ukraine Warning Update: Russian Offensi...,https://www.understandingwar.org/backgrounder/...,"<!DOCTYPE html>\r\n\r\n<html dir=""ltr"" lang=""e...","february 27, 4pm est the russian military has ..."
4,28-02-2022,"Russian Offensive Campaign Assessment, Februar...",https://www.understandingwar.org/backgrounder/...,"<!DOCTYPE html>\r\n\r\n<html dir=""ltr"" lang=""e...","february 28, 3:30pm est the russian military i..."


In [137]:
# test text vectorization 
isw_text_data = isw_data[["date", "main_text"]]

In [10]:
isw_text_data.head()

Unnamed: 0,date,main_text
0,24-02-2022,"february 24, 3:00 pm est russian president vla..."
1,25-02-2022,russian forces carried out additional air and ...
2,26-02-2022,russia has surprisingly failed to gain air sup...
3,27-02-2022,"february 27, 4pm est the russian military has ..."
4,28-02-2022,"february 28, 3:30pm est the russian military i..."


In [11]:
isw_text_data["main_text"][isw_text_data["main_text"].isnull()]

273    NaN
304    NaN
311    NaN
Name: main_text, dtype: object

In [138]:
isw_text_data["main_text"] = isw_text_data["main_text"].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Remove:
1) Stop words.
2) Digits.
3) Non-word and non-whitespace characters.

In [47]:
def to_vector_preprocessing(text, stop_words = []):
    if not stop_words:
        stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    text_array = word_tokenize(re.sub('[\W\s\d]', ' ', text.lower()))
    processed_text = ' '.join( 
            [
            word for word in text_array
            if (len(word) > 2) and (word not in stop_words) 
            ])
    return processed_text

In [14]:
def tfidf_vectorizer(_corpus):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(_corpus)
    sparse_matrix = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())
    return sparse_matrix

In [60]:
langs = ['english', 'russian']
stop_words = ['russian'] + list(map(lambda elem: elem.lower(), calendar.month_name))[1:]
for lang in langs:
    stop_words += stopwords.words(lang)

In [140]:
processed_text = isw_text_data["main_text"].apply(lambda row: to_vector_preprocessing(row, stop_words))

In [141]:
sm = tfidf_vectorizer(processed_text.tolist())

In [144]:
sm['date'] = isw_text_data['date']

In [77]:
def plot_top_by_doc(df, n=5):
    fig, ax = plt.subplots(n, figsize=(6, 30))
    for i in range(n):
        df.iloc[i, :].sort_values(ascending=False)[:10].plot.barh(
            ax=ax[i], 
            cmap="jet", 
            title=f"Doc {i}").invert_yaxis()
    plt.subplots_adjust(hspace=0.4)

In [76]:
tg_messages.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,message
time,date,Unnamed: 2_level_1
23:40:45,2023-01-25,ще декілька бпла на лінії фронту у запорізькій...
23:23:13,2023-01-25,збиваються цілі на півдні про кожну не пишемо
23:09:57,2023-01-25,дніпро робота ппо
22:58:42,2023-01-25,загрози ту22м3 на даний момент немає тільки бпла
22:41:15,2023-01-25,є збиття бпла по півдню


In [155]:
tg_processed = tg_messages['message'].apply(lambda row: to_vector_preprocessing(row))

In [156]:
sm_tg = tfidf_vectorizer(tg_processed.tolist())

In [157]:
sm_tg[['date', 'time']] = tg_messages[['date', 'time']]

In [158]:
sm_tg.head()

Unnamed: 0,_bot,afp,agm,ahs,air,akinci,alexander,amev,anpsq,armed,...,їзди,їнки,їхала,їхньому,їхня,їхні,їхніх,ґрунтовних,date,time
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,23:40:45
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,23:23:13
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,23:09:57
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,22:58:42
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-01-25,22:41:15


In [154]:
sm.head()

Unnamed: 0,abachev,abandon,abandoned,abandoning,abandonment,abbreviated,abc,abdollahian,abduct,abducted,...,дивизион,кедр,коридор,набор,окремі,переселения,программа,підрозділи,рбк,сухопутный
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.063386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
print("ISW processed shape: ", sm.shape)
print("Telegram processed shape: ", sm_tg.shape)

ISW processed shape:  (336, 15140)
Telegram processed shape:  (4878, 9298)


In [165]:
sm.merge(sm_tg, how='outer', on="date")[['date', 'time_y']]

Unnamed: 0,date,time_y
0,24-02-2022,
1,25-02-2022,
2,26-02-2022,
3,27-02-2022,
4,28-02-2022,
...,...,...
331,21-01-2023,
332,22-01-2023,
333,23-01-2023,
334,24-01-2023,
