In [1]:
# for tokenize
import nltk
import pandas as pd
import numpy as np
import re
import json
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# for lemmatization
import spacy
# Fetch the NLTK data packages this notebook relies on, then cache the
# English stop-word list as a set for fast membership tests in lemmatize().
for resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Load the report table; later cells read its 'main_text' and 'date' columns.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [3]:
def get_wordnet_pos(tag):
    """Translate a POS tag into the corresponding WordNet POS constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Parameters
    ----------
    tag : str
        Part-of-speech tag, e.g. as returned by ``nltk.pos_tag``.

    Returns
    -------
    The matching ``wordnet`` POS constant, or ``None`` when the tag starts
    with none of the four recognised letters.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return None

In [4]:
def lemmatize(report):
    """Normalise a text and return its lemmatized, stop-word-free tokens.

    Steps: lower-case, strip digit runs, strip every character that is neither
    a word character nor whitespace, tokenize, drop tokens found in
    ``stop_words``, POS-tag the remainder and lemmatize each token with the
    WordNet POS derived from its tag.

    Parameters
    ----------
    report : str
        Raw text to process. Any non-string value (for example ``None`` or the
        float ``NaN`` pandas uses for an empty CSV field) is treated as
        "no text".

    Returns
    -------
    list of str
        Lemmatized tokens in their original order; ``[]`` for non-string input.
    """
    # Guard against missing values so Series.apply(lemmatize) does not crash
    # with AttributeError on a single empty cell.
    if not isinstance(report, str):
        return []

    report = report.lower()
    text = re.sub(r'\d+', '', report)      # remove digit runs
    text = re.sub(r'[^\w\s]', '', text)    # remove punctuation / symbols

    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_tokens = []
    for token, tag in nltk.pos_tag(filtered_tokens):
        wordnet_tag = get_wordnet_pos(tag)
        if wordnet_tag is None:
            # No WordNet equivalent for this tag: keep the token unchanged.
            lemmatized_tokens.append(token)
        else:
            lemmatized_tokens.append(lemmatizer.lemmatize(token, pos=wordnet_tag))
    return lemmatized_tokens

In [6]:
# Run lemmatize() over every report body and store the resulting token lists
# in a new 'lemmatized_text' column next to the raw text.
main_texts = df['main_text']
df['lemmatized_text'] = main_texts.apply(lemmatize)
df

Unnamed: 0,date,title,full_url,main_html,main_text,lemmatized_text
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,/backgrounder/russia-ukraine-warning-update-in...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna ...","[mason, clark, george, barros, kateryna, stepa..."
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Stepa...","[mason, clark, george, barros, kateryna, stepa..."
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Katya Stepanen...","[mason, clark, george, barros, katya, stepanen..."
3,2022-02-27,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...","[mason, clark, george, barros, kateryna, stepa..."
4,2022-02-28,"Russian Offensive Campaign Assessment, Februar...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Ste...","[mason, clark, george, barros, kateryna, stepa..."
...,...,...,...,...,...,...
325,2023-01-21,"Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Karolina Hird, Grace Mappes, Angela Howard, ...","[karolina, hird, grace, mappes, angela, howard..."
326,2023-01-22,"Russian Offensive Campaign Assessment, Januar...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Russian Offensive Campaign Assessment, Januar...","[russian, offensive, campaign, assessment, jan..."
327,2023-01-23,"Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Russian Offensive Campaign Assessment, Januar...","[russian, offensive, campaign, assessment, jan..."
328,2023-01-24,"Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Karolina Hird, Riley Bailey, Grace Mappes, G...","[karolina, hird, riley, bailey, grace, mappes,..."


In [29]:
# Export date + token list as JSON Lines (one record per line) to data.json.
df[['date', 'lemmatized_text']].to_json(
    path_or_buf='data.json',
    orient='records',
    lines=True,
)

In [7]:
# TF-IDF

# TfidfVectorizer expects one string per document, so the token lists are
# re-joined with single spaces before fitting.
lemmatized_text_str = df['lemmatized_text'].apply(lambda x: ' '.join(x))
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(lemmatized_text_str)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Build one {term: weight} dict per document covering the whole vocabulary.
# Each sparse row is densified once and zipped with the vocabulary, instead of
# testing every feature index for membership and doing a scalar sparse lookup
# per term. Zero weights are stored as the int 0, exactly as before.
tfidf_dict_list = []
for i in range(tfidf_matrix.shape[0]):
    dense_row = tfidf_matrix[i].toarray().ravel()
    tfidf_dict_list.append({
        name: (weight if weight != 0 else 0)
        for name, weight in zip(feature_names, dense_row)
    })

df['tf-idf'] = tfidf_dict_list

In [8]:
# Vocabulary size: number of entries in the fitted TF-IDF feature array.
feature_names.shape[0]

14720

In [9]:
# Print the complete term -> weight mapping of the first row.
first_report_weights = df['tf-idf'].iloc[0]
print(first_report_weights)



In [10]:
# Display the frame again, now that the 'tf-idf' column has been added.
df

Unnamed: 0,date,title,full_url,main_html,main_text,lemmatized_text,tf-idf
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,/backgrounder/russia-ukraine-warning-update-in...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna ...","[mason, clark, george, barros, kateryna, stepa...","{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Stepa...","[mason, clark, george, barros, kateryna, stepa...","{'abachev': 0, 'abandon': 0.028046167518242994..."
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Katya Stepanen...","[mason, clark, george, barros, katya, stepanen...","{'abachev': 0, 'abandon': 0.02043425393431761,..."
3,2022-02-27,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...","[mason, clark, george, barros, kateryna, stepa...","{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."
4,2022-02-28,"Russian Offensive Campaign Assessment, Februar...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Ste...","[mason, clark, george, barros, kateryna, stepa...","{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."
...,...,...,...,...,...,...,...
325,2023-01-21,"Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Karolina Hird, Grace Mappes, Angela Howard, ...","[karolina, hird, grace, mappes, angela, howard...","{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."
326,2023-01-22,"Russian Offensive Campaign Assessment, Januar...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Russian Offensive Campaign Assessment, Januar...","[russian, offensive, campaign, assessment, jan...","{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."
327,2023-01-23,"Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Russian Offensive Campaign Assessment, Januar...","[russian, offensive, campaign, assessment, jan...","{'abachev': 0, 'abandon': 0.006953892330079333..."
328,2023-01-24,"Russian Offensive Campaign Assessment, January...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Karolina Hird, Riley Bailey, Grace Mappes, G...","[karolina, hird, riley, bailey, grace, mappes,...","{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."


In [11]:
# Keep only the date and the TF-IDF mapping; this is the table written to CSV.
result_df = df.loc[:, ['date', 'tf-idf']]
result_df

Unnamed: 0,date,tf-idf
0,2022-02-24,"{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."
1,2022-02-25,"{'abachev': 0, 'abandon': 0.028046167518242994..."
2,2022-02-26,"{'abachev': 0, 'abandon': 0.02043425393431761,..."
3,2022-02-27,"{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."
4,2022-02-28,"{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."
...,...,...
325,2023-01-21,"{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."
326,2023-01-22,"{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."
327,2023-01-23,"{'abachev': 0, 'abandon': 0.006953892330079333..."
328,2023-01-24,"{'abachev': 0, 'abandon': 0, 'abandoned': 0, '..."


In [13]:
# Write the date / TF-IDF table to result_data.csv, omitting the index.
result_df.to_csv(path_or_buf='result_data.csv', index=False)

In [14]:
def get_sparse_matrix(report):
    """Build a sentence-by-term count matrix for a single report.

    The report is split into sentences with ``sent_tokenize``; each sentence is
    run through ``lemmatize`` and its tokens are re-joined with spaces so that
    ``CountVectorizer`` can count them.

    Parameters
    ----------
    report : str
        Raw text of one report.

    Returns
    -------
    tuple
        ``(matrix, feature_names)``: the output of
        ``CountVectorizer.fit_transform`` over the sentence strings (one input
        document per sentence) and the matching
        ``get_feature_names_out()`` array.

    Notes
    -----
    ``CountVectorizer`` is used with its default settings, so it applies its
    own tokenization to the joined strings on top of ``lemmatize``.
    """
    sentences = sent_tokenize(report)

    # One space-separated string of lemmas per sentence.
    sentence_strings = [' '.join(lemmatize(sentence)) for sentence in sentences]

    vectorizer = CountVectorizer()
    matrix = vectorizer.fit_transform(sentence_strings)

    return matrix, vectorizer.get_feature_names_out()

In [15]:
# Sentence x term count table for the first report, densified for display.
# NOTE: this rebinds `feature_names`, replacing the TF-IDF vocabulary that the
# TF-IDF cell assigned to the same name.
report = df['main_text'].iat[0]
matrix, feature_names = get_sparse_matrix(report)
matrix_to_array = matrix.toarray()
df_matrix = pd.DataFrame(data=matrix_to_array, columns=feature_names)
df_matrix

Unnamed: 0,ability,access,achieve,across,additional,additionally,advance,aggression,aim,air,...,wear,week,west,western,willing,withdrawal,would,yet,zelensky,zone
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
67,0,0,0,0,0,0,0,0,0,2,...,1,0,0,0,0,0,0,0,0,0
68,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
69,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Saving the sparse matrix and its feature names (optional; uncomment the cell below to run it)

In [16]:
# with open('sparse_matrix.pkl', 'wb') as f:
#     pickle.dump(matrix, f)

# with open('feature_names.json', 'w') as f:
#     json.dump(feature_names.tolist(), f)