In [17]:
import os
import re
import string
import copy
import math
import glob
import pickle

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter

from num2words import num2words

from bs4 import BeautifulSoup

In [10]:
# Uncomment and process once

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Stepan_Kalika\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Stepan_Kalika\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Stepan_Kalika\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Stepan_Kalika\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Define correction funcs

In [11]:
def remove_one_letter_word(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + ' ' + w                                                                                                                                                                                                                                                                                             
    return new_text

def convert_lower_case(data):
      return np.char.lower(data)

             
def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    stop_stop_words = {"no","not"}
    stop_words = stop_words - stop_stop_words                        

    words = word_tokenize(str(data))

    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text +" "+ w
    return new_text

def remove_punctuation(data):
    symbols = "!\"#$%&()*+—./:;<=>7@[\]^_'{|}~\n"

    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")

    data = np.char.replace(data, ',', "")

    return data

def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

def remove_map_line(data):
    return np.char.replace(data, "Click here to expand the map below.", "")

def remove_map_line_v2(data):
    return np.char.replace(data, "Click here to see ISW’s interactive map of the Russian invasion of Ukraine. This map is updated daily alongside the static maps present in this report.", "")

def convert_numbers(data):

    tokens = word_tokenize(str(data))
    new_text = " "
    for w in tokens:
        if w.isdigit():
            if int(w)<1000000000000:
                w = num2words (w)
            else:
                w = ''
        new_text = new_text +" " + w
    new_text = np.char.replace(new_text, "-", " ")
                                                
    return new_text

def stemming(data):
    stemmer= PorterStemmer()

    tokens = word_tokenize(str(data))

    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

def lemmatizing(data):
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + lemmatizer.lemmatize(w)
    return new_text

In [12]:
def preprocess(data, word_root_algo="lemm"):
    data = remove_map_line(data)
    data = remove_map_line_v2(data)
    data = remove_one_letter_word(data)
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe (data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers (data)

    if word_root_algo == "lemm":
        print ("lennatizing")
        data = lemmatizing(data) #needed again as we need to lemmatize the words
    else:
        print("stemming")
        data = stemming(data) #needed again as we need to stem the words

    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one

    return data

In [13]:
def create_bag_of_words(text):
    """
    Creates a bag of words (a dictionary of word frequencies) from a string.
    """
    # Split the text into words
    words = text.split()

    # Initialize an empty dictionary
    bag_of_words = {}

    # Loop over each word and count its frequency
    for word in words:
        if word in bag_of_words:
            bag_of_words[word] += 1
        else:
            bag_of_words[word] = 1

    return bag_of_words

In [14]:
def calculate_term_frequency(bag_of_words):
   
    # Calculate the total number of words in the bag
    total_words = sum(bag_of_words.values())

    # Initialize an empty dictionary
    term_frequency = {}

    # Loop over each word in the bag and calculate its term frequency
    for word, frequency in bag_of_words.items():
        term_frequency[word] = frequency / total_words

    return term_frequency

In [15]:
INPUT_FOLDER = "Reports"
OUTPUT_FOLDER = "data/2_isw_preprocessed"
OUTPUT_FILE = "all_days.csv"

files_by_days = glob.glob(f"{INPUT_FOLDER}/*.html")

In [19]:
all_data = []

for file in files_by_days:
    name = file.split(".")[0].lstrip(INPUT_FOLDER)
    name = name.replace("\\", "") 
    print(name)  #
    d = {} 
# Open the HTML file
    with open(file, encoding="utf8") as file:
            soup = BeautifulSoup(file, 'html.parser')

            # Extract the text from the HTML
            text = soup.get_text()

            # Find the index of the first occurrence of "ET"
            index = text.find("ET")

            # Extract the text after the first occurrence of "ET"
            text = text[2+index:]
            index = text.rfind("[1]")
            text =text[:index]
            text = re.sub(r'\[.*?\]', '', text)

            lemm = preprocess(text)
            stemm = preprocess(text, "stemm")
                        
            d = {
                "date":name,
                "text":text,
                "lemm":lemm,
                "stemm":stemm
            }
            all_data.append(d)

df = pd.DataFrame.from_dict(all_data)
df = df.sort_values(by = ['date'])
df.head(5)

2022-02-24
lennatizing
stemming
2022-02-25
lennatizing
stemming
2022-02-26
lennatizing
stemming
2022-02-27
lennatizing
stemming
2022-02-28
lennatizing
stemming
2022-03-01
lennatizing
stemming
2022-03-02
lennatizing
stemming
2022-03-03
lennatizing
stemming
2022-03-04
lennatizing
stemming
2022-03-05
lennatizing
stemming
2022-03-06
lennatizing
stemming
2022-03-07
lennatizing
stemming
2022-03-08
lennatizing
stemming
2022-03-09
lennatizing
stemming
2022-03-10
lennatizing
stemming
2022-03-11
lennatizing
stemming
2022-03-12
lennatizing
stemming
2022-03-13
lennatizing
stemming
2022-03-14
lennatizing
stemming
2022-03-15
lennatizing
stemming
2022-03-16
lennatizing
stemming
2022-03-17
lennatizing
stemming
2022-03-18
lennatizing
stemming
2022-03-19
lennatizing
stemming
2022-03-20
lennatizing
stemming
2022-03-21
lennatizing
stemming
2022-03-22
lennatizing
stemming
2022-03-23
lennatizing
stemming
2022-03-24
lennatizing
stemming
2022-03-25
lennatizing
stemming
2022-03-26
lennatizing
stemming
2022-03-

lennatizing
stemming
2022-11-08
lennatizing
stemming
2022-11-09
lennatizing
stemming
2022-11-10
lennatizing
stemming
2022-11-11
lennatizing
stemming
2022-11-12
lennatizing
stemming
2022-11-13
lennatizing
stemming
2022-11-14
lennatizing
stemming
2022-11-15
lennatizing
stemming
2022-11-16
lennatizing
stemming
2022-11-17
lennatizing
stemming
2022-11-18
lennatizing
stemming
2022-11-19
lennatizing
stemming
2022-11-20
lennatizing
stemming
2022-11-21
lennatizing
stemming
2022-11-22
lennatizing
stemming
2022-11-23
lennatizing
stemming
2022-11-25
lennatizing
stemming
2022-11-26
lennatizing
stemming
2022-11-27
lennatizing
stemming
2022-11-28
lennatizing
stemming
2022-11-29
lennatizing
stemming
2022-11-30
lennatizing
stemming
2022-12-01
lennatizing
stemming
2022-12-02
lennatizing
stemming
2022-12-03
lennatizing
stemming
2022-12-04
lennatizing
stemming
2022-12-05
lennatizing
stemming
2022-12-06
lennatizing
stemming
2022-12-07
lennatizing
stemming
2022-12-08
lennatizing
stemming
2022-12-09
lennatiz

Unnamed: 0,date,text,lemm,stemm
0,2022-02-24,(Ukraine local time) that Russian forces have...,ukrain local time russian forc not achiev bre...,ukrain local time russian forc not achiev bre...
1,2022-02-25,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...
2,2022-02-26,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...
3,2022-02-27,\n\n\n\n\n\n\n\n\nRussia-Ukraine Warning Updat...,russia ukrain warn updat russian offens campa...,russia ukrain warn updat russian offen campai...
4,2022-02-28,\n\n\n\n\n\n\n\n\nRussian Offensive Campaign A...,russian offens campaign ass februari twenti e...,russian offen campaign assess februari twenti...


In [21]:
df.to_csv(f"{OUTPUT_FOLDER}/{OUTPUT_FILE}", sep=";", index=False)

In [22]:
docs = df['lemm'].tolist()

In [23]:
len(docs)

401

In [24]:
cv = CountVectorizer(max_df = 0.98, min_df = 2)
word_count_vector = cv.fit_transform(docs)

word_count_vector.shape

(401, 7274)

In [25]:
word_count_vector

<401x7274 sparse matrix of type '<class 'numpy.int64'>'
	with 249532 stored elements in Compressed Sparse Row format>

In [26]:
#model folder needed to be created previously !!!
with open("model/count_vectorizer_v2.pkl", 'wb') as handle:
    pickle.dump(cv, handle)

# TF_IDF


In [27]:
tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
tfidf_transformer.fit(word_count_vector)

In [28]:
with open("model/tfidf_transformer_v2.pkl", 'wb') as handle:
    pickle.dump(tfidf_transformer, handle)

In [29]:
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(), columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
luhansk,1.022642
area,1.025190
staff,1.027745
kharkiv,1.027745
advanc,1.027745
...,...
quarrel,5.897840
chukotka,5.897840
quash,5.897840
pyatnashka,5.897840


In [30]:
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

In [31]:
tf_idf_vector

<401x7274 sparse matrix of type '<class 'numpy.float64'>'
	with 249532 stored elements in Compressed Sparse Row format>