In [92]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter

from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [93]:
# Uncomment and process once

#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

# Define correction funcs

In [94]:
def remove_one_letter_word(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + ' ' + w                                                                                                                                                                                                                                                                                             
    return new_text

def convert_lower_case(data):
      return np.char.lower(data)

             
def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    stop_stop_words = {"no","not"}
    stop_words = stop_words - stop_stop_words                        

    words = word_tokenize(str(data))

    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text +" "+ w
    return new_text

def remove_punctuation(data):
    symbols = "!\"#$%&()*+—./:;<=>7@[\]^_'{|}~\n"

    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")

    data = np.char.replace(data, ',', "")

    return data

def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

def remove_map_line(data):
    return np.char.replace(data, "Click here to expand the map below.", "")

def remove_map_line_v2(data):
    return np.char.replace(data, "Click here to see ISW’s interactive map of the Russian invasion of Ukraine. This map is updated daily alongside the static maps present in this report.", "")

def convert_numbers(data):

    tokens = word_tokenize(str(data))
    new_text = " "
    for w in tokens:
        if w.isdigit():
            if int(w)<1000000000000:
                w = num2words (w)
            else:
                w = ''
        new_text = new_text +" " + w
    new_text = np.char.replace(new_text, "-", " ")
                                                
    return new_text

def stemming(data):
    stemmer= PorterStemmer()

    tokens = word_tokenize(str(data))

    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

def lemmatizing(data):
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + lemmatizer.lemmatize(w)
    return new_text

In [95]:
def preprocess(data, word_root_algo="lemm"):
    data = remove_map_line(data)
    data = remove_map_line_v2(data)
    data = remove_one_letter_word(data)
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe (data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers (data)

    if word_root_algo == "lemm":
        print ("lennatizing")
        data = lemmatizing(data) #needed again as we need to lemmatize the words
    else:
        print("stemming")
        data = stemming(data) #needed again as we need to stem the words

    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one

    return data

In [96]:
def create_bag_of_words(text):
    """
    Creates a bag of words (a dictionary of word frequencies) from a string.
    """
    # Split the text into words
    words = text.split()

    # Initialize an empty dictionary
    bag_of_words = {}

    # Loop over each word and count its frequency
    for word in words:
        if word in bag_of_words:
            bag_of_words[word] += 1
        else:
            bag_of_words[word] = 1

    return bag_of_words

In [97]:
def calculate_term_frequency(bag_of_words):
   
    # Calculate the total number of words in the bag
    total_words = sum(bag_of_words.values())

    # Initialize an empty dictionary
    term_frequency = {}

    # Loop over each word in the bag and calculate its term frequency
    for word, frequency in bag_of_words.items():
        term_frequency[word] = frequency / total_words

    return term_frequency

In [98]:
INPUT_FOLDER = "Reports"
OUTPUT_FOLDER = "data/2_isw_preprocessed"
OUTPUT_FILE = "all_days.csv"
import glob
files_by_days = glob.glob(f"{INPUT_FOLDER}/*.html")
#files_by_days = glob.glob("01-02-2023.html")

In [99]:
from bs4 import BeautifulSoup
import re

all_data = []

for file in files_by_days:
    name = file.split(".")[0].lstrip(INPUT_FOLDER)
    name = name.replace("\\", "") 
    print(name)  #
    d = {} 
# Open the HTML file
    with open(file, encoding="utf8") as file:
            

            soup = BeautifulSoup(file, 'html.parser')

            # Extract the text from the HTML
            text = soup.get_text()

            # Find the index of the first occurrence of "ET"
            index = text.find("ET")

            # Extract the text after the first occurrence of "ET"
            text = text[2+index:]
            index = text.rfind("[1]")
            text =text[:index]
            text = re.sub(r'\[.*?\]', '', text)
            # Print the text
            #print(text)
            lemm = preprocess(text)
            stemm = preprocess(text, "stemm")
            
            #print(text)
            #sample = create_bag_of_words(text)
            #sample_fr = calculate_term_frequency(sample)
            
            d = {
                "date":name,
                "text":text,
                "lemm":lemm,
                "stemm":stemm
            }
            all_data.append(d)

df = pd.DataFrame.from_dict(all_data)
df = df.sort_values(by = ['date'])
df.head(5)

2022-03-01
lennatizing
stemming
2022-03-02
lennatizing
stemming
2022-03-03
lennatizing
stemming
2022-03-04
lennatizing
stemming
2022-03-05
lennatizing
stemming
2022-03-06
lennatizing
stemming
2022-03-07
lennatizing
stemming
2022-03-08
lennatizing
stemming
2022-03-09
lennatizing
stemming
2022-03-10
lennatizing
stemming
2022-03-11
lennatizing
stemming
2022-03-12
lennatizing
stemming
2022-03-13
lennatizing
stemming
2022-03-14
lennatizing
stemming
2022-03-15
lennatizing
stemming
2022-03-16
lennatizing
stemming
2022-03-17
lennatizing
stemming
2022-03-18
lennatizing
stemming
2022-03-19
lennatizing
stemming
2022-03-20
lennatizing
stemming
2022-03-21
lennatizing
stemming
2022-03-22
lennatizing
stemming
2022-03-23
lennatizing
stemming
2022-03-24
lennatizing
stemming
2022-03-25
lennatizing
stemming
2022-03-26
lennatizing
stemming
2022-03-27
lennatizing
stemming
2022-03-28
lennatizing
stemming
2022-03-29
lennatizing
stemming
2022-03-30
lennatizing
stemming
2022-03-31
lennatizing
stemming
2022-04-

Unnamed: 0,date,text,lemm,stemm
0,2022-03-01,"SNAZ units throughout Kherson City on March 1,...",snaz unit throughout kherson citi march ukrai...,snaz unit throughout kherson citi march ukrai...
1,2022-03-02,"SNAZ units throughout Kherson City on March 1,...",snaz unit throughout kherson citi march inclu...,snaz unit throughout kherson citi march inclu...
2,2022-03-03,\n\n\n\n\n\n\n\n\nRussian Offensive Campaign A...,russian offens campaign ass march institut st...,russian offen campaign assess march institut ...
3,2022-03-04,\n\n\n\n\n\n\n\n\nRussian Offensive Campaign A...,russian offens campaign ass march institut st...,russian offen campaign assess march institut ...
4,2022-03-05,\n\n\n\n\n\n\n\n\nRussian Offensive Campaign A...,russian offens campaign ass march institut st...,russian offen campaign assess march institut ...


In [100]:
df.to_csv(f"{OUTPUT_FOLDER}/{OUTPUT_FILE}", sep=";", index=False)

In [101]:
docs = df['lemm'].tolist()

In [102]:
len(docs)

389

In [103]:
cv = CountVectorizer(max_df = 0.98, min_df = 2)
word_count_vector = cv.fit_transform(docs)

word_count_vector.shape

(389, 7198)

In [104]:
word_count_vector

<389x7198 sparse matrix of type '<class 'numpy.int64'>'
	with 243496 stored elements in Compressed Sparse Row format>

In [105]:
#model folder needed to be created previously !!!
with open("model/count_vectorizer_v2.pkl", 'wb') as handle:
    pickle.dump(cv, handle)

# TF_IDF


In [106]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
tfidf_transformer.fit(word_count_vector)

In [107]:
with open("model/tfidf_transformer_v2.pkl", 'wb') as handle:
    pickle.dump(tfidf_transformer, handle)

In [108]:
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(), columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
luhansk,1.020726
kharkiv,1.023347
staff,1.023347
posit,1.025975
advanc,1.025975
...,...
quicken,5.867534
quo,5.867534
comb,5.867534
quarrel,5.867534


In [109]:
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

In [110]:
tf_idf_vector

<389x7198 sparse matrix of type '<class 'numpy.float64'>'
	with 243496 stored elements in Compressed Sparse Row format>