# ISW preprocessing

### Install packages

In [34]:
# to be uncommented if dependencies are not installed

# %pip install nltk num2words scikit-learn pandas numpy python-dotenv

### Import and download all dependecies

In [35]:
import pandas as pd
import numpy as np
import nltk
import string
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from num2words import num2words
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import pandas as pd
import string
from zipfile import ZipFile, ZipInfo
from pathlib import Path
import os

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lap2r\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lap2r\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lap2r\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lap2r\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Define preprocessing functions

In [36]:
# Functions
def to_lower_case(text):
  return "".join([i.lower() for i in text])

stop_punctuation = string.punctuation
def remove_punctuation(text):
  return "".join([i for i in text if i not in stop_punctuation])

def remove_long_dash(text):
  return re.sub(r'—', ' ', text)

def remove_urls(text):
  return re.sub(r'http\S+', '', text)

def remove_one_letter_words(tokens):
  return list(filter(lambda token: len(token) > 1, tokens))

def tokenize_text(text):
  return nltk.tokenize.word_tokenize(text)

stop_words = set(nltk.corpus.stopwords.words('english'))
frequent_words = {'russian', 'force', 'forces', 'ukrainian', 'ukraine', 'oblast' 'ukraine', 'military', 'reported', 'effort', 'likely',
                  'claimed', 'russia', 'area', 'operation', 'continued', 'city', 'general', 'near', 'attack',
                  'official', 'staff', 'also', 'stated', 'source', 'oblast', 'pm', 'am'}
month_names = {'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'}
# avoid_stop_words = {"not", "n't", "no"}
# stop_words = stop_words - avoid_stop_words
stop_words = stop_words.union(frequent_words)
stop_words = stop_words.union(month_names)

def remove_stop_words(tokens):
  return [i for i in tokens if i not in stop_words]

def do_stemming(tokens):
  ps = nltk.PorterStemmer()
  return [ps.stem(word) for word in tokens]

def do_lemmatization(tokens):
  wn = nltk.WordNetLemmatizer()
  return [wn.lemmatize(word) for word in tokens]

def remove_numeric_words(text):
  return re.sub(r'\S*\d+\S*', '', text)

def convert_nums_to_words(data):
  tokens = data
  new_text = []
  for word in tokens:
    if word.isdigit():
      if int(word)<1000000000:
        word = num2words(word)
      else: 
        word = ""
    new_text.extend(tokenize_text(re.sub("(-|,\s?)|\s+", " ", word)))
  return new_text

def do_preprocessing(data):
  text_clean = data
  text_clean = remove_urls(text_clean)
  text_clean = remove_punctuation(text_clean)
  text_clean = remove_long_dash(text_clean)
  text_clean = to_lower_case(text_clean)
  text_clean = remove_numeric_words(text_clean)
  words = tokenize_text(text_clean)
  words = remove_one_letter_words(words)
  words = remove_stop_words(words)
  lemmatized = do_lemmatization(words)
  res = convert_nums_to_words(lemmatized)
  return res

### Zip opening

In [37]:
# use env config
from dotenv import load_dotenv
load_dotenv("../.env")

# specifying the input folder
folder_name = "../" + os.getenv("ISW_SCRAPPING_FODLER")

df = pd.DataFrame(columns=["Name", "Date", "Text"])

df_list = []

print("Reading folder contents")
for root, dirs, files in os.walk(folder_name):
    for filename in files:
        if filename.endswith('.txt'):
            with open(os.path.join(root, filename), encoding='utf-8') as file:
                name = filename.split('.')[0]
                date = filename.replace("assessment-", "")
                text = file.read()
                row_df = pd.DataFrame({"Name": [name], "Date": [date], "Text": [text]})
                df_list.append(row_df)
df = pd.concat(df_list, ignore_index=True)
print("Successfully read the input data")

Reading folder contents
Successfully read the input data


### TF-IDF creation

In [38]:
print("Find tokens")          
df["Tokens"] = df["Text"].apply(lambda d: " ".join(do_preprocessing(d)))

# To be uncommented if you want to see the most common words
#
# print("Find most common words")
#
# all_words = []
# for tokens in df["Tokens"]:
#   for word in tokens.split(" "):
#     all_words.append(word)
# all_words = nltk.FreqDist(all_words)
# print("Top 30 frequenty used words: ")
# print(all_words.most_common(30))


filenames = df["Name"]
dates = df["Date"]

print("Create vectors")
tfidf = TfidfVectorizer(smooth_idf=True,use_idf=True)
vectors = tfidf.fit_transform(df["Tokens"])

# store content
with open("results/tfidf.pkl", "wb") as handle:
  pickle.dump(tfidf, handle)

feature_names = tfidf.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
dictionaries = df.to_dict(orient='records')

print("Into result")
res = __builtins__.zip(filenames, dates, dictionaries)
res_df = pd.DataFrame(res, columns=["Name","Date","Keywords"])
res_df["Keywords"] = res_df["Keywords"].apply(lambda d: {k: v for k, v in d.items() if v > 0})
res_df["Keywords"] = res_df["Keywords"].apply(lambda d: dict(sorted(d.items(), key=lambda item: item[1], reverse=True)))

res_df

Find tokens
Create vectors
Into result


Unnamed: 0,Name,Date,Keywords
0,assessment-2022-02-24,2022-02-24.txt,"{'airport': 0.2989225615551494, 'kyiv': 0.2142..."
1,assessment-2022-02-25,2022-02-25.txt,"{'kyiv': 0.3623260669568973, 'local': 0.189735..."
2,assessment-2022-02-26,2022-02-26.txt,"{'kyiv': 0.4540653793502749, 'zaprozhia': 0.15..."
3,assessment-2022-02-28,2022-02-28.txt,"{'kyiv': 0.3075720489357905, 'asset': 0.176497..."
4,assessment-2022-03-01,2022-03-01.txt,"{'kyiv': 0.374212983451302, 'chernihiv': 0.233..."
...,...,...,...
394,assessment-2023-03-29,2023-03-29.txt,"{'moskalev': 0.20967599808875628, 'wagner': 0...."
395,assessment-2023-03-30,2023-03-30.txt,"{'csto': 0.23584684478345108, 'khodakovsky': 0..."
396,assessment-2023-03-31,2023-03-31.txt,"{'lukashenko': 0.2557744590950341, 'antiwester..."
397,assessment-2023-04-01,2023-04-01.txt,"{'conscription': 0.22151586733175985, 'offensi..."
