# ISW preprocessing

### Import and download all dependencies

In [None]:
import pandas as pd
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer

from text_preprocessing import (do_preprocessing)

### Reading target files

In [None]:
from paths_full import *

# Load every scraped report (*.txt) found under ISW_SCRAPPING_FOLDER into a
# single DataFrame with one row per file and the columns Name, Date and Text.
records = []

print("Reading folder contents")
for root, dirs, files in os.walk(ISW_SCRAPPING_FOLDER):
    # os.walk yields entries in filesystem order, which differs between
    # machines; sorting makes the row order (and the final CSV) reproducible.
    dirs.sort()
    for filename in sorted(files):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(root, filename), encoding="utf-8") as file:
            records.append(
                {
                    # Everything before the first dot of the file name.
                    "Name": filename.split(".")[0],
                    # File name with the "assessment-" marker and the
                    # ".txt" extension stripped.
                    "Date": filename.replace("assessment-", "").replace(".txt", ""),
                    "Text": file.read(),
                }
            )

if not records:
    # pd.concat([]) used to fail here with an opaque "No objects to
    # concatenate"; keep the exception type but say what is actually wrong.
    raise ValueError(f"No .txt files found under {ISW_SCRAPPING_FOLDER!r}")

# Build the frame once instead of concatenating one single-row frame per file.
df = pd.DataFrame(records, columns=["Name", "Date", "Text"])
print("Successfully read the input data")

### TF-IDF creation

In [None]:
# Build TF-IDF keyword weights for every report in `df`:
#   1. tokenise/normalise the raw text with do_preprocessing,
#   2. drop corpus-specific high-frequency words,
#   3. fit a TfidfVectorizer and pickle it to results/tfidf.pkl,
#   4. write per-document {term: weight} dictionaries to results/tfidf.csv.
import builtins  # explicit module handle; see the note on builtins.zip below

print("Find tokens")
filteredDf = df["Text"].apply(lambda d: " ".join(do_preprocessing(d)))

# To be uncommented if you want to see the most common words
# (requires `import nltk`, which this notebook does not import by default)
# print("Find most common words")
# all_words = []
# for tokens in filteredDf:
#     for word in tokens.split(" "):
#         all_words.append(word)
# all_words = nltk.FreqDist(all_words)
# print("Top 30 frequently used words: ")
# print(all_words.most_common(30))

# Words that appear in practically every report and therefore carry no
# signal; they are removed before vectorisation.
frequent_words = {
    "russian",
    "force",
    "forces",
    "ukrainian",
    "ukraine",
    "oblast",
    "military",
    "reported",
    "effort",
    "likely",
    "claimed",
    "russia",
    "area",
    "operation",
    "continued",
    "city",
    "general",
    "near",
    "attack",
    "official",
    "staff",
    "also",
    "stated",
    "source",
    "pm",
    "am",
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
}
filteredDf = filteredDf.apply(
    lambda d: " ".join(w for w in d.split() if w not in frequent_words)
)
df["Tokens"] = filteredDf

filenames = df["Name"]
dates = df["Date"]

print("Create vectors")
tfidf = TfidfVectorizer(smooth_idf=True, use_idf=True)
vectors = tfidf.fit_transform(df["Tokens"])

# The output directory is not guaranteed to exist on a fresh checkout.
os.makedirs("results", exist_ok=True)

# store content
with open("results/tfidf.pkl", "wb") as handle:
    pickle.dump(tfidf, handle)


feature_names = tfidf.get_feature_names_out()

# Walk the sparse matrix row by row instead of densifying it: the dense
# documents x vocabulary table is almost entirely zeros that would be thrown
# away immediately afterwards. Sorting the column indices first keeps terms in
# feature order, so terms with equal weight end up in the same relative order
# a dense, column-ordered traversal would give.
vectors = vectors.tocsr()
vectors.sort_indices()
row_bounds = vectors.indptr.tolist()

dictionaries = []
# builtins.zip is used instead of `__builtins__.zip`: `__builtins__` is a
# CPython implementation detail (a dict rather than a module outside of
# __main__), while the `builtins` module is the supported way to reach the
# original zip even if the bare name has been shadowed by a star import.
for start, end in builtins.zip(row_bounds, row_bounds[1:]):
    terms = feature_names[vectors.indices[start:end]].tolist()
    # .tolist() yields plain Python floats, so the dictionaries are written
    # to the CSV as ordinary numbers rather than numpy scalar reprs.
    weights = vectors.data[start:end].tolist()
    scored = [
        (term, weight)
        for term, weight in builtins.zip(terms, weights)
        if weight > 0
    ]
    # Highest weight first; the sort is stable, so ties keep feature order.
    scored.sort(key=lambda item: item[1], reverse=True)
    dictionaries.append(dict(scored))

res = builtins.zip(filenames, dates, dictionaries)
res_df = pd.DataFrame(res, columns=["Name", "Date", "Keywords"])

res_df.to_csv("results/tfidf.csv", index=False)
print("Successfully written to .csv")