# Pre-processing

In this notebook, we will pre-process the data to prepare it for the model. This will greatly reduce the amount of time needed to parse the collection for the IR systems.

---

## Import Required Libraries

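Import the libraries used throughout the notebook: the project's text processors, the standard-library tools for XML, zip archives, regular expressions and paths, and `tqdm` for progress bars. pandas and matplotlib are only needed by the analysis section at the end of the notebook.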


In [1]:
import os
import re
import xml.etree.ElementTree as ET
import zipfile

import pandas as pd
from tqdm import tqdm

from textprocessor import ReferenceRearrangedTextProcessor, ReferenceTextProcessor
from textprocessor import CustomTextProcessorNoStopNoStem
from textprocessor import CustomTextProcessorNoStem
from textprocessor import CustomTextProcessorNoStop
from textprocessor import CustomTextProcessor

In [3]:
# Text processor used by every pre-processing step in this notebook.
processor = CustomTextProcessorNoStopNoStem()

# XML-Coll-withSem
collection_name = "XML-Coll-withSem"

# The output folder name combines the collection and the processor name, so
# runs with different processors are written to different directories.
processed_subdir = collection_name + "_" + processor.get_text_processor_name()
processed_dir = "../lib/processed_data/" + processed_subdir + "/"

# Create the output folder up front; an already existing folder is fine.
os.makedirs(processed_dir, exist_ok=True)

In [4]:
# Archive holding the raw XML files of the selected collection.
zip_file_path = f"../lib/data/practice_05/{collection_name}.zip"

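We define a function that walks every element of an XML document and runs the text found inside each tag (and the text that follows each closing tag) through our `TextProcessor` class. The pre-processed text is written back into the same elements, so the XML structure is preserved when the file is recreated.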


In [5]:
def process_text_in_tags(element, text_processor=None):
    """Pre-process, in place, the text and tail of ``element`` and all its descendants.

    Every non-empty ``.text`` and ``.tail`` is passed to ``pre_processing`` and
    replaced by the returned tokens joined with single spaces. The tree
    structure (tags, attributes, nesting) is left untouched.

    Args:
        element: Root ``xml.etree.ElementTree.Element`` of the subtree to process.
        text_processor: Object exposing ``pre_processing(str)`` that returns an
            iterable of string tokens. When omitted, the notebook-level
            ``processor`` is used, so existing one-argument calls behave as before.

    Returns:
        None. The elements are modified in place.
    """
    # Resolve the processor lazily so the global is only required when no
    # explicit processor is supplied.
    active_processor = processor if text_processor is None else text_processor

    # Element.iter() yields the element itself followed by its descendants in
    # document (pre-)order, the same order the recursive version used, without
    # being limited by Python's recursion depth on deeply nested documents.
    for node in element.iter():
        if node.text:
            node.text = " ".join(active_processor.pre_processing(node.text))

        if node.tail:  # text located after the node's closing tag
            node.tail = " ".join(active_processor.pre_processing(node.tail))

In [6]:
# Compile the clean-up patterns once instead of on every file.
bold_tag_pattern = re.compile(r"<\/?b>")
italic_tag_pattern = re.compile(r"<\/?it>")
# Entity references such as "&nbsp;" are replaced by a space before parsing,
# since ElementTree rejects entities it does not know.
entity_pattern = re.compile("&[^;]+;")

with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    # namelist() can also contain directory entries (names ending in "/");
    # they hold no XML and would fail to parse, so they are skipped.
    xml_file_name = [name for name in zip_ref.namelist() if not name.endswith("/")]

    for file in tqdm(xml_file_name, desc="Processing files"):
        with zip_ref.open(file) as xml_file:
            xml_content = xml_file.read().decode("utf-8")

        # Drop the inline <b>/<it> tags (opening and closing) so their text
        # becomes part of the surrounding element.
        xml_content = bold_tag_pattern.sub("", xml_content)
        xml_content = italic_tag_pattern.sub("", xml_content)

        root = ET.fromstring(entity_pattern.sub(" ", xml_content))
        process_text_in_tags(root)

        # Convert the processed XML content to a string
        processed_xml = ET.tostring(root, encoding="unicode")

        # Derive the output name from the archive member's base name. Unlike
        # splitting on "/" and ".", this works for members at any nesting depth
        # (including the archive root) and keeps dots inside the document name.
        doc_name = os.path.splitext(os.path.basename(file))[0]
        processed_xml_file_path = f"{processed_dir}{doc_name}.xml"

        # Save the processed XML content to a new XML file
        with open(processed_xml_file_path, "w", encoding="utf-8") as processed_file:
            processed_file.write(processed_xml)

Processing files: 100%|██████████| 9804/9804 [03:06<00:00, 52.53it/s]


# A small analysis

We want to check what the differences are between the new pre-processing and the old one.


In [None]:
def get_text_content(element):
    """Return all text contained in ``element``, concatenated in document order.

    The result is made of the element's own text, followed for each child (in
    order) by the child's full text content and then the child's tail. The tail
    of ``element`` itself is not included, because it lies outside the element.

    The previous implementation skipped the element's own text and emitted a
    child's descendants *before* the child's own text, which scrambled the
    word order for nested markup.

    Args:
        element: An ``xml.etree.ElementTree.Element``.

    Returns:
        str: The concatenated text, or an empty string when there is none.
    """
    # itertext() walks the subtree in document order and yields exactly the
    # text/tail pieces described above, so no manual recursion is needed.
    return "".join(element.itertext())

In [None]:
# Make sure the folder receiving the reference text files exists; without it
# the writes below fail with FileNotFoundError on a fresh checkout.
os.makedirs("./original", exist_ok=True)

# Read the raw document once; both reference variants below start from it.
with open("../lib/data/practice_05/medium/54772.xml", "rb") as file:
    raw_xml = file.read().decode("utf-8")

# Variant 1: parse the XML, join every text fragment with a space, then
# pre-process the result.
root = ET.fromstring(re.sub("&[^;]+;", " ", raw_xml))
content = " ".join(root.itertext())
# content = get_text_content(root)
content = processor.pre_processing(content)
with open("./original/54772-2.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(content))

# Text of the same document as stored by the earlier pre-processing run;
# `xml_content` is what the following cell compares against.
with open("../lib/processed_data/medium_stop671_porter/54772.xml", "rb") as file:
    xml_content = file.read().decode("utf-8")
root = ET.fromstring(xml_content)
xml_content = " ".join(root.itertext())

# Variant 2: strip the markup with regular expressions instead of parsing.
# remove all the xml tags
content = re.sub("<[^>]*>", "", raw_xml)
content = re.sub("&[^;]+;", " ", content)
text = processor.pre_processing(content)

# Write with an explicit UTF-8 encoding, like the first variant, so the result
# does not depend on the platform's default encoding.
with open("./original/54772.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(text))

In [None]:
# Compare the reference text written to ./original/54772.txt with the text
# extracted from the previously processed XML file (`xml_content`).
# NOTE(review): `diff` is neither defined nor imported in the visible cells, so
# this fails on a fresh kernel — confirm where it comes from. It presumably
# returns a DataFrame, since a later cell concatenates its results with pandas.
diff("./original/54772.txt", xml_content)

In [None]:
# Read every processed XML file and compare it with its reference text file.
# NOTE(review): `diff` is not defined in the visible cells — confirm its origin.
# The per-document frames are collected in a list and concatenated once;
# growing a DataFrame with pd.concat inside the loop copies all rows each time.
stats_frames = []

with zipfile.ZipFile("../lib/processed_data/medium_stop671_porter.zip", "r") as zip_ref:
    # Directory entries (names ending in "/") contain no XML and are skipped.
    xml_file_name = [name for name in zip_ref.namelist() if not name.endswith("/")]

    for file in tqdm(xml_file_name, desc="Processing files"):
        with zip_ref.open(file) as xml_file:
            xml_content = xml_file.read().decode("utf-8")
        root = ET.fromstring(xml_content)
        xml_content = " ".join(root.itertext())

        # Extract 'docno' from the member's base name; this works at any
        # nesting depth and keeps dots that are part of the document name.
        docno = os.path.splitext(os.path.basename(file))[0]

        # Generate differences and tag them with the document they belong to
        df = diff(f"./original/{docno}.txt", xml_content)
        df["docno"] = docno
        stats_frames.append(df)

if stats_frames:
    all_stats_df = pd.concat(stats_frames, ignore_index=True)
else:
    # Empty archive: keep the columns the following cells rely on so they still run.
    all_stats_df = pd.DataFrame(columns=["docno", "Operation", "File 1", "File 2"])

# Reorder columns with 'docno' as the first column
all_stats_df = all_stats_df[
    ["docno"] + [col for col in all_stats_df.columns if col != "docno"]
]

In [None]:
# Keep only the rows whose "Operation" value is not the empty string, then
# preview the first rows of the result.
operation_is_set = all_stats_df["Operation"] != ""
all_stats_df = all_stats_df[operation_is_set]
all_stats_df.head()

In [None]:
# Search the changes recorded for document 153299 whose "File 1" side is "px".
# This must be a comparison: the former `all_stats_df["File 1"] = "px"` was an
# assignment that overwrote the whole column, which corrupted the word counts
# computed from "File 1" in the following cells.
# NOTE(review): exact match on "px" is assumed to be the intended search — confirm.
all_stats_df[(all_stats_df["docno"] == "153299") & (all_stats_df["File 1"] == "px")]

In [None]:
from collections import Counter

# One counter per kind of change found in the comparison table.
inserted_words_counter = Counter()
deleted_words_counter = Counter()
replaced_words_counter = Counter()

# Walk the table row by row and tally the words involved in each change.
for record in all_stats_df.to_dict("records"):
    file1_cell = record["File 1"]
    file2_cell = record["File 2"]

    # Cells that are not strings (e.g. missing values) contribute no words.
    file1_words = file1_cell.split() if isinstance(file1_cell, str) else []
    file2_words = file2_cell.split() if isinstance(file2_cell, str) else []

    operation = record["Operation"]
    if operation == "Inserted":
        # Inserted words only exist on the "File 2" side.
        inserted_words_counter.update(file2_words)
    elif operation == "Deleted":
        # Deleted words only exist on the "File 1" side.
        deleted_words_counter.update(file1_words)
    elif operation == "Replaced":
        # Both the old and the new words count as replaced.
        replaced_words_counter.update(file1_words + file2_words)

# Report the ten most frequent words of each kind.
for heading, counter in (
    ("Most common words deleted:", deleted_words_counter),
    ("\nMost common words inserted:", inserted_words_counter),
    ("\nMost common words replaced:", replaced_words_counter),
):
    print(heading)
    print(counter.most_common(10))

In [None]:
# Total number of word occurrences (not distinct words) per kind of change.
total_words_removed = sum(deleted_words_counter.values())
total_words_inserted = sum(inserted_words_counter.values())

print(f"Total words removed: {total_words_removed}")
print(f"Total words inserted: {total_words_inserted}")

In [None]:
# (word, count) pairs of the removed words, most frequent first.
# Counter.most_common() without an argument returns every entry sorted by
# descending count, keeping first-seen order among ties.
removed_words_freq = deleted_words_counter.most_common()
print(f"Frequency of all removed words: {removed_words_freq}")

# Adding up the individual counts gives the total number of removed occurrences.
removed_words_freq_sum = sum(freq for _, freq in removed_words_freq)
print(f"Sum of the frequency of the removed words: {removed_words_freq_sum}")

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the 20 most frequently removed words.
# `removed_words_freq` is a list of (word, count) pairs sorted by descending count.
if removed_words_freq:
    # Split the pairs into parallel tuples of words and counts.
    removed_words, removed_counts = zip(*removed_words_freq)

    # Plotting the frequency of removed words
    plt.figure(figsize=(12, 6))
    plt.bar(removed_words[:20], removed_counts[:20], color="red")
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.title("Frequency of Removed Words (Top 20)")
    plt.xticks(rotation=90)  # vertical labels keep long words readable
    plt.tight_layout()
    plt.show()
else:
    # zip(*[]) yields nothing to unpack and would raise ValueError; report the
    # empty case instead of crashing.
    removed_words, removed_counts = (), ()
    print("No removed words to plot.")