## Preprocessing

In [2]:
import tarfile
import csv
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from tqdm import tqdm
import json

# Download necessary NLTK resources
nltk.download('stopwords')

# Define helper functions
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove non-alphanumeric characters (keeping ASCII letters and spaces)
    text = re.sub(r"[^a-z\s]", " ", text)
    # Tokenize text
    tokens = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Apply stemming
    stemmer = SnowballStemmer("english")
    tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(tokens)

# File paths
tar_path = "/home/elahe/Desktop/UniPisa/IR/TinTinfy/collection.tar.gz"  # Update with your file path
output_file = "preprocessed_data.json"

# Extract the tar.gz file
with tarfile.open(tar_path, "r:gz") as tar:
    tar.extractall()

# Process the extracted file
input_file = "collection.tsv"  # The extracted file name
preprocessed_data = []

with open(input_file, "r", encoding="utf-8") as file:
    reader = csv.reader(file, delimiter="\t")
    for line in tqdm(reader, desc="Processing Documents"):
        try:
            # Ensure line has both pid and text
            if len(line) != 2:
                continue
            pid, text = line
            # Preprocess the text
            cleaned_text = preprocess_text(text)
            # Skip empty or too short documents
            if len(cleaned_text.split()) < 5:
                continue
            # Append to result
            preprocessed_data.append({"pid": pid, "text": cleaned_text})
        except Exception as e:
            # Log and skip malformed lines
            print(f"Error processing line: {line} - {e}")

# Save preprocessed data
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(preprocessed_data, outfile, ensure_ascii=False, indent=4)

print(f"Preprocessed data saved to {output_file}")


[nltk_data] Downloading package stopwords to /home/elahe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Processing Documents: 8841823it [1:10:51, 2079.65it/s]


Preprocessed data saved to preprocessed_data.json
