Import dataset

In [1]:
def import_dataset(filepath):

    """
    Parses the Cranfield dataset from the given file.

    This function reads the file, identifies the documents based on the '.I' marker,
    and extracts text from the '.T' (Title) and '.W' (Words) fields.
    
    Returns:
         A list of strings, where each string is the raw, unprocessed text of a document.
    """

    # Initialize an empty list to hold the raw text of all documents.
    documents_raw = []

    # A variable to hold the text of the current document being processed.
    current_text = ""
    
    # A boolean flag to track if the current line is part of a text field (.T or .W).
    is_text_section = False

    with open(filepath, 'r') as f:
        # Iterate through each line in the file.
        for line in f:
            # A line starting with '.I' marks the beginning of a new document.
            if line.startswith('.I'):
                # If 'current_text' is not empty, it means we have finished reading a document.
                if current_text:
                    # Append the complete text of the previous document to our list.
                    documents_raw.append(current_text.strip())
                
                # Reset 'current_text' to start for the new document.
                current_text = ""
                # Reset the flag, as we don't know what the next section will be.
                is_text_section = False

            # If a line starts with '.T' or '.W', it's a section we want to capture.
            elif line.startswith(('.T', '.W')):
                # Set our flag to True to start accumulating text from this and subsequent lines.
                is_text_section = True
            
            # If a line starts with '.A' or '.B', it's metadata we want to ignore.
            elif line.startswith(('.A', '.B')):
                # Set our flag to False to stop accumulating text until we see a new .T or .W.
                is_text_section = False
                
            # If the line doesn't start with a marker AND our flag is True...
            elif is_text_section:
                # ...it's a continuation of a title or abstract, so append it.
                # We add a space to ensure words from different lines are not merged together.
                current_text += line.strip() + " "

    # After the loop finishes, the last document's text is still held in 'current_text'.
    # This final check ensures the very last document in the file is added to the list.
    if current_text:
        documents_raw.append(current_text.strip())

    # A confirmation message for loading
    print(f"Successfully loaded {len(documents_raw)} raw documents.")
    
    # Return the final list
    return documents_raw

In [9]:
articles = import_dataset('./Dataset/cran.all.1400')

Successfully loaded 1398 raw documents.


Preprocessing

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# You only need to run these lines once to download the necessary NLTK packages.
# try:
#     stopwords.words('english')
# except LookupError:
#     nltk.download('stopwords')
# try:
#     nltk.data.find('tokenizers/punkt')
# except LookupError:
#     nltk.download('punkt')
# try:
#     nltk.data.find('corpora/wordnet')
# except LookupError:
#     nltk.download('wordnet')
# -----------------------------------

In [11]:

def preprocess_text(raw_docs, method='lemmatize'):
    """
    Takes a list of raw document strings and applies all preprocessing steps.

    Args:
        raw_docs (list of str): The list of unprocessed document texts.
        method (str): The word reduction method to use. Can be 'lemmatize' (default)
                      or 'stem'.

    Returns:
        A list of lists, where each inner list contains the processed tokens
        of a single document.
    """
    # Initialize lists and objects for preprocessing.
    processed_docs = []
    
    # 1. TOKENIZATION AND NORMALIZATION (LOWERCASE, PUNCTUATION REMOVAL)
    # The tokenizer will split the document text into a list of words.
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    
    # 2. STOP WORD REMOVAL
    # Load the set of English stop words. Using a set provides fast lookups.
    stop_words = set(stopwords.words('english'))
    
    # 3. STEMMING / LEMMATIZATION
    # Initialize the chosen processor.
    if method == 'lemmatize':
        processor = WordNetLemmatizer()
        process_func = processor.lemmatize
    elif method == 'stem':
        processor = PorterStemmer()
        process_func = processor.stem
    else:
        raise ValueError("Method must be 'lemmatize' or 'stem'")

    # Process each document in the input list.
    for doc in raw_docs:
        # Lowercase the document text.
        doc = doc.lower()
        
        # Use the tokenizer to get a list of alphabetic tokens.
        tokens = tokenizer.tokenize(doc)
        
        # Filter out stop words from the token list.
        filtered_tokens = [token for token in tokens if token not in stop_words]
        
        # Apply the chosen processing (lemmatization or stemming) to each token.
        processed_tokens = [process_func(token) for token in filtered_tokens]
        
        # Add the final list of processed tokens to our main list.
        processed_docs.append(processed_tokens)
        
    print(f"Finished preprocessing all documents using the '{method}' method.")
    return processed_docs


In [13]:
processed_articles = preprocess_text(articles)
        
# --- Display a sample to verify the process ---
print("\n--- Verification Sample (Document #2) ---")
        
# Print the raw text of the second document (index 1)
print("\n[Raw Text]:")
print(articles[1])
        
# Print the same document after preprocessing
print("\n[Processed Tokens]:")
print(processed_articles[1])

Finished preprocessing all documents using the 'lemmatize' method.

--- Verification Sample (Document #2) ---

[Raw Text]:
simple shear flow past a flat plate in an incompressible fluid of small viscosity . simple shear flow past a flat plate in an incompressible fluid of small viscosity . in the study of high-speed viscous flow past a two-dimensional body it is usually necessary to consider a curved shock wave emitting from the nose or leading edge of the body .  consequently, there exists an inviscid rotational flow region between the shock wave and the boundary layer .  such a situation arises, for instance, in the study of the hypersonic viscous flow past a flat plate .  the situation is somewhat different from prandtl's classical boundary-layer problem . in prandtl's original problem the inviscid free stream outside the boundary layer is irrotational while in a hypersonic boundary-layer problem the inviscid free stream must be considered as rotational .  the possible effects of vo

Positional Index

In [14]:
from collections import defaultdict

def make_positional_index(processed_docs):
    """
    Builds a positional inverted index from the processed documents.

    The index is a dictionary where keys are terms. The value for each term
    is another dictionary, where keys are document IDs and values are lists
    of the positions where the term appears in that document.
    """
    print("\n--- Starting Indexing ---")
    
    # We use a defaultdict of dicts to easily create nested structures.
    # If a term is new, it will automatically be assigned an empty dictionary.
    p_index = defaultdict(dict)

    # Enumerate over the processed documents to get both the document ID (docid)
    # and the list of tokens for that document. The index acts as the docID.
    for docid, tokens in enumerate(processed_docs):
        # Enumerate over the tokens in the current document to get both the
        # position (pos) and the term itself.
        for pos, term in enumerate(tokens):
            # Check if the docid is already a key for this term.
            if docid in p_index[term]:
                # Append the new position to the existing list.
                p_index[term][docid].append(pos)
            else:
                # If the docid is not a key, this is the first time the term
                # Create a new list containing the current position.
                p_index[term][docid] = [pos]

    print(f"Finished indexing. The vocabulary contains {len(p_index)} unique terms.")
    return p_index

In [17]:
p_index = make_positional_index(processed_articles)

# --- Display a sample of the positional index ---

for i, (term, postings) in enumerate(p_index.items()):
    if i >= 2:  # Limit to first 2 terms for brevity
        break
    print(f"Term: '{term}'")
    for docid, positions in postings.items():
        print(f"  DocID: {docid}, Positions: {positions}")



--- Starting Indexing ---
Finished indexing. The vocabulary contains 6539 unique terms.
Term: 'experimental'
  DocID: 0, Positions: [0, 5, 10]
  DocID: 10, Positions: [55]
  DocID: 11, Positions: [34]
  DocID: 16, Positions: [36]
  DocID: 18, Positions: [34]
  DocID: 24, Positions: [92]
  DocID: 28, Positions: [141]
  DocID: 29, Positions: [29]
  DocID: 34, Positions: [67]
  DocID: 40, Positions: [34]
  DocID: 41, Positions: [118]
  DocID: 46, Positions: [121]
  DocID: 51, Positions: [10, 22]
  DocID: 52, Positions: [14]
  DocID: 57, Positions: [69]
  DocID: 68, Positions: [81]
  DocID: 69, Positions: [21]
  DocID: 73, Positions: [0, 8]
  DocID: 77, Positions: [84]
  DocID: 83, Positions: [0, 8, 19]
  DocID: 98, Positions: [96, 133]
  DocID: 100, Positions: [196]
  DocID: 102, Positions: [63]
  DocID: 111, Positions: [87]
  DocID: 114, Positions: [98]
  DocID: 120, Positions: [85]
  DocID: 122, Positions: [25, 51]
  DocID: 136, Positions: [11]
  DocID: 139, Positions: [107]
  DocID: 1