In [6]:
def import_dataset(filepath):

    """
    Parses the Cranfield dataset from the given file.

    This function reads the file, identifies the documents based on the '.I' marker,
    and extracts text from the '.T' (Title) and '.W' (Words) fields.
    
    Returns:
         A list of strings, where each string is the raw, unprocessed text of a document.
    """

    # Initialize an empty list to hold the raw text of all documents.
    documents_raw = []

    # A variable to hold the text of the current document being processed.
    current_text = ""
    
    # A boolean flag to track if the current line is part of a text field (.T or .W).
    is_text_section = False

    with open(filepath, 'r') as f:
        # Iterate through each line in the file.
        for line in f:
            # A line starting with '.I' marks the beginning of a new document.
            if line.startswith('.I'):
                # If 'current_text' is not empty, it means we have finished reading a document.
                if current_text:
                    # Append the complete text of the previous document to our list.
                    documents_raw.append(current_text.strip())
                
                # Reset 'current_text' to start for the new document.
                current_text = ""
                # Reset the flag, as we don't know what the next section will be.
                is_text_section = False

            # If a line starts with '.T' or '.W', it's a section we want to capture.
            elif line.startswith(('.T', '.W')):
                # Set our flag to True to start accumulating text from this and subsequent lines.
                is_text_section = True
            
            # If a line starts with '.A' or '.B', it's metadata we want to ignore.
            elif line.startswith(('.A', '.B')):
                # Set our flag to False to stop accumulating text until we see a new .T or .W.
                is_text_section = False
                
            # If the line doesn't start with a marker AND our flag is True...
            elif is_text_section:
                # ...it's a continuation of a title or abstract, so append it.
                # We add a space to ensure words from different lines are not merged together.
                current_text += line.strip() + " "

    # After the loop finishes, the last document's text is still held in 'current_text'.
    # This final check ensures the very last document in the file is added to the list.
    if current_text:
        documents_raw.append(current_text.strip())

    # A confirmation message for loading
    print(f"Successfully loaded {len(documents_raw)} raw documents.")
    
    # Return the final list
    return documents_raw

In [7]:
articles = import_dataset('./Dataset/cran.all.1400')

Successfully loaded 1398 raw documents.
