In [1]:
import os
import tarfile
import xml.etree.ElementTree as ET
import re
import nltk

In [2]:
# Function to turn a "year-month-day" string into tab-separated day, month, year
def format_publication_date(publication_date):
    """Reformat a dash-separated date as "day<TAB>month<TAB>year".

    Args:
        publication_date: Date string expected to hold exactly three
            '-'-separated fields, in the order year, month, day.

    Returns:
        The reformatted string, with day and month converted through int()
        (so leading zeros disappear) and the year copied verbatim. When the
        input does not split into exactly three fields, a message is printed
        and None is returned.

    Raises:
        ValueError: If the day or month field is not a valid integer.
    """
    fields = publication_date.split('-')

    # Guard clause: anything other than three fields is reported and rejected.
    if len(fields) != 3:
        print(f"Invalid publication date: {publication_date}")
        return None

    year, month, day = fields
    return f"{int(day)}\t{int(month)}\t{year}"

In [3]:
# Function to retrieve the publication date from the XML tree
def get_publication_date(tree):
    """Read the publication date from the <meta> elements of a parsed document.

    The day, month and year are taken from the 'content' attribute of the
    <meta> elements whose 'name' attribute is 'publication_day_of_month',
    'publication_month' and 'publication_year'. If a name occurs more than
    once, the last occurrence wins.

    Args:
        tree: A parsed ElementTree (anything exposing getroot()).

    Returns:
        "day<TAB>month<TAB>year" with day and month passed through int(), or
        an empty string when any of the three values is missing.

    Raises:
        ValueError: If the day or month content is not a valid integer.
    """
    meta_names = ('publication_day_of_month', 'publication_month', 'publication_year')
    found = dict.fromkeys(meta_names)  # every field starts out as None

    for meta in tree.getroot().iter('meta'):
        name = meta.get('name')
        if name in found:
            found[name] = meta.get('content')

    day, month, year = (found[name] for name in meta_names)
    if day is None or month is None or year is None:
        return ''

    return f"{int(day)}\t{int(month)}\t{year}"

In [4]:
# Helper to derive the document ID from the path of a member inside a .tgz archive
def _doc_id_from_member_name(member_name):
    """Return the document ID for a tar member path.

    The ID is the last '/'-separated path component, truncated at its first
    '.'. Taking the base name first keeps a '.' in a directory name (e.g.
    "data.v1/123.xml" or "./01/123.xml") from cutting the path short.
    """
    return member_name.split("/")[-1].split(".")[0]


# Function to create the collection TSV file and the row-ID mapping TSV file
# from the XML files stored in the .tgz archives of a folder
def create_reformatting_mapping_files(folder_path, collection_path, mapping_path):
    """Write every paragraph of every archived XML document to two TSV files.

    The folder is walked recursively. Each '.tgz' file is opened as a gzip
    tar archive and each regular member ending in '.xml' is passed to
    extract_paragraphs_from_xml. For every paragraph returned, one row is
    written to each output file, sharing a running row ID that starts at 0:

        collection file: row_id <TAB> paragraph
        mapping file:    row_id <TAB> doc_id <TAB> publication_date

    Args:
        folder_path: Directory to search for .tgz archives.
        collection_path: Path of the collection TSV file (overwritten).
        mapping_path: Path of the mapping TSV file (overwritten).

    Returns:
        None. The results are the two files written to disk.
    """
    row_id = 0

    # Write UTF-8 explicitly: the article text may contain characters that the
    # platform's default encoding cannot represent.
    with open(collection_path, 'w', encoding='utf-8') as collection_file, \
            open(mapping_path, 'w', encoding='utf-8') as mapping_file:
        for root, _, files in os.walk(folder_path):
            for file in files:
                if not file.endswith('.tgz'):
                    continue

                tgz_path = os.path.join(root, file)
                with tarfile.open(tgz_path, 'r:gz') as tar:
                    for member in tar.getmembers():
                        if not (member.isfile() and member.name.endswith('.xml')):
                            continue

                        # Close the extracted stream as soon as it is parsed.
                        with tar.extractfile(member) as xml_file:
                            paragraphs, publication_date = extract_paragraphs_from_xml(xml_file, member.name)

                        # Computed into a local so the TarInfo object is not mutated.
                        doc_id = _doc_id_from_member_name(member.name)

                        for paragraph in paragraphs:
                            collection_file.write(f"{row_id}\t{paragraph}\n")
                            mapping_file.write(f"{row_id}\t{doc_id}\t{publication_date}\n")
                            row_id += 1

In [5]:
def split_paragraph(paragraph, n):
    """Split a paragraph into overlapping chunks of `n` consecutive sentences.

    The paragraph is sentence-tokenized with nltk and a window of `n`
    sentences is slid over it one sentence at a time, so consecutive chunks
    share n-1 sentences.

    Args:
        paragraph: Text to split.
        n: Number of sentences per chunk.

    Returns:
        A list of chunks, each being `n` sentences joined by single spaces.
        A paragraph with fewer than `n` sentences is returned as one chunk
        holding all of its sentences instead of being dropped. An empty list
        is returned only when the tokenizer finds no sentences at all.

    Note:
        nltk's sentence tokenizer needs its tokenizer data to be installed
        (NOTE(review): presumably via nltk.download; confirm the resource
        name for the installed nltk version).
    """
    sentences = nltk.tokenize.sent_tokenize(paragraph)

    # With fewer than n sentences the sliding window below would yield nothing
    # and the whole paragraph would be lost, so keep it as a single chunk.
    if 0 < len(sentences) < n:
        return [" ".join(sentences)]

    return [" ".join(sentences[i:i + n]) for i in range(len(sentences) - (n - 1))]

In [6]:
# Helper to put a text fragment on a single line
def _clean_text(text):
    """Strip `text`, then replace line breaks with spaces.

    Every run of newlines becomes one space and every carriage return becomes
    one space, so the result contains no line breaks.
    """
    text = text.strip()
    text = re.sub(r'\n+', ' ', text)
    return text.replace('\r', ' ')


# Function to extract the title and the paragraphs from an XML file
def extract_paragraphs_from_xml(xml_path, file_name, max_tokens=200, sentences_per_chunk=3):
    """Extract the headline and paragraph texts of one XML document.

    The text of the first './/hedline/hl1' element, if any, becomes the first
    entry. It is followed by the leading text of every <p> element in document
    order. Each entry is stripped and has its line breaks replaced by spaces.
    Entries that are empty after cleaning are skipped. A paragraph with more
    than `max_tokens` whitespace-separated tokens is replaced by the chunks
    returned by split_paragraph; if that yields no chunks the paragraph is
    kept whole rather than lost.

    Only the text before a <p> element's first child element is read
    (Element.text); text following nested markup is not included.

    Args:
        xml_path: Path or file object accepted by ElementTree.parse.
        file_name: Name of the source file. Currently unused; kept so that
            existing callers keep working.
        max_tokens: Token count above which a paragraph is split.
        sentences_per_chunk: Number of sentences per chunk passed to
            split_paragraph.

    Returns:
        A tuple (paragraphs, publication_date): the list of cleaned text
        entries and the value returned by get_publication_date for the tree.
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()
    publication_date = get_publication_date(tree)

    paragraphs = []

    # findtext yields None when the element is absent, so no try/except is needed.
    title = root.findtext('.//hedline/hl1')
    if title:
        title = _clean_text(title)
        if title:
            paragraphs.append(title)

    for paragraph in root.iter('p'):
        if not paragraph.text:
            continue

        paragraph_text = _clean_text(paragraph.text)
        if not paragraph_text:
            # Whitespace-only paragraph: would otherwise become an empty entry.
            continue

        if len(paragraph_text.split()) > max_tokens:
            chunks = split_paragraph(paragraph_text, sentences_per_chunk)
            # Never lose a paragraph just because the splitter produced nothing.
            paragraphs.extend(chunks if chunks else [paragraph_text])
        else:
            paragraphs.append(paragraph_text)

    return paragraphs, publication_date

In [9]:
folder_path = '1234'  # Folder that is walked recursively for .tgz archives of XML files
collection_path = f'{folder_path}_collection.tsv'  # Output path of the collection TSV file (row ID, paragraph)
mapping_path = f'{folder_path}_row_id_to_doc_id_mapping.tsv'  # Output path of the mapping TSV file (row ID, document ID, publication date)

# Write both the collection TSV file and the row-ID mapping TSV file in one pass
create_reformatting_mapping_files(folder_path, collection_path, mapping_path)

# No separate mapping step is required: the call above also writes the
# row-ID-to-document-ID mapping file.