In [6]:
# Libraries for parsing data
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
import os

# Library for plotting data
import matplotlib.pyplot as plt

In [7]:
# Set corpus to the folder of files you want to use
corpus = '/home/ec2-user/SageMaker/data/2022-12-18NYT20052015/'

# Read in file names. Only regular files are kept (sub-directories, e.g. a
# notebook checkpoint folder, are not parseable documents), and the names are
# sorted because the directory listing order is otherwise arbitrary, which
# would make the row order of the resulting table differ between runs.
with os.scandir(corpus) as entries:
    input_files = sorted(entry.name for entry in entries if entry.is_file())

In [8]:
# Function to strip html tags from text portion
def strip_html_tags(text):
    """Return ``text`` with HTML markup removed and whitespace tidied.

    The markup is parsed with BeautifulSoup and only the text content is
    kept. Newlines are then replaced by single spaces, backslashes are
    removed, and leading/trailing whitespace is stripped.

    Args:
        text: String that may contain HTML tags.

    Returns:
        The cleaned plain-text string.
    """
    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser happens to be installed and warns on every call; 'lxml' is the
    # parser this notebook already depends on (see the lxml import).
    soup = BeautifulSoup(text, 'lxml')
    return soup.get_text().replace('\n', ' ').replace('\\', '').strip()

In [9]:
def _first_text(root, *tags):
    """Return the ``.text`` of the first element matched by any of ``tags``.

    Tags are tried in order using a descendant search (``.//tag``). The first
    tag that matches an element wins, even when that element's text is
    ``None``. ``None`` is also returned when no tag matches at all.
    """
    for tag in tags:
        node = root.find('.//' + tag)
        if node is not None:
            return node.text
    return None


def getxmlcontent(corpus, file, strip_html=True):
    """Extract the metadata fields and body text from one XML document.

    Args:
        corpus: Directory that contains ``file``. A trailing path separator
            is optional.
        file: File name of the XML document inside ``corpus``.
        strip_html: When truthy, the extracted text is passed through
            ``strip_html_tags`` before being returned.

    Returns:
        A ``(goid, title, date, publisher, text)`` tuple read from the
        ``GOID``, ``Title``, ``NumericDate`` and ``PublisherName`` elements;
        ``text`` comes from the first of ``FullText``, ``HiddenText`` or
        ``Text`` that is present. A field is ``None`` when its element is
        missing. If the file cannot be read or parsed, the error is printed
        and any field not yet extracted stays ``None``; no exception is
        raised to the caller.
    """
    # Bind every result up front so the return statement is always valid,
    # even when parsing fails before a field has been assigned.
    goid = title = date = publisher = text = None

    try:
        # os.path.join works whether or not corpus ends with a separator
        root = etree.parse(os.path.join(corpus, file)).getroot()

        goid = _first_text(root, 'GOID')
        title = _first_text(root, 'Title')
        date = _first_text(root, 'NumericDate')
        publisher = _first_text(root, 'PublisherName')

        # Body text lives under different element names; take the first present
        text = _first_text(root, 'FullText', 'HiddenText', 'Text')

        # Strip html from text portion
        if text is not None and strip_html:
            text = strip_html_tags(text)

    except Exception as e:
        # Best effort: report the problem and fall through to return what we have
        print(f"Error while parsing file {file}: {e}")

    return goid, title, date, publisher, text

In [None]:
# One accumulator per output column; each receives one entry per input file
goid_list, publisher_list, text_list, date_list = [], [], [], []

# Used for grouping by publisher
publishers = []

for file in input_files:
    goid, title, date, publisher, text = getxmlcontent(corpus, file, strip_html=True)

    # Pair each column with its value for this file (title is not stored)
    for column, value in zip(
        (goid_list, publisher_list, text_list, date_list),
        (goid, publisher, text, date),
    ):
        column.append(value)

In [None]:
# Assemble the collected columns into a single table, one row per input file
df = pd.DataFrame(
    dict(
        GOID=goid_list,
        Publisher=publisher_list,
        Text=text_list,
        Date=date_list,
    )
)

In [None]:
# Preview the first three rows as a quick sanity check
df.head(n=3)

In [None]:
# Persist the table as CSV (the DataFrame index is written as the first column)
df.to_csv(path_or_buf='/home/ec2-user/SageMaker/data/test.csv')