In [1]:
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
import os
# Library for plotting data
import matplotlib.pyplot as plt

In [2]:
# Set corpus to the folder of files you want to use
corpus = '/home/ec2-user/SageMaker/data/2022-10-30-NYTALL20112018/'

# Read in files. os.listdir() returns names in arbitrary order, so sort them to
# make the row order of everything built downstream reproducible across runs.
# Sub-directories (e.g. Jupyter's .ipynb_checkpoints) are skipped because they
# cannot be parsed as XML documents.
input_files = sorted(
    name for name in os.listdir(corpus)
    if os.path.isfile(os.path.join(corpus, name))
)

In [3]:
def strip_html_tags(text):
    """Return ``text`` with HTML markup removed and whitespace tidied.

    The markup is parsed with BeautifulSoup and reduced to its text content.
    Newlines are then replaced by single spaces, backslashes are removed, and
    leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        HTML (or plain-text) fragment to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Name the parser explicitly: without it BeautifulSoup emits a
    # GuessedAtParserWarning and picks whichever parser happens to be
    # installed, so the same input can produce different text on another
    # machine. lxml is already a dependency of this notebook.
    soup = BeautifulSoup(text, 'lxml')
    return soup.get_text().replace('\n', ' ').replace('\\', '').strip()

In [4]:
def _element_text(root, tag):
    """Return the text of the first ``tag`` descendant of ``root``, or None if absent."""
    node = root.find(f'.//{tag}')
    return node.text if node is not None else None


def getxmlcontent(corpus, file, strip_html=True):
    """Extract selected fields from one XML document.

    Parameters
    ----------
    corpus : str
        Directory that contains ``file`` (with or without a trailing separator).
    file : str
        Name of the XML file inside ``corpus``.
    strip_html : bool, default True
        When true, the extracted text is passed through ``strip_html_tags``.

    Returns
    -------
    tuple
        ``(goid, title, date, publisher, text)``. A field is ``None`` when its
        element is missing. ``text`` comes from the first of ``FullText``,
        ``HiddenText`` or ``Text`` that is present in the document.

    Notes
    -----
    Errors are reported on stdout instead of being raised, so one unreadable
    file does not abort a loop over the whole corpus. Fields that were not
    reached before the error stay ``None``.
    """
    # Bind every field up front so the return statement is valid even when
    # parsing fails before any of them has been assigned.
    goid = title = date = publisher = text = None

    try:
        root = etree.parse(os.path.join(corpus, file)).getroot()

        goid = _element_text(root, 'GOID')
        title = _element_text(root, 'Title')
        date = _element_text(root, 'NumericDate')
        publisher = _element_text(root, 'PublisherName')

        # The first element that exists wins, even if its own text is empty.
        for tag in ('FullText', 'HiddenText', 'Text'):
            node = root.find(f'.//{tag}')
            if node is not None:
                text = node.text
                break

        # Strip html from text portion
        if text is not None and strip_html:
            text = strip_html_tags(text)

    except Exception as e:
        print(f"Error while parsing file {file}: {e}")

    return goid, title, date, publisher, text

In [5]:
# One list per DataFrame column; filled in parallel, one entry per input file
goid_list, publisher_list, text_list, date_list = [], [], [], []

# Used for grouping by publisher (not referenced in the cells visible here)
publishers = []

for file in input_files:
    # title is returned by the parser but is not kept as a column
    goid, title, date, publisher, text = getxmlcontent(corpus, file, strip_html=True)

    for column, value in (
        (goid_list, goid),
        (publisher_list, publisher),
        (text_list, text),
        (date_list, date),
    ):
        column.append(value)

In [6]:
# Assemble the per-file lists into one table, one row per parsed document
df = pd.DataFrame(
    dict(
        GOID=goid_list,
        Publisher=publisher_list,
        Text=text_list,
        Date=date_list,
    )
)

In [7]:
# Keep only the two columns used downstream: the article text and its date
new_df = df.loc[:, ['Text', 'Date']]

In [8]:
# Discard rows where either the text or the date is missing
new_df = new_df.dropna(axis=0, how='any')

# x: documents fed to the vectorizer; x1: their dates (same row order and index)
x, x1 = new_df['Text'], new_df['Date']

In [9]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: binary=True records presence/absence of a term
# rather than its count; English stop words are excluded from the vocabulary.
transformer = CountVectorizer(
    strip_accents='ascii',
    binary=True,
    lowercase=True,
    stop_words='english',
)

In [10]:
# Learn the vocabulary from the documents and build the sparse
# document-term matrix in a single pass
transform_ma_sparse = transformer.fit_transform(raw_documents=x)

In [11]:
# Wrap the sparse document-term matrix in a DataFrame with one column per
# vocabulary term. CountVectorizer.get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); prefer the new
# method and fall back to the old one only on releases that predate it.
_feature_names = (
    getattr(transformer, 'get_feature_names_out', None)
    or transformer.get_feature_names
)
transform_df = pd.DataFrame.sparse.from_spmatrix(
    transform_ma_sparse,
    columns=_feature_names(),
)

In [12]:
# Use each document's date as the row index.
# x1 still carries the row labels it had before dropna(), while transform_df
# has a fresh 0..n-1 index. Passing the Series itself would align on those
# labels and attach dates to the wrong rows (or NaN) whenever a row was
# dropped. Taking the raw values attaches the dates by position instead.
# Replacing the index directly also keeps this cell safe to re-run, unlike
# insert(), which raises if the 'Date' column already exists.
transform_df = transform_df.set_index(pd.Index(x1.to_numpy(), name='Date'))

In [16]:
# Keep only vocabulary columns whose name starts with "economi" or "economy",
# then move the Date index back into an ordinary column
transform_df1 = (
    transform_df
    .filter(regex=r'^econom[iy]', axis='columns')
    .reset_index()
)

In [17]:
# Persist the filtered term table as CSV (the row index is written as the first column)
output_csv_path = '/home/ec2-user/SageMaker/data/test.csv'
transform_df1.to_csv(output_csv_path)

In [19]:
# Preview the first three rows of the filtered table
transform_df1.head(n=3)

Unnamed: 0,Date,economi,economia,economic,economica,economicac,economical,economically,economicand,economicas,...,economistshavelongadvocatedthisasawaytoensure,economix,economize,economized,economizers,economizes,economizing,economy,economymartino,economywide
0,2017-02-05,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2015-04-11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2015-05-20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
