### Second Script - Random Sampling

This script takes 3 parameters: an arbitraty percentage, raw data file path and output data file path. The parameters are located at cells 2, 13 and Z. 

This script generates a folder that contains copies of articles that are selected from raw data directory, based on the chosen random arbitraty percentage. 

Example: Script returns a folder of 10 randomly selected articles in PDF format based on a raw data folder that contains 100 articles and a randomly chosen number as 0.1. 

In [27]:
# Libraries for parsing data
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import random

# Library for plotting data
import matplotlib.pyplot as plt

In [28]:
# Set corpus to the folder of files you want to use
corpus = '/home/ec2-user/SageMaker/data/2023-06-18-NEWSAMPLE/'

# Read in files
input_files = os.listdir(corpus)

In [29]:
# Function to strip html tags from text portion
def strip_html_tags(text):
    stripped = BeautifulSoup(text).get_text().replace('\n', ' ').replace('\\', '').strip()
    return stripped

In [30]:
def getxmlcontent(corpus, file, strip_html=True):
    try:
        tree = etree.parse(corpus + file)
        root = tree.getroot()

        if root.find('.//GOID') is not None:
            goid = root.find('.//GOID').text
        else:
            goid = None

        if root.find('.//Title') is not None:
            title = root.find('.//Title').text
        else:
            title = None

        if root.find('.//NumericDate') is not None:
            date = root.find('.//NumericDate').text
        else:
            date = None
            
        if root.find('.//PublisherName') is not None:
            publisher = root.find('.//PublisherName').text
        else:
            publisher = None

        if root.find('.//FullText') is not None:
            text = root.find('.//FullText').text

        elif root.find('.//HiddenText') is not None:
            text = root.find('.//HiddenText').text

        elif root.find('.//Text') is not None:
            text = root.find('.//Text').text

        else:
            text = None

        # Strip html from text portion
        if text is not None and strip_html == True:
            text = strip_html_tags(text)
    
    except Exception as e:
        print(f"Error while parsing file {file}: {e}")
    
    return goid, title, date, publisher, text

In [31]:
# Columns lists
goid_list = []
title_list = []
publisher_list = []
text_list = []
date_list = []

# Used for grouping by publisher
publishers = []

for file in input_files:
    try:
        goid, title, date, publisher, text = getxmlcontent(corpus, file, strip_html=True)
        #print(title)

        goid_list.append(goid)
        title_list.append(title)
        publisher_list.append(publisher)
        text_list.append(text)
        date_list.append(date)
    except:
        pass

Error while parsing file .ipynb_checkpoints: Document is empty, line 1, column 1 (.ipynb_checkpoints, line 1)


In [32]:
# Transform processed data into a dataframe
df = pd.DataFrame({'GOID': goid_list, 'Title': title_list, 'Publisher': publisher_list, 'Text': text_list, 'Date': date_list})

In [33]:
df

Unnamed: 0,GOID,Title,Publisher,Text,Date
0,108877475,DIARY OF A DROPOUT,New York Times Company,DIARY OF A DROPOUT BY SENATOR TIM WIRTH Ab...,1992-08-09
1,92148067,Foreign Affairs,New York Times Company,Foreign Affairs Drilling In the Cathedral ...,2001-03-02
2,91654309,North Korea Finds Softer Stances in the West,New York Times Company,North Korea Finds Softer Stances in the West...,2000-07-30
3,114340476,OLDSMOBILE AIMS AT FUEL ECONOMY,New York Times Company,OLDSMOBILE AIMS AT FUEL ECONOMY Low-Priced '...,1957-10-02
4,122154711,Turbo Called Thunderbird,New York Times Company,ABOUT CARS ABOUT CARS Marshall Schuon Tu...,1983-04-10
...,...,...,...,...,...
5541,92803105,What's New Under the Sun This Summer,New York Times Company,What's New Under the Sun This Summer A YOUNG...,2004-05-30
5542,123899090,Getting to Florida: A Choice of Ways,New York Times Company,Getting to Florida: A Choice of Ways By PAUL...,1979-12-09
5543,109919062,Auto Pollution Plan Is Drawn Into a Vortex,New York Times Company,Auto Pollution Plan Is Drawn Into a Vortex E...,1999-05-16
5544,111130823,U.S. Is Faced by Three Lawsuits To Restore Tig...,New York Times Company,U.S. Is Faced by Three Lawsuits To Restore Tig...,1985-11-19


In [34]:
# Random generated table based on the arbitraty percentage
nb_rows = len(df.index)

# You may please enter the arbitrary percentage as a number below
random_percentage = 0.001

# set seed
random.seed(123)

nb_random_articles = int(nb_rows * random_percentage)
nb_random_articles = 200
random_vector = pd.DataFrame(np.random.choice(range(nb_rows), nb_random_articles, replace=False), columns = ['random number'])

random_vector = random_vector.set_index('random number')
new_df = df.copy()
new_df = new_df[new_df.index.isin(random_vector.index)]
new_df = new_df.reset_index()
                

In [35]:
new_df

Unnamed: 0,index,GOID,Title,Publisher,Text,Date
0,24,92755506,"At the Beijing Auto Show, Signs of a Behemoth ...",New York Times Company,OVERSEAS OVERSEAS At the Beijing Auto Show...,2004-10-27
1,53,92200666,Where the Chitty Chitty Meets the Bang Bang,New York Times Company,DRIVING DI F' i V il i Where the Chitty Ch...,2002-08-02
2,96,1467505227,BEHIND THE WHEEL/2010 Subaru Legacy and Outback,New York Times Company,BEHIND THE WHEEL 2010 Subaru Legacy and Outbac...,2010-01-24
3,102,1634248838,Heavy Trucks to Be Subject to New Rules for Mi...,New York Times Company,Heavy Trucks to Be Subject to New Rules for Mi...,2011-08-10
4,169,848119572,Living the Hydrogen Life,New York Times Company,Living the Hydrogen Life By TORI TELLEM Lo...,2007-12-09
...,...,...,...,...,...,...
195,5479,122389814,"From Chrysler, a Sporty New Turbo",New York Times Company,ABOUT CARS ABOUT CARS Marshall Schuon Fr...,1984-01-01
196,5497,2463131032,Trump Is Hurting Regular People,New York Times Company,Trump Is Hurting Regular People Ralph Nader ...,2017-10-24
197,5526,120334343,SENATE VOTES BILL TO KEEP OIL CURBS,New York Times Company,SENATE VOTES BILL TO KEEP OIL CURBS Ford App...,1975-07-16
198,5535,120817519,BUSINESS Digest,New York Times Company,BUSINESS Digest Energy Leaders of seven ...,1979-06-30


In [36]:
# Export articles to html
html = []

for i in range(nb_random_articles):

    title_text = new_df.iloc[i, 2]
    text = new_df.iloc[i ,4]
    date = new_df.iloc[i, 5]
    
    # 2. Combine them together using a long f-string
    html.append(f'''
                    <html>
                        <body>
                            <h1>{title_text}</h1>
                            <h2>{date}</h2>
                            <p>{text}</p>
                        </body>
                    </html>
                    ''')
html_output = '\t'.join(html)
# 3. Write the html string as an HTML file
with open('/home/ec2-user/SageMaker/data/2023-06-18-export_newsample/2023-06-22-sample1.html', 'w') as f:
    f.write(html_output)

In [None]:
# Libraries for parsing data
import pandas as pd
from io import StringIO
from html.parser import HTMLParser
import os
import spacy
from lxml import etree
import itertools
from itertools import repeat

# Libraries for importing our sentiment analysis models
import pickle
import sklearn
from sklearn import preprocessing
from sentence_transformers import SentenceTransformer as ST

# Libraries for Multiprocessing
import multiprocessing as mp
from multiprocessing import Pool
from time import perf_counter
import numpy as np

In [None]:
# Extract the encodings that we will use
label_encoder = sklearn.preprocessing.LabelEncoder()
label_encodings=["love", "anger", "disgust", "fear", "happiness", "sadness", "surprise", "neutral", "other"]
label_encodings.sort()
label_encoder.fit(label_encodings)

# Print out the emotions to check it has been loaded successfully
emotions = label_encoder.classes_
print(emotions)

In [None]:
# Set classifier_path to location of the logistic regression classifier
classifier_path = '/home/ec2-user/SageMaker/Getting Started/2022.05.25/Resources/Models/nli-mpnet-base-v2-LR-classifier.pkl'

try: 
    # Load and store model in sentiment_model
    file = open(classifier_path, 'rb')
    sentiment_model = pickle.load(file)
    
except Exception as e:
    print(f"Error while opening file: {e}")
    
finally:
    file.close()

# Set scaler_path to location of the pre-fit scaler
scaler_path = '/home/ec2-user/SageMaker/Getting Started/2022.05.25/Resources/Models/sentimentScaler.pkl'

try: 
    # Load and store model in scaler
    file = open(scaler_path, 'rb')
    scaler = pickle.load(file)
    
except Exception as e:
    print(f"Error while opening file: {e}")
    
finally:
    file.close()

In [None]:
# Set sbert_path to location of SBERT model
sbert_path = '/home/ec2-user/SageMaker/Getting Started/2022.05.25/Resources/Models/nli-mpnet-base-v2'
transformer = ST(sbert_path)

In [None]:
# Encoding through multiprocessing
def encode_sentence(sent):
    
    # Encode chunk of sentences in parsed_sents array
    sentence_embedding = transformer.encode(sent, show_progress_bar=False)
    
    return sentence_embedding

In [None]:
# When using multiple processes, important to eventually close them to avoid memory/resource leaks
try:
    start = perf_counter()

    # Define a thread Pool to process multiple sentences simultaneously
    # Default set to num_cores, but may change number of processes depending on instance
    cores_used = num_cores - 1
    p_encode = Pool(processes=cores_used)
    
    # Apply function with Pool to array
    chunksize = int(len(parsed_sents) / cores_used)
    sentence_embeddings = p_encode.map(encode_sentence, parsed_sents, chunksize)
    
    end = perf_counter()
    
    total_minutes = (end - start) / 60
    total_seconds = (end - start) % 60

    print(f"Took {int(total_minutes)}min {total_seconds:.2f}s to encode {len(parsed_sents)} sentences.")

except Exception as e:
    print(f"Error occurred while encoding sentences: {e}")

finally:
    p_encode.close()