Records generated using ml.c5.18xlarge

In [2]:
import numpy as np
import pandas as pd
import scipy
import time
import ujson as json
import boto3
import s3fs
from multiprocessing import Process, Manager
from multiprocessing.pool import ThreadPool, Pool

LOW_PEACE = set(['Afghanistan', 'Congo', 'Guinea',
             'India', 'Iran', 'Kenya', 'Nigeria',
             'Sri Lanka', 'Uganda', 'Zimbabwe'])
HIGH_PEACE = set(['Austria', 'Australia', 'Belgium', 
             'Czech Republic', 'Denmark', 'Finland',
             'Netherlands', 'New Zealand', 'Norway', 'Sweden'])

# Sample file
# 'peaceClassify/data/highPeace/00399ef3b93fc5251d938d2023c7bce9.json'

## Load in both datasets

In [2]:
start_time = time.time()
hp_df = pd.read_csv('s3://compressed-data-sample/compressed1m_hp.csv', 
                          dtype={"wordCount": 'int16', "country":'category'})
end_time = time.time()
print(f'Loading Time: {end_time - start_time} seconds')
hp_df

Loading Time: 280.9251012802124 seconds


Unnamed: 0,title,content,wordCount,country
0,MONTHLY SECTOR REPORT: S&P/ASX EMERGING COMPAN...,AUSTRALIAN MONTHLY INDEX REPORT\n\nThe S&P/ASX...,2405,Australia
1,PAPRIKA CLUB RESTAURANT,THE Paprika Club Restaurant offers freshly pre...,120,Australia
2,Bone implants to be made from 3D printed salt,ABSTRACT\n\n\n\nResearchers have developed a n...,545,Australia
3,"Yes, you're entitled to a free portrait of the...",It's a truth universally acknowledged that a h...,598,Australia
4,Why is Hemsworth wearing nail polish?,If you thought nail polish was only for women ...,729,Australia
...,...,...,...,...
999995,Murder sentence cannot heal family's heartache,The man who murdered 90-year-old Springbank gr...,621,Australia
999996,Residents' satisfaction with Shellharbour coun...,Shellharbour citizens' satisfaction with their...,313,Australia
999997,Weekly: Macmahon Holdings climbs 4.9% on weak ...,AUSTRALIAN WEEKLY STOCK REPORT\n\nMacmahon Hol...,3108,Australia
999998,Stock Weekly: Valoe (VALOE:6.22c) in bottom 1%...,FINNISH WEEKLY STOCK REPORT\n\nValoe Oyj (HEL:...,17401,Finland


In [3]:
# Check if export correctly
start_time = time.time()
lp_df = pd.read_csv('s3://compressed-data-sample/compressed1m_lp.csv', 
                    dtype={"wordCount": 'int16', "country":'category'})
end_time = time.time()
print(f'Loading Time: {end_time - start_time} seconds')
lp_df

Loading Time: 150.52604365348816 seconds


Unnamed: 0,title,content,wordCount,country
0,"Atlas Cycles (Haryana) keeps sliding, down 10....",INDIAN DAILY STOCK REPORT\n\nAtlas Cycles (Har...,1555,India
1,Varun Dhawan shares quirky boomerang to wish S...,"Mumbai (Maharashtra) [India], August 12 (ANI):...",177,India
2,Traffic cops launch Road Courtesy campaign,Vadodara: After conducting an intense drive fo...,308,India
3,United States Courts Opinions: UNITED STATES D...,Washington: UNITED STATES DISTRICT COURT NORTH...,1993,India
4,OSCE Secretary General sympathizes with Iran q...,Tehran: The official news agency of Iran (IRNA...,259,Iran
...,...,...,...,...
999995,"Intel Awarded Patent for Systems, Methods and ...",FULL TEXT\n\n\n\nPublication Name: Software Pa...,241,India
999996,Morning Alert: Williamson Magor & Co keeps ris...,INDIAN INTRA-DAY STOCK REPORT\n\nWilliamson Ma...,680,India
999997,"LJUBLJANSKE MLEKARNE, mlekarska industrija, d....","Slovenia, Nov. 14 -- Slovenia based LJUBLJANSK...",68,India
999998,STEEL AUTHORITY OF INDIA LIMITED (SAIL) Provid...,"India ,April 10 -- Tender No. CCNW/PR11300076...",51,India


## Prepare preprocessing

In [4]:
import re
import requests

In [5]:
! pip install spacy
! python -m spacy download en_core_web_sm
import spacy

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 21.7 MB/s eta 0:00:01


You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
CONVERT_DICT_URL = "https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/british_spellings.json"
british_to_american_dict = requests.get(CONVERT_DICT_URL).json()
REMOVED_ENTITY_TYPE = set(['PERSON', 'GPE', 'ORG'])
SENT_TO_REMOVE = re.compile('For any query with respect to this article or any other content requirement, please contact Editor at')
special_pattern = re.compile(r'(http\S+)|(\S+.com)|(\S*@\S*\s?)|\(\S+ ?: ?\S+\)', flags=re.MULTILINE)
spacy_nlp = spacy.load('en_core_web_sm',  
                       exclude=['senter', 'entity_ruler','textcat', 'sentencizer', 'merge_noun_chunks', 'tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])

In [7]:
british_pattern = '|'.join([r'\b'+word+r'\b' for word in british_to_american_dict.keys()])

def americanize(string):
    '''
    Convert all British English spelling to American spelling in the input string
    source: https://stackoverflow.com/questions/42329766/python-nlp-british-english-vs-american-english
    -- Modified the code from source to meet our special requirement here. 
    Input:
      sentence: a text string
    Return: 
      string: a string with all words in English spelling
    '''
    
    for british_word in set([i.lower() for i in re.findall(british_pattern, string, flags=re.IGNORECASE)]):
        string = re.sub(r'\b'+british_word+r'\b', british_to_american_dict[british_word], string, flags=re.IGNORECASE)
  
    return string


def preprocess(sentence):
    '''
    Preprocessing the raw text and prepare for model input
    Input:
      sentences: a series of text strings
    Return:
      sentences: A series of preprocessed sentence for BERT encoder
    '''
    # Step 1: URL and Email Removal
    # Remove the repeating phrase
    sentence = SENT_TO_REMOVE.sub(' ', sentence)
    
    # Remove website url and email address and Stock Symbols
    sentence = special_pattern.sub('', sentence)
    
    # Step 2: Convert British to American English
    sentence = americanize(sentence)
      
    # Step 3: Identify and Remove Named Entity
    # Use spacy to get a sequence of tokens for named entity removal
    sentence = ' '.join(['%s'%ent for ent in spacy_nlp(sentence) if ent.ent_type_ not in REMOVED_ENTITY_TYPE])
    
    # Step 4: Cleanup
    # Remove new line character
    sentences = re.sub('\n', ' ', sentence)
    # Remove single letter word since it doesn't make sense; might be some technical terms that are trimmed off
    sentence = re.sub('\ [a-zA-z]\ ', ' ', sentence, flags=re.IGNORECASE)
    
    # Remove redundant spaces and unnecessary spaces between punctuations and letters
    sentence = re.sub(' +', ' ', sentence)
    sentence  = re.sub(r'(\s([,.?!%"\']))|(?<=\[|\()(.*?)(?=\)|\])', lambda x: x.group().strip(), sentence)
    
    return sentence

## Start Parallel Computing

In [8]:
! pip install pandarallel
from pandarallel import pandarallel

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [9]:
pandarallel.initialize()

time_start = time.time()
lp_df['content_cleaned'] = lp_df.parallel_apply(lambda x: preprocess(x['title'] + '\n' + x['content']), axis=1)
time_end = time.time()

print(f'Preprocessing Time: {time_end - time_start} seconds')

INFO: Pandarallel will run on 72 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Preprocessing Time: 1124.8846995830536 seconds


In [13]:
# Quick check on the result
lp_df

Unnamed: 0,title,content,wordCount,country,content_cleaned
0,"Atlas Cycles (Haryana) keeps sliding, down 10....",INDIAN DAILY STOCK REPORT\n\nAtlas Cycles (Har...,1555,India,"(Haryana) keeps sliding, down 10.2% in 4 days ..."
1,Varun Dhawan shares quirky boomerang to wish S...,"Mumbai (Maharashtra) [India], August 12 (ANI):...",177,India,Varun Dhawan shares quirky to wish on birthday...
2,Traffic cops launch Road Courtesy campaign,Vadodara: After conducting an intense drive fo...,308,India,Traffic cops launch Road Courtesy campaign \n ...
3,United States Courts Opinions: UNITED STATES D...,Washington: UNITED STATES DISTRICT COURT NORTH...,1993,India,": : JOEL,, v., et al., Defendants \n : has iss..."
4,OSCE Secretary General sympathizes with Iran q...,Tehran: The official news agency of Iran (IRNA...,259,Iran,Secretary General sympathizes with quake victi...
...,...,...,...,...,...
999995,"Intel Awarded Patent for Systems, Methods and ...",FULL TEXT\n\n\n\nPublication Name: Software Pa...,241,India,for \n FULL TEXT \n\n\n\n Publication Name : S...
999996,Morning Alert: Williamson Magor & Co keeps ris...,INDIAN INTRA-DAY STOCK REPORT\n\nWilliamson Ma...,680,India,Morning Alert : keeps rising : up 22.0% in 6 d...
999997,"LJUBLJANSKE MLEKARNE, mlekarska industrija, d....","Slovenia, Nov. 14 -- Slovenia based LJUBLJANSK...",68,India,"LJUBLJANSKE MLEKARNE, industrija, d.o.o. bags ..."
999998,STEEL AUTHORITY OF INDIA LIMITED (SAIL) Provid...,"India ,April 10 -- Tender No. CCNW/PR11300076...",51,India,STEEL AUTHORITY OF INDIA LIMITED (SAIL) Provid...


In [11]:
pandarallel.initialize()

time_start = time.time()
hp_df['content_cleaned'] = hp_df.parallel_apply(lambda x: preprocess(x['title'] + '\n' + x['content']), axis=1)
time_end = time.time()

print(f'Preprocessing Time: {time_end - time_start} seconds')

INFO: Pandarallel will run on 72 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Preprocessing Time: 2308.610693216324 seconds


In [14]:
# Quick check on the result
hp_df

Unnamed: 0,title,content,wordCount,country,content_cleaned
0,MONTHLY SECTOR REPORT: S&P/ASX EMERGING COMPAN...,AUSTRALIAN MONTHLY INDEX REPORT\n\nThe S&P/ASX...,2405,Australia,"MONTHLY SECTOR REPORT : (), WITH 50 - DAY MA B..."
1,PAPRIKA CLUB RESTAURANT,THE Paprika Club Restaurant offers freshly pre...,120,Australia,"\n offers freshly prepared, delicious and arom..."
2,Bone implants to be made from 3D printed salt,ABSTRACT\n\n\n\nResearchers have developed a n...,545,Australia,Bone implants to be made from 3D printed salt ...
3,"Yes, you're entitled to a free portrait of the...",It's a truth universally acknowledged that a h...,598,Australia,"Yes, you're entitled to free portrait of the Q..."
4,Why is Hemsworth wearing nail polish?,If you thought nail polish was only for women ...,729,Australia,Why is wearing nail polish? \n If you thought ...
...,...,...,...,...,...
999995,Murder sentence cannot heal family's heartache,The man who murdered 90-year-old Springbank gr...,621,Australia,Murder sentence can not heal family's heartach...
999996,Residents' satisfaction with Shellharbour coun...,Shellharbour citizens' satisfaction with their...,313,Australia,Residents' satisfaction with Shellharbour coun...
999997,Weekly: Macmahon Holdings climbs 4.9% on weak ...,AUSTRALIAN WEEKLY STOCK REPORT\n\nMacmahon Hol...,3108,Australia,Weekly : Macmahon Holdings climbs 4.9% on weak...
999998,Stock Weekly: Valoe (VALOE:6.22c) in bottom 1%...,FINNISH WEEKLY STOCK REPORT\n\nValoe Oyj (HEL:...,17401,Finland,: Valoe in bottom 1% performers of Finnish mar...


## Perform Train-Test split

In [15]:
from sklearn.model_selection import train_test_split
s3_resource = boto3.resource('s3')

In [16]:
time_start = time.time()
train_peace, test_peace = train_test_split(hp_df, test_size=0.2)
train_nonpeace, test_nonpeace = train_test_split(lp_df, test_size=0.2)

train = pd.concat([train_peace, train_nonpeace], ignore_index=True)
test = pd.concat([test_peace, test_nonpeace], ignore_index=True)
time_end = time.time()

print(f'Spliting Time: {time_end - time_start} seconds')

Spliting Time: 4.499696731567383 seconds


In [17]:
train

Unnamed: 0,title,content,wordCount,country,content_cleaned
0,Australia: Ethicon Endo-Surgery Receives Paten...,"Australia, April 18 -- Ethicon Endo-Surgery In...",118,Australia,: Surgical instrument with modular end effecto...
1,Even basic data offers boost,BETTER use can be made of basic data already b...,295,Australia,Even basic data offers boost \n BETTER use can...
2,"May 05, 2020: Carrier Corporation issued paten...",Carrier Corporation has been issued a new U.S....,3444,Australia,"May 05, 2020 : issued patent titled"" Image sen..."
3,Green light for traffic safety projects,LIVE LOCAL. LOVE LOCAL\n\nWITH AUCKLAND COUNCI...,267,New Zealand,Green light for traffic safety projects \n LIV...
4,John Howard and Geoffrey Blainey to open Ramsa...,"Sydney, Wednesday 4 April 2018: Emeritus Profe...",479,Australia,"and to open center building \n, Wednesday 4 Ap..."
...,...,...,...,...,...
1599995,Oh Hyun Junior High School Classroom Repair Work,Contract awarded for Oh hyun junior high schoo...,114,India,Oh Hyun Junior High School Classroom Repair Wo...
1599996,Beach lifeguards spot marooned lovers kilometr...,MUMBAI: A young couple perched on the rocks by...,220,India,Beach lifeguards spot marooned lovers kilomete...
1599997,Racial Justice and Civil Rights Organizations ...,New York: American Civil Liberties Union has i...,468,India,Racial Justice and Civil Rights Organizations ...
1599998,Anhui Huiming Machinery Mfg Files Chinese Pate...,FULL TEXT\n\n\n\nPublication Name: Metal & Min...,340,India,Anhui Huiming Machinery Mfg Files Chinese Pate...


In [18]:
# Export Train
train.to_csv('s3://compressed-data-sample/processed_train.csv', index=False)

In [19]:
# Check if export correctly
start_time = time.time()
train_check = pd.read_csv('s3://compressed-data-sample/processed_train.csv', 
                          dtype={"wordCount": 'int16', "country":'category'})
end_time = time.time()
print(f'Loading Time: {end_time - start_time} seconds')
train_check

Loading Time: 643.808084487915 seconds


Unnamed: 0,title,content,wordCount,country,content_cleaned
0,Australia: Ethicon Endo-Surgery Receives Paten...,"Australia, April 18 -- Ethicon Endo-Surgery In...",118,Australia,: Surgical instrument with modular end effecto...
1,Even basic data offers boost,BETTER use can be made of basic data already b...,295,Australia,Even basic data offers boost \n BETTER use can...
2,"May 05, 2020: Carrier Corporation issued paten...",Carrier Corporation has been issued a new U.S....,3444,Australia,"May 05, 2020 : issued patent titled"" Image sen..."
3,Green light for traffic safety projects,LIVE LOCAL. LOVE LOCAL\n\nWITH AUCKLAND COUNCI...,267,New Zealand,Green light for traffic safety projects \n LIV...
4,John Howard and Geoffrey Blainey to open Ramsa...,"Sydney, Wednesday 4 April 2018: Emeritus Profe...",479,Australia,"and to open center building \n, Wednesday 4 Ap..."
...,...,...,...,...,...
1599995,Oh Hyun Junior High School Classroom Repair Work,Contract awarded for Oh hyun junior high schoo...,114,India,Oh Hyun Junior High School Classroom Repair Wo...
1599996,Beach lifeguards spot marooned lovers kilometr...,MUMBAI: A young couple perched on the rocks by...,220,India,Beach lifeguards spot marooned lovers kilomete...
1599997,Racial Justice and Civil Rights Organizations ...,New York: American Civil Liberties Union has i...,468,India,Racial Justice and Civil Rights Organizations ...
1599998,Anhui Huiming Machinery Mfg Files Chinese Pate...,FULL TEXT\n\n\n\nPublication Name: Metal & Min...,340,India,Anhui Huiming Machinery Mfg Files Chinese Pate...


In [20]:
# Free memory
del train_check

In [5]:
# Save another shuffled copy as .json
import io
train = train.sample(frac=1).reset_index(drop=True)
json_buffer = io.BytesIO(train.to_json(orient='records', lines=True).encode())

s3 = boto3.client('s3')
s3.upload_fileobj(json_buffer, "compressed-data-sample", "processed_train.json")

In [21]:
test

Unnamed: 0,title,content,wordCount,country,content_cleaned
0,Woman charged with reckless wounding',AN Orient Point woman charged with stabbing he...,88,Australia,Woman charged with reckless wounding' \n AN Or...
1,Landscapes Below: Mapping and the New Science ...,"Cambridge: University of Cambridge, Australia ...",893,Belgium,Mapping and the New Science of Geology \n Camb...
2,TravelSky Technology - H Shares falls for a se...,HONG KONG DAILY STOCK REPORT\n\nTravelSky Tech...,1226,Australia,"falls for second consecutive day, two - day fa..."
3,American Eagle Outfitters chief operating offi...,American Eagle Outfitters (NYSE:AEO) Chief Ope...,1728,Australia,chief operating officer and executive vice pre...
4,Manusu's extraordinary comeback seals bream se...,A last minute decision to enter the final qual...,304,Australia,'s eback seals bream series grand final spot \...
...,...,...,...,...,...
399995,Swimming Teacher,Contract awarded for Swimming teacher \nDate o...,75,India,Swimming Teacher \n Contract awarded for Swimm...
399996,Jobs Growth in Last Fiscal was the Lowest in P...,"CLSA REPORT\n\nThrough three quarters of FY18,...",230,India,Jobs Growth in Last Fiscal was the Lowest in P...
399997,MAIRIN OHS & E CONSULTING PTY LTD Secures cont...,"Australia, Dec. 23 -- Contract Id: 1812163\n\n...",142,India,"contract for \n, Dec. 23 -- Contract d : 18121..."
399998,Stephen Morgan ask the Secretary of State for ...,London: UK Parliament has issued the following...,95,India,"ask the Secretary of for defense, what discuss..."


In [22]:
# Export Test
test.to_csv('s3://compressed-data-sample/processed_test.csv', index=False)

In [23]:
# Check if export correctly
start_time = time.time()
test_check = pd.read_csv('s3://compressed-data-sample/processed_test.csv', 
                          dtype={"wordCount": 'int16', "country":'category'})
end_time = time.time()
print(f'Loading Time: {end_time - start_time} seconds')
test_check

Loading Time: 146.13446378707886 seconds


Unnamed: 0,title,content,wordCount,country,content_cleaned
0,Woman charged with reckless wounding',AN Orient Point woman charged with stabbing he...,88,Australia,Woman charged with reckless wounding' \n AN Or...
1,Landscapes Below: Mapping and the New Science ...,"Cambridge: University of Cambridge, Australia ...",893,Belgium,Mapping and the New Science of Geology \n Camb...
2,TravelSky Technology - H Shares falls for a se...,HONG KONG DAILY STOCK REPORT\n\nTravelSky Tech...,1226,Australia,"falls for second consecutive day, two - day fa..."
3,American Eagle Outfitters chief operating offi...,American Eagle Outfitters (NYSE:AEO) Chief Ope...,1728,Australia,chief operating officer and executive vice pre...
4,Manusu's extraordinary comeback seals bream se...,A last minute decision to enter the final qual...,304,Australia,'s eback seals bream series grand final spot \...
...,...,...,...,...,...
399995,Swimming Teacher,Contract awarded for Swimming teacher \nDate o...,75,India,Swimming Teacher \n Contract awarded for Swimm...
399996,Jobs Growth in Last Fiscal was the Lowest in P...,"CLSA REPORT\n\nThrough three quarters of FY18,...",230,India,Jobs Growth in Last Fiscal was the Lowest in P...
399997,MAIRIN OHS & E CONSULTING PTY LTD Secures cont...,"Australia, Dec. 23 -- Contract Id: 1812163\n\n...",142,India,"contract for \n, Dec. 23 -- Contract d : 18121..."
399998,Stephen Morgan ask the Secretary of State for ...,London: UK Parliament has issued the following...,95,India,"ask the Secretary of for defense, what discuss..."


In [24]:
# Freeup memory
del test_check

In [2]:
# Save another shuffled copy as .json
test = test.sample(frac=1).reset_index(drop=True)
test.to_json('./processed_test.json', orient='records', lines=True)
s3 = boto3.resource('s3')
s3.Bucket('compressed-data-sample').upload_file('./processed_test.json','processed_test.json')