Multiprocessing and adding urls to missing files
ygorg committed Mar 29, 2022
1 parent 3e79004 commit 69f776e
Showing 4 changed files with 563 additions and 42 deletions.
12 changes: 12 additions & 0 deletions needed_files.url.filelist
@@ -0,0 +1,12 @@
jp0004447 https://www.japantimes.co.jp/news/2019/05/21/world/polish-sextuplets-well-mom-visits-babies
jp0004452 https://www.japantimes.co.jp/news/2019/05/21/asia-pacific/politics-diplomacy-asia-pacific/official-count-shows-widodo-won-indonesian-election-challenger-subianto-disputes-results
jp0005174 https://www.japantimes.co.jp/news/2019/05/22/world/twisters-flip-campers-damage-homes-southern-plains
jp0007563 https://www.japantimes.co.jp/news/2019/06/03/world/social-issues-world/sudan-security-forces-kill-one-outside-khartoums-sit
jp0008774 https://www.japantimes.co.jp/news/2019/06/22/world/georgians-keep-protesting-russians-speech-parliament-despite-speakers-resignation
jp0008778 https://www.japantimes.co.jp/news/2019/06/22/world/science-health-world/u-s-running-beaches-rotting-whales
jp0008784 https://www.japantimes.co.jp/news/2019/06/22/world/crime-legal-world/u-s-supreme-court-rules-prosecutors-need-prove-violation-gun-laws-deliberate
jp0008786 https://www.japantimes.co.jp/news/2019/06/22/world/crime-legal-world/u-s-supreme-court-tosses-murder-conviction-black-man-curtis-flowers-racial-bias
jp0008787 https://www.japantimes.co.jp/news/2019/06/22/world/u-s-blacklists-five-chinese-groups-involved-supercomputing-work
jp0008797 https://www.japantimes.co.jp/news/2019/06/22/world/offbeat-world/adopted-street-dog-named-worlds-ugliest-mutt-california
jp0008798 https://www.japantimes.co.jp/news/2019/06/22/world/offbeat-world/bear-enters-montana-home-takes-nap-closet
jp0008856 https://www.japantimes.co.jp/news/2019/06/25/world/politics-diplomacy-world/istanbul-loss-turkeys-erdogan-vows-listen-peoples-messages
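
Each line of needed_files.url.filelist pairs a document id with the URL it should be fetched from; the columns are tab-separated (rendered above as spaces), since to_jsonl.py splits on '\t'. A minimal sketch of consuming this format, mirroring what the script later does with it:

    import os

    # Sketch: read (id, url) pairs from a filelist; assumes tab-separated columns
    with open('needed_files.url.filelist') as f:
        pairs = [line.strip().split('\t') for line in f]

    # wget --force-directories saves each page under its host/path,
    # so stripping the scheme from the URL yields the local file path
    for id_, url in pairs:
        path = url.replace('https://', '')
        print(id_, path, os.path.isfile(path))
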
2 changes: 1 addition & 1 deletion scripts/download.sh
@@ -1,3 +1,3 @@
# (146/313)*260000/60/60
# (1367/2987)*260000/60/60
cat ../*.filelist | cut -f 2 | wget --wait=0.5 --random-wait --no-clobber -nv --force-directories --input-file -
cat ../*.filelist | cut -f 2 | wget --wait=0.3 --random-wait --no-clobber -nv --force-directories --input-file -
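
The script feeds the URL column of every filelist into a single wget call: --wait=0.3 --random-wait throttles requests politely (lowered here from 0.5 s), --no-clobber skips pages already on disk, and --force-directories mirrors each URL's host/path layout locally, which is the layout to_jsonl.py expects. The comment above it reads as a rough ETA, presumably seconds elapsed so far divided by files fetched, scaled to the full ~260,000-file crawl:

    # Presumed reading of the ETA comment: 1367 s for 2987 files so far,
    # extrapolated to 260,000 files and converted to hours
    eta_hours = (1367 / 2987) * 260000 / 60 / 60
    print(round(eta_hours, 1))  # ~33.1
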
104 changes: 63 additions & 41 deletions scripts/to_jsonl.py
@@ -8,6 +8,7 @@
import fnmatch
import logging
import itertools
from multiprocessing import Pool

import bs4
from tqdm import tqdm
@@ -26,7 +27,7 @@ def fix_unclosed(tag_name, html):
    return re.sub(r'(<{}.*[^/-])>'.format(tag_name), r'\1 />', html)
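
A quick demonstration of the helper above: an unclosed void tag gains a self-closing slash, while an already self-closed one is untouched, because the [^/-] class refuses a match that ends in '/' (or on a comment's '-->'):

    import re

    def fix_unclosed(tag_name, html):
        return re.sub(r'(<{}.*[^/-])>'.format(tag_name), r'\1 />', html)

    print(fix_unclosed('meta', '<meta charset="utf-8">'))
    # -> <meta charset="utf-8" />
    print(fix_unclosed('meta', '<meta charset="utf-8" />'))
    # -> unchanged: <meta charset="utf-8" />
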


def convert_jptimes(content):
def convert_jptimes(input_file, content):
    content = fix_unclosed('meta', content)
    content = fix_unclosed('link', content)
    doc = bs4.BeautifulSoup(content, 'html.parser')
@@ -35,16 +36,22 @@ def convert_jptimes(content):
    date = '/'.join(file_name_components[2:5])
    categories = file_name_components[5:-1]
    file_name = file_name_components[-1]
    url = 'http://' + input_file
    url = 'https://' + input_file

    author = doc.find('meta', attrs={'name': 'author'})['content']
    try:
        author = doc.find('meta', attrs={'name': 'author'})['content']
    except TypeError:
        try:
            author = doc.find('a', attrs={'class': 'author'}).text
        except AttributeError:
            logging.warning('jp:No author in {}'.format(input_file))
            author = None  # placeholder so the record can still be built

    # Extracting title
    title = doc.find('meta', property='og:title')
    if not title:
        logging.error('no title for {}'.format(input_file))
        print(doc.find_all('meta'))
        input()
        #print(doc.find_all('meta'))
        #input()
        return
    title = re.sub(r'\s+', ' ', title['content']).strip()
    title = re.sub(r'\| The Japan Times', '', title)
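
The try/except chains above work because BeautifulSoup's find returns None when no tag matches, so subscripting the result raises TypeError and .text raises AttributeError. A minimal sketch of the pattern, on a hypothetical page that has an og:title but no author tag:

    import bs4

    # Hypothetical document: og:title present, author meta tag absent
    doc = bs4.BeautifulSoup(
        '<meta property="og:title" content="Example | The Japan Times" />',
        'html.parser')

    title = doc.find('meta', property='og:title')['content']
    try:
        author = doc.find('meta', attrs={'name': 'author'})['content']
    except TypeError:  # find() returned None: the tag does not exist
        author = None

    print(title, author)  # Example | The Japan Times None
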
@@ -100,7 +107,7 @@ def convert_jptimes(content):
    }


def convert_nytimes(content):
def convert_nytimes(input_file, content):
    doc = bs4.BeautifulSoup(content, 'html.parser')

    file_name_components = input_file.split('/')
@@ -111,13 +118,16 @@ def convert_nytimes(content):

    # Removing script and style tags
    for script in doc(['script', 'style', 'link', 'button']):
        script.decompose() # rip it out
        script.decompose()

    try:
        # Before 2013
        author = doc.find('meta', attrs={'name': 'author'})['content']
    except TypeError:
        # After 2013
        if not doc.find('meta', attrs={'name': 'byl'}):
            logging.warning('ny:No author in {}'.format(input_file))
            return None
        author = doc.find('meta', attrs={'name': 'byl'})['content']
        author = author.replace('By ', '')

@@ -152,7 +162,8 @@
            logging.error('no body for {}'.format(input_file))
            return
        else:
            body = ' '.join([re.sub(r'\s+', ' ', p.get_text(separator=' ')).strip() for p in body])
            body = ' '.join([re.sub(r'\s+', ' ', p.get_text(separator=' ')).strip()
                             for p in body])
    else:
        body = re.sub(r'\s+', ' ', body.get_text(separator=' ')).strip()
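
The reflowed join above is purely cosmetic in the diff, but the expression is worth unpacking: get_text(separator=' ') flattens nested inline tags, and the re.sub collapses the resulting whitespace runs. A small demonstration on hypothetical markup:

    import re
    import bs4

    # Hypothetical article body with nested inline tags and stray whitespace
    html = '<p>First\n  <em>paragraph</em>.</p><p>Second   one.</p>'
    paragraphs = bs4.BeautifulSoup(html, 'html.parser').find_all('p')

    body = ' '.join(re.sub(r'\s+', ' ', p.get_text(separator=' ')).strip()
                    for p in paragraphs)
    print(body)  # First paragraph . Second one.
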

@@ -186,15 +197,36 @@
    }


def process_file(input_file, id_=None):
    # Loading soup
    with open(input_file) as g:
        content = g.read()

    if 'nytimes' in input_file:
        res = convert_nytimes(input_file, content)
    elif 'japantimes' in input_file:
        res = convert_jptimes(input_file, content)
    else:
        logging.error(
            'Unrecognised file type: {}'.format(input_file))
        return None
    if res and id_:
        res['id'] = id_
    return res


if __name__ == '__main__':
    import argparse

    def arguments():
        parser = argparse.ArgumentParser(description='Converts html files to jsonl using a filelist')
        parser = argparse.ArgumentParser(
            description='Converts html files to jsonl using a filelist')
        parser.add_argument(
            '-f', '--filelist', type=argparse.FileType('r'),
            help='Filelist file. If not given, convert every found '
                 'file into `dataset.jsonl` without id')
        parser.add_argument(
            '-p', '--parallel', type=int,
            help='Number of threads to use for parallel processing')
        args = parser.parse_args()
        return args
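
With the new flag, a filelist run on, say, four worker processes would look something like this; the output lands next to the filelist as needed_files.jsonl, per the url.filelist → jsonl rename below:

    python scripts/to_jsonl.py --filelist needed_files.url.filelist --parallel 4
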

@@ -205,48 +237,38 @@ def arguments():
    logging.info('start converting...')

    articles_processed = 0
    output_file = '..' + os.sep + 'dataset.jsonl'
    output_file = 'dataset.jsonl'
    jptimes_dir = 'www.japantimes.co.jp/'
    nytimes_dir = 'www.nytimes.com/'

    webarchive_dir = 'web.archive.org/'

    if args.filelist:
        files = [l.strip().split('\t') for l in args.filelist]
        files = (l.strip().split('\t') for l in args.filelist)
        files = ((i, p.replace('https://', '')) for i, p in files)
        files = [(i, p) for i, p in files if os.path.isfile(p)]
        args.filelist.close()
        output_file = '..' + os.sep + args.filelist.name.replace('url.filelist', 'jsonl')
        output_file = args.filelist.name.replace('url.filelist', 'jsonl')
    else:
        files = itertools.chain(
            recursive_iglob(rootdir=jptimes_dir, pattern='[!.]*'),
            recursive_iglob(rootdir=nytimes_dir, pattern='*.html')
            #recursive_iglob(rootdir=jptimes_dir, pattern='[!.]*'),
            #recursive_iglob(rootdir=nytimes_dir, pattern='*.html')
            recursive_iglob(rootdir=webarchive_dir, pattern='*.html')
        )
        files = [(None, p) for p in files]

    def process_file_(args):
        return process_file(args[1], id_=args[0])

    logging.info('Writing to {}'.format(output_file))

    with codecs.open(output_file, 'w', 'utf-8') as f:
        with Pool(args.parallel) as pool:
            process = pool.imap_unordered(process_file_, files)

        for input_file in tqdm(files):
            if args.filelist:
                id_, input_file = input_file
                input_file = input_file.replace('http://', '')
                if not os.path.isfile(input_file):
                    continue

            # Loading soup
            with open(input_file) as g:
                content = g.read()

            if 'nytimes' in input_file:
                res = convert_nytimes(content)
            elif 'japantimes' in input_file:
                res = convert_jptimes(content)
            else:
                logging.error('Unrecognised file type : {}'.format(
                    input_file))
            if not res:
                continue

            if args.filelist:
                res['id'] = id_

            f.write(json.dumps(res) + '\n')
            articles_processed += 1
            for res in tqdm(process):
                if res:
                    f.write(json.dumps(res) + '\n')
                    articles_processed += 1

    logging.info('Converted {} articles'.format(articles_processed))
    if args.filelist and articles_processed != len(files):
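
The core of the change is the switch from a sequential loop to Pool.imap_unordered, which streams results back as workers finish them; order does not matter here because each record carries its own id. A stripped-down sketch of the same pattern, with a stand-in for process_file_:

    import json
    from multiprocessing import Pool

    from tqdm import tqdm

    def work(args):
        # Stand-in for process_file_: unpack an (id_, path) tuple and
        # return a JSON-serialisable dict, or None on failure
        id_, path = args
        return {'id': id_, 'path': path}

    if __name__ == '__main__':
        files = [('jp0004447', 'www.japantimes.co.jp/...'),
                 (None, 'web.archive.org/...')]
        with open('dataset.jsonl', 'w', encoding='utf-8') as f, Pool(2) as pool:
            # imap_unordered yields each result as soon as any worker finishes;
            # wrapping the iterator in tqdm gives the live progress bar
            for res in tqdm(pool.imap_unordered(work, files), total=len(files)):
                if res:
                    f.write(json.dumps(res) + '\n')

One caveat: process_file_ is defined inside the if __name__ == '__main__' block, which pickles fine under the default fork start method on Linux but would fail under spawn (macOS/Windows), where worker processes re-import the module without executing that block.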
