Multiprocessing and adding urls to missing files
ygorg committed Mar 29, 2022
1 parent 3e79004 commit 69f776e
Showing 4 changed files with 563 additions and 42 deletions.
12 changes: 12 additions & 0 deletions needed_files.url.filelist
@@ -0,0 +1,12 @@
jp0004447 https://www.japantimes.co.jp/news/2019/05/21/world/polish-sextuplets-well-mom-visits-babies
jp0004452 https://www.japantimes.co.jp/news/2019/05/21/asia-pacific/politics-diplomacy-asia-pacific/official-count-shows-widodo-won-indonesian-election-challenger-subianto-disputes-results
jp0005174 https://www.japantimes.co.jp/news/2019/05/22/world/twisters-flip-campers-damage-homes-southern-plains
jp0007563 https://www.japantimes.co.jp/news/2019/06/03/world/social-issues-world/sudan-security-forces-kill-one-outside-khartoums-sit
jp0008774 https://www.japantimes.co.jp/news/2019/06/22/world/georgians-keep-protesting-russians-speech-parliament-despite-speakers-resignation
jp0008778 https://www.japantimes.co.jp/news/2019/06/22/world/science-health-world/u-s-running-beaches-rotting-whales
jp0008784 https://www.japantimes.co.jp/news/2019/06/22/world/crime-legal-world/u-s-supreme-court-rules-prosecutors-need-prove-violation-gun-laws-deliberate
jp0008786 https://www.japantimes.co.jp/news/2019/06/22/world/crime-legal-world/u-s-supreme-court-tosses-murder-conviction-black-man-curtis-flowers-racial-bias
jp0008787 https://www.japantimes.co.jp/news/2019/06/22/world/u-s-blacklists-five-chinese-groups-involved-supercomputing-work
jp0008797 https://www.japantimes.co.jp/news/2019/06/22/world/offbeat-world/adopted-street-dog-named-worlds-ugliest-mutt-california
jp0008798 https://www.japantimes.co.jp/news/2019/06/22/world/offbeat-world/bear-enters-montana-home-takes-nap-closet
jp0008856 https://www.japantimes.co.jp/news/2019/06/25/world/politics-diplomacy-world/istanbul-loss-turkeys-erdogan-vows-listen-peoples-messages
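
Each line of needed_files.url.filelist pairs a document id with the URL it should be fetched from; the columns are tab-separated (rendered above as spaces), since to_jsonl.py splits on '\t'. A minimal sketch of consuming this format, mirroring what the script later does with it:

    import os

    # Sketch: read (id, url) pairs from a filelist; assumes tab-separated columns
    with open('needed_files.url.filelist') as f:
        pairs = [line.strip().split('\t') for line in f]

    # wget --force-directories saves each page under its host/path,
    # so stripping the scheme from the URL yields the local file path
    for id_, url in pairs:
        path = url.replace('https://', '')
        print(id_, path, os.path.isfile(path))
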
2 changes: 1 addition & 1 deletion scripts/download.sh
@@ -1,3 +1,3 @@
# (146/313)*260000/60/60
# (1367/2987)*260000/60/60
cat ../*.filelist | cut -f 2 | wget --wait=0.5 --random-wait --no-clobber -nv --force-directories --input-file -
cat ../*.filelist | cut -f 2 | wget --wait=0.3 --random-wait --no-clobber -nv --force-directories --input-file -
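
The script feeds the URL column of every filelist into a single wget call: --wait=0.3 --random-wait throttles requests politely (lowered here from 0.5 s), --no-clobber skips pages already on disk, and --force-directories mirrors each URL's host/path layout locally, which is the layout to_jsonl.py expects. The comment above it reads as a rough ETA, presumably seconds elapsed so far divided by files fetched, scaled to the full ~260,000-file crawl:

    # Presumed reading of the ETA comment: 1367 s for 2987 files so far,
    # extrapolated to 260,000 files and converted to hours
    eta_hours = (1367 / 2987) * 260000 / 60 / 60
    print(round(eta_hours, 1))  # ~33.1
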
104 changes: 63 additions & 41 deletions scripts/to_jsonl.py
@@ -8,6 +8,7 @@
import fnmatch
import logging
import itertools
from multiprocessing import Pool

import bs4
from tqdm import tqdm
@@ -26,7 +27,7 @@ def fix_unclosed(tag_name, html):
    return re.sub(r'(<{}.*[^/-])>'.format(tag_name), r'\1 />', html)
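
A quick demonstration of the helper above: an unclosed void tag gains a self-closing slash, while an already self-closed one is untouched, because the [^/-] class refuses a match that ends in '/' (or on a comment's '-->'):

    import re

    def fix_unclosed(tag_name, html):
        return re.sub(r'(<{}.*[^/-])>'.format(tag_name), r'\1 />', html)

    print(fix_unclosed('meta', '<meta charset="utf-8">'))
    # -> <meta charset="utf-8" />
    print(fix_unclosed('meta', '<meta charset="utf-8" />'))
    # -> unchanged: <meta charset="utf-8" />
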


def convert_jptimes(content):
def convert_jptimes(input_file, content):
    content = fix_unclosed('meta', content)
    content = fix_unclosed('link', content)
    doc = bs4.BeautifulSoup(content, 'html.parser')
@@ -35,16 +36,22 @@ def convert_jptimes(content):
    date = '/'.join(file_name_components[2:5])
    categories = file_name_components[5:-1]
    file_name = file_name_components[-1]
    url = 'http://' + input_file
    url = 'https://' + input_file

    author = doc.find('meta', attrs={'name': 'author'})['content']
    try:
        author = doc.find('meta', attrs={'name': 'author'})['content']
    except TypeError:
        try:
            author = doc.find('a', attrs={'class': 'author'}).text
        except AttributeError:
            logging.warning('jp:No author in {}'.format(input_file))
            author = None  # placeholder so the record can still be built

    # Extracting title
    title = doc.find('meta', property='og:title')
    if not title:
        logging.error('no title for {}'.format(input_file))
        print(doc.find_all('meta'))
        input()
        #print(doc.find_all('meta'))
        #input()
        return
    title = re.sub(r'\s+', ' ', title['content']).strip()
    title = re.sub(r'\| The Japan Times', '', title)
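
The try/except chains above work because BeautifulSoup's find returns None when no tag matches, so subscripting the result raises TypeError and .text raises AttributeError. A minimal sketch of the pattern, on a hypothetical page that has an og:title but no author tag:

    import bs4

    # Hypothetical document: og:title present, author meta tag absent
    doc = bs4.BeautifulSoup(
        '<meta property="og:title" content="Example | The Japan Times" />',
        'html.parser')

    title = doc.find('meta', property='og:title')['content']
    try:
        author = doc.find('meta', attrs={'name': 'author'})['content']
    except TypeError:  # find() returned None: the tag does not exist
        author = None

    print(title, author)  # Example | The Japan Times None
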
@@ -100,7 +107,7 @@ def convert_jptimes(content):
    }


def convert_nytimes(content):
def convert_nytimes(input_file, content):
    doc = bs4.BeautifulSoup(content, 'html.parser')

    file_name_components = input_file.split('/')
@@ -111,13 +118,16 @@ def convert_nytimes(content):

    # Removing script and style tags
    for script in doc(['script', 'style', 'link', 'button']):
        script.decompose() # rip it out
        script.decompose()

    try:
        # Before 2013
        author = doc.find('meta', attrs={'name': 'author'})['content']
    except TypeError:
        # After 2013
        if not doc.find('meta', attrs={'name': 'byl'}):
            logging.warning('ny:No author in {}'.format(input_file))
            return None
        author = doc.find('meta', attrs={'name': 'byl'})['content']
        author = author.replace('By ', '')

@@ -152,7 +162,8 @@
            logging.error('no body for {}'.format(input_file))
            return
        else:
            body = ' '.join([re.sub(r'\s+', ' ', p.get_text(separator=' ')).strip() for p in body])
            body = ' '.join([re.sub(r'\s+', ' ', p.get_text(separator=' ')).strip()
                             for p in body])
    else:
        body = re.sub(r'\s+', ' ', body.get_text(separator=' ')).strip()
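
The reflowed join above is purely cosmetic in the diff, but the expression is worth unpacking: get_text(separator=' ') flattens nested inline tags, and the re.sub collapses the resulting whitespace runs. A small demonstration on hypothetical markup:

    import re
    import bs4

    # Hypothetical article body with nested inline tags and stray whitespace
    html = '<p>First\n  <em>paragraph</em>.</p><p>Second   one.</p>'
    paragraphs = bs4.BeautifulSoup(html, 'html.parser').find_all('p')

    body = ' '.join(re.sub(r'\s+', ' ', p.get_text(separator=' ')).strip()
                    for p in paragraphs)
    print(body)  # First paragraph . Second one.
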

@@ -186,15 +197,36 @@
    }


def process_file(input_file, id_=None):
    # Loading soup
    with open(input_file) as g:
        content = g.read()

    if 'nytimes' in input_file:
        res = convert_nytimes(input_file, content)
    elif 'japantimes' in input_file:
        res = convert_jptimes(input_file, content)
    else:
        logging.error(
            'Unrecognised file type: {}'.format(input_file))
        return None
    if res and id_:
        res['id'] = id_
    return res


if __name__ == '__main__':
    import argparse

    def arguments():
        parser = argparse.ArgumentParser(description='Converts html files to jsonl using a filelist')
        parser = argparse.ArgumentParser(
            description='Converts html files to jsonl using a filelist')
        parser.add_argument(
            '-f', '--filelist', type=argparse.FileType('r'),
            help='Filelist file. If not given, convert every found '
                 'file into `dataset.jsonl` without id')
        parser.add_argument(
            '-p', '--parallel', type=int,
            help='Number of threads to use for parallel processing')
        args = parser.parse_args()
        return args
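
With the new flag, a filelist run on, say, four worker processes would look something like this; the output lands next to the filelist as needed_files.jsonl, per the url.filelist → jsonl rename below:

    python scripts/to_jsonl.py --filelist needed_files.url.filelist --parallel 4
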

@@ -205,48 +237,38 @@ def arguments():
    logging.info('start converting...')

    articles_processed = 0
    output_file = '..' + os.sep + 'dataset.jsonl'
    output_file = 'dataset.jsonl'
    jptimes_dir = 'www.japantimes.co.jp/'
    nytimes_dir = 'www.nytimes.com/'

    webarchive_dir = 'web.archive.org/'

    if args.filelist:
        files = [l.strip().split('\t') for l in args.filelist]
        files = (l.strip().split('\t') for l in args.filelist)
        files = ((i, p.replace('https://', '')) for i, p in files)
        files = [(i, p) for i, p in files if os.path.isfile(p)]
        args.filelist.close()
        output_file = '..' + os.sep + args.filelist.name.replace('url.filelist', 'jsonl')
        output_file = args.filelist.name.replace('url.filelist', 'jsonl')
    else:
        files = itertools.chain(
            recursive_iglob(rootdir=jptimes_dir, pattern='[!.]*'),
            recursive_iglob(rootdir=nytimes_dir, pattern='*.html')
            #recursive_iglob(rootdir=jptimes_dir, pattern='[!.]*'),
            #recursive_iglob(rootdir=nytimes_dir, pattern='*.html')
            recursive_iglob(rootdir=webarchive_dir, pattern='*.html')
        )
        files = [(None, p) for p in files]

    def process_file_(args):
        return process_file(args[1], id_=args[0])

    logging.info('Writing to {}'.format(output_file))

    with codecs.open(output_file, 'w', 'utf-8') as f:
        with Pool(args.parallel) as pool:
            process = pool.imap_unordered(process_file_, files)

        for input_file in tqdm(files):
            if args.filelist:
                id_, input_file = input_file
                input_file = input_file.replace('http://', '')
                if not os.path.isfile(input_file):
                    continue

            # Loading soup
            with open(input_file) as g:
                content = g.read()

            if 'nytimes' in input_file:
                res = convert_nytimes(content)
            elif 'japantimes' in input_file:
                res = convert_jptimes(content)
            else:
                logging.error('Unrecognised file type : {}'.format(
                    input_file))
            if not res:
                continue

            if args.filelist:
                res['id'] = id_

            f.write(json.dumps(res) + '\n')
            articles_processed += 1
            for res in tqdm(process):
                if res:
                    f.write(json.dumps(res) + '\n')
                    articles_processed += 1

    logging.info('Converted {} articles'.format(articles_processed))
    if args.filelist and articles_processed != len(files):
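
The core of the change is the switch from a sequential loop to Pool.imap_unordered, which streams results back as workers finish them; order does not matter here because each record carries its own id. A stripped-down sketch of the same pattern, with a stand-in for process_file_:

    import json
    from multiprocessing import Pool

    from tqdm import tqdm

    def work(args):
        # Stand-in for process_file_: unpack an (id_, path) tuple and
        # return a JSON-serialisable dict, or None on failure
        id_, path = args
        return {'id': id_, 'path': path}

    if __name__ == '__main__':
        files = [('jp0004447', 'www.japantimes.co.jp/...'),
                 (None, 'web.archive.org/...')]
        with open('dataset.jsonl', 'w', encoding='utf-8') as f, Pool(2) as pool:
            # imap_unordered yields each result as soon as any worker finishes;
            # wrapping the iterator in tqdm gives the live progress bar
            for res in tqdm(pool.imap_unordered(work, files), total=len(files)):
                if res:
                    f.write(json.dumps(res) + '\n')

One caveat: process_file_ is defined inside the if __name__ == '__main__' block, which pickles fine under the default fork start method on Linux but would fail under spawn (macOS/Windows), where worker processes re-import the module without executing that block.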
