In [1]:
import os, csv, tqdm, re, json, datetime
from urllib.parse import urlparse
from pathlib import Path

# Output root: CSV files are written into per-month sub-directories below it.
DEST = Path("../data/news/webhose/csv/")

# Input: every *.json file in the source directory, in sorted path order.
FILES = list(Path("../data/news/webhose/src/").glob('*.json'))
FILES.sort()

In [2]:
def get_topic(path: str) -> str:
    """
    Derive a topic label from the first segment of a URL path.

    The first segment is kept only when it looks like a plain section slug
    (e.g. ``world`` or ``real-estate``). Segments that look like ids, file
    names, query fragments or mixed-case words are rejected.

    :param path: path component of a URL, e.g. ``/world/02oct2016/gulens.html``
    :returns: cleaned topic or '' if no topics were found
    """
    topic = path.strip('/').split("/")[0]
    # Needs at least one ASCII letter and must not carry query-string characters.
    if not re.search('[a-zA-Z]', topic) or re.search(r'(\?|=)', topic):
        return ''
    whitespaces = len(re.findall(r"(\s|-|_)", topic))
    low_case = len(re.findall(r"[a-z]", topic))
    upper_case = len(re.findall(r"[A-Z]", topic))
    numbers = len(re.findall(r"\d", topic))
    # Mixed-case segment with more capitals than word separators (e.g. "News",
    # "World-News"); all-uppercase segments are allowed through here.
    # No separate "contains no letters" check is needed below: the guard above
    # already guarantees at least one ASCII letter.
    if upper_case > whitespaces and low_case != 0:
        return ''
    # Starts with a digit and has more than four digits overall.
    if re.search(r"^\d", topic) and numbers > 4:
        return ''
    # Capitals mixed with digits and no separators (e.g. "A1").
    if upper_case and numbers and not whitespaces:
        return ''
    # The first segment is itself a file name.
    if re.search(r"(\.html|\.php|\.xml|\.csv)", topic):
        return ''
    # Single-character segments are rejected.
    if len(topic) <= 1:
        return ''
    return topic.lower()
# Quick smoke check of get_topic on two sample URL paths.
for sample_path in (
    '/world/02oct2016/gulens.html',
    '/msg/lv/real-estate/flats/riga/centre/fehjp.html',
):
    print('topic=', get_topic(sample_path))

topic= world
topic= msg


In [3]:
def _parse_timestamp(value):
    """Parse a timestamp shaped like ``2016-10-02T08:42:00.000+03:00``.

    The UTC offset is discarded, so the result is a naive ``datetime`` holding
    the wall-clock time written in the string.
    """
    # Everything after the first '+' is treated as a positive offset.
    naive = value.split('+', 1)[0]
    # A trailing "Z" or negative offset ("-05:00") contains no '+'; drop it too.
    naive = re.sub(r'(?:Z|-\d{2}:?\d{2})$', '', naive)
    return datetime.datetime.strptime(naive, "%Y-%m-%dT%H:%M:%S.%f")


def _edition(site_full, site):
    """Return the part of ``site_full`` that precedes ``site``, without ``www``."""
    if site and site_full.endswith(site):
        prefix = site_full[:len(site_full) - len(site)]
    else:
        # ``site`` is not a suffix of the host: keep the whole host name.
        prefix = site_full
    prefix = prefix.strip('.')
    if prefix.startswith('www.'):
        return prefix[4:]
    if prefix == 'www':
        return ''
    return prefix


def process_file(data):
    """Convert one parsed JSON document into a CSV destination and a row.

    :param data: mapping with the keys ``crawled``, ``published``, ``url``,
        ``title``, ``text`` and ``thread`` (itself holding ``site`` and
        ``site_full``)
    :returns: ``(csv_path, row)``. ``csv_path`` is
        ``DEST/<YYYY-MM>/<YYYY-MM>-<site>.csv`` and ``row`` is
        ``(datetime, url, edition, topic, title, text)``
    :raises KeyError: if a required key other than ``published`` is missing
    :raises ValueError: if a timestamp does not match the expected format
    """
    crawled = _parse_timestamp(data['crawled'])
    # Fall back to the crawl time when no publication time is given. A parsed
    # datetime is always truthy, so the fallback has to be decided on the raw
    # value before parsing.
    raw_published = data.get('published')
    published = _parse_timestamp(raw_published) if raw_published else None
    dt = published or crawled

    thread = data['thread']
    site = thread['site']
    parts = urlparse(data['url'])
    edition = _edition(thread['site_full'], site)

    ym = f"{dt.year}-{dt.month:02d}"
    fn = DEST / ym / f'{ym}-{site}.csv'
    return fn, (
        dt,
        data['url'],
        edition,
        get_topic(parts.path),
        data['title'],
        data['text']
    )

In [4]:
from collections import defaultdict

# Group the converted rows by the CSV file they will be written to.
items = defaultdict(list)
for fpath in tqdm.tqdm_notebook(FILES):
    # Close every source file as soon as it is parsed instead of leaving the
    # handle open until the garbage collector reclaims it.
    with open(fpath, 'rb') as src:
        data = json.load(src)
    fn, row = process_file(data)
    items[fn].append(row)

HBox(children=(IntProgress(value=0, max=291584), HTML(value='')))




In [5]:
# Write one CSV per destination path, with rows in sorted order.
for fn, rows in tqdm.tqdm_notebook(sorted(items.items(), key=lambda x: x[0])):
    # The per-month directory may not exist yet on a fresh run.
    fn.parent.mkdir(parents=True, exist_ok=True)
    # newline='' lets the csv module control line endings, so embedded
    # newlines in fields survive. utf-8 keeps the output independent of the
    # platform's default encoding.
    with open(fn, 'w', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        w.writerow(("datetime", "url", "edition", "topics", "title", "text"))
        w.writerows(sorted(rows))

HBox(children=(IntProgress(value=0, max=305), HTML(value='')))


