In [None]:
# @formatter:off
import random

import numpy as np
import sklearn.ensemble
%load_ext autoreload
%autoreload 2
# @formatter:on
import os

# Print the kernel's current working directory, then move one level up.
# Presumably this makes the relative 'data/...' paths used later resolve — confirm.
# NOTE(review): not idempotent — every re-run of this cell climbs one more directory.
print(os.getcwd())
os.chdir('../')

In [None]:
# @formatter:off
# IPython shell capture: `files` becomes the list of output lines of `ls`,
# i.e. one entry per path matching data/*csv.jsonl.
files = !ls data/*csv.jsonl
# @formatter:on

In [None]:
# Keep only the raw inputs: drop every path that mentions 'sample',
# 'augmented' or 'processed', then check that exactly 8 files remain.
files = [
    f for f in files
    if not any(marker in f for marker in ('sample', 'augmented', 'processed'))
]
assert len(files) == 8

In [None]:
import json
from tqdm.notebook import tqdm
from unidecode import unidecode


def load_json(d):
    """Parse a single JSON document from ``d``.

    Parsing is done with ``strict=False``. When that fails with a JSON decoding
    error, the offending input and its ``unidecode`` transliteration are printed
    and the transliterated text is parsed instead.

    :param d: the JSON text to parse.
    :return: the decoded Python object.
    :raises json.JSONDecodeError: if the transliterated text cannot be parsed either.

    Errors that are not decoding errors (for example ``TypeError`` for a
    non-string input) are not retried and propagate to the caller unchanged.
    """
    try:
        return json.loads(d, strict=False)
    except json.JSONDecodeError:
        # Only genuine parse failures are worth retrying on transliterated text.
        print("FEK")
        print(d)
        transliterated = unidecode(d)
        print(transliterated)
        return json.loads(transliterated, strict=False)


def load_jsonl(path: str):
    """Read a JSON-lines file and return its decoded records as a list.

    :param path: path of the ``.jsonl`` file to read.
    :return: one decoded object per non-blank line, in file order.

    The file is opened as UTF-8 explicitly so the result does not depend on the
    platform's default encoding. Lines that contain only whitespace are skipped
    instead of being handed to the JSON parser.
    """
    print("loading", path)
    with open(path, encoding="utf-8") as f:
        lines = [line for line in f if line.strip()]
    # A list (not a generator) is passed on so the progress bar knows the total.
    return [load_json(line) for line in tqdm(lines)]


# Load every selected file; datasets[i] holds the records parsed from files[i].
datasets = [load_jsonl(p) for p in files]


In [None]:
from bs4 import BeautifulSoup


def fix_bbc(d):
    """Rebuild ``text_article`` and ``links_article`` from the record's ``html_article``.

    Only elements with ``data-component="text-block"`` are considered, and a
    block is dropped when its lower-cased text is exactly one of two boilerplate
    phrases. The texts of the remaining blocks are joined with newlines; the
    links are the ``href`` values of the anchors inside those blocks.

    :param d: a record dict with an ``html_article`` entry.
    :return: the same dict, modified in place.
    """
    # NOTE(review): no parser is named, so bs4 uses whichever one it finds installed.
    soup = BeautifulSoup(d['html_article'])
    boilerplate = ("read more from reality check", "send us your questions")
    blocks = [t for t in soup.find_all(attrs={'data-component': 'text-block'})
              if t.text.lower() not in boilerplate]
    article = '\n'.join(block.text for block in blocks)
    # href=True restricts the search to anchors that carry an href attribute,
    # so an <a> without one no longer raises KeyError.
    links = [a['href'] for block in blocks for a in block.find_all(name='a', href=True)]
    d['text_article'] = article
    d['links_article'] = links
    return d


# Re-extract text and links for dataset 0.
# Presumably index 0 is the BBC file; this depends on the order of `files` — verify.
datasets[0] = [fix_bbc(d) for d in datasets[0]]

In [None]:
def fix_factchck(d):
    """Rebuild ``text_article`` from the ``<h3>`` and ``<p>`` elements of ``html_article``.

    All non-empty ``<h3>`` texts come first, followed by all non-empty ``<p>``
    texts, joined with newlines. Links are not extracted. The record is
    modified in place and returned.
    """
    soup = BeautifulSoup(d['html_article'])
    headings = [h for h in soup.find_all(name='h3') if h.text]
    paragraphs = [p for p in soup.find_all(name='p') if p.text]
    d['text_article'] = '\n'.join(node.text for node in headings + paragraphs)
    return d


# Store the cleaned records back into the slot they were read from. Writing
# them to datasets[0] would replace the records kept in that slot with this
# dataset's records.
# NOTE(review): confirm that index 3 is the dataset fix_factchck is meant for;
# the index depends on the order of `files`.
datasets[3] = [fix_factchck(d) for d in datasets[3]]

In [None]:
import bs4
from copy import deepcopy


def fix_snopes(d):
    """Return a copy of ``d`` whose ``text_article`` is rebuilt from ``html_article``.

    The new text is the stripped text of every ``<p>`` element, joined with
    newlines, after all ``<script>`` elements have been removed from the tree.

    :param d: a record dict with an ``html_article`` entry; it is left untouched.
    :return: a deep copy of ``d`` with ``text_article`` replaced.
    """
    new_d = deepcopy(d)
    soup = BeautifulSoup(d['html_article'])
    # extract() detaches the element it is called on and does not take a tag
    # name, so calling it on the soup with 'script' removes nothing. Find the
    # script elements explicitly and delete each one.
    for script in soup.find_all(name='script'):
        script.decompose()
    paragraphs = soup.find_all(name='p')
    new_d['text_article'] = '\n'.join(p.text.strip() for p in paragraphs)
    return new_d


# print(datasets[6][1336]['text_article'])
# print(20*'----')
# Replace dataset 6 with cleaned copies of its records.
# Presumably index 6 is the Snopes file; this depends on the order of `files` — verify.
datasets[6] = [fix_snopes(d) for d in datasets[6]]

In [None]:
def fix_wp(d):
    """Return a copy of ``d`` whose ``text_article`` is rebuilt from ``html_article``.

    Elements whose ``data-qa`` attribute is ``drop-cap-letter`` or
    ``article-header`` are collected. An element is skipped when its lower-cased
    text equals one of two boilerplate phrases or starts with one of four
    boilerplate prefixes; the remaining texts are joined with newlines.
    The input record itself is not modified.
    """
    skipped_exact = ("read more from reality check", "send us your questions")
    skipped_prefixes = ("send us facts to check", "sign up for the fact checker", "the fact checker is", "(about our rating")

    new_d = deepcopy(d)
    soup = BeautifulSoup(d['html_article'])
    kept = []
    for node in soup.find_all(attrs={'data-qa': ['drop-cap-letter', 'article-header']}):
        lowered = node.text.lower()
        if lowered in skipped_exact or lowered.startswith(skipped_prefixes):
            continue
        kept.append(node.text)
    new_d['text_article'] = '\n'.join(kept)
    return new_d


# Replace dataset 7 with cleaned copies of its records.
# Presumably index 7 is the Washington Post file; this depends on the order of `files` — verify.
datasets[7] = [fix_wp(d) for d in datasets[7]]

In [None]:
# Debug inspection: dump the raw HTML of one record (index 1336 of dataset 7).
# NOTE(review): the index is hard-coded and raises IndexError if that dataset
# has fewer than 1337 records.
print(datasets[7][1336]['html_article'])

In [None]:
# Show which fields the first record of dataset 0 carries.
datasets[0][0].keys()

In [None]:
# Maps each coarse label to the normalised fine-grained ratings that belong to it.
# NOTE(review): 'lacks context' is listed under both FALSE and MISLEADING, and
# 'partly false' under both HALF and HARDLY. A lookup that scans this dict in
# insertion order returns the first of the two (FALSE and HALF respectively).
coarse_map = {
    'TRUE': ['accurate', 'correct', 'true', 'legit'],
    'FALSE': ['inaccurate', 'unsupported', 'flawed reasoning', 'incorrect', 'lacks context', 'false', 'wrong', 'scam', 'falso', 'fake',
              'manipulated image', 'altered video', 'doctored image', 'hoax', 'faux', 'altered image', 'pants on fire', 'full flop',
              # These must be two separate entries: without the comma Python
              # concatenates the adjacent literals into 'legendfour pinocchios'.
              'legend', 'four pinocchios'],
    'MISLEADING': ['misleading', 'lacks context', 'missing context', 'misleading context', 'misattributed', 'out of context', 'exaggerated',
                   'exaggeration', 'unsubstantiated', 'outdated'],
    'ALMOST': ['imprecise', 'mostly correct', 'mostly accurate', 'correct but…', 'mostly true', 'lacks evidence', 'largely correct',
               'largely accurate', 'close to accurate', 'one pinocchio'],
    'HALF': ['mixture', 'mixed', 'half true', 'partly false', 'half-right, half-wrong', 'half flip', 'partially accurate', 'two pinocchios',
             'half flop'],
    'HARDLY': ['partly false', 'mostly false', 'three pinocchios'],
    'SATIRE': ['satire', 'false satire', 'april fool', 'originated as satire', 'labelled satire']
}

def replace_label(label):
    """Translate a normalised rating into its coarse label.

    The groups of ``coarse_map`` are checked in insertion order and the key of
    the first group containing ``label`` is returned; ``'UNKNOWN'`` is returned
    when no group contains it.
    """
    hits = (coarse for coarse, fine_labels in coarse_map.items() if label in fine_labels)
    return next(hits, 'UNKNOWN')


def process_label(label):
    """Normalise a raw textual rating before it is mapped to a coarse label.

    The rating is converted to a lower-cased string, a fixed sequence of
    substring replacements is applied in order, and the result is stripped.
    For results shorter than 40 characters two shortcuts apply: anything
    starting with ``'no'`` or ``'false'`` becomes ``'false'``, and otherwise
    anything containing ``'misleading'`` or ``'context'`` becomes ``'misleading'``.

    :param label: the raw rating; any type is accepted and stringified.
    :return: the normalised rating, or ``''`` when ``label`` is ``None``.
    """
    # A missing rating must not be stringified: 'none' starts with 'no' and
    # would be turned into 'false' by the prefix shortcut below.
    if label is None:
        return ''

    # Order matters: each replacement sees the output of the previous one.
    replacements = (
        ('_', ' '), ('- ', ''), ('.', ''), ('this', ''), ('fasle', 'false'),
        ('may be', ''), ('the', ''), ('claim', ''), (' is ', ' '), ('just', ''), ('flat', ''),
    )
    label = str(label).lower()
    for old, new in replacements:
        label = label.replace(old, new)
    label = label.strip()

    if len(label) < 40:
        if label.startswith(('no', 'false')):
            return 'false'
        if 'misleading' in label or 'context' in label:
            return 'misleading'
    return label

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


def barh_from_counter(ctrs):
    """Draw a horizontal bar chart of coarse-label counts, one series per entry of ``ctrs``.

    ``ctrs`` maps a series name to a mapping from label to count. Labels that
    are absent from a mapping are plotted as 0, and only the seven labels
    listed below are shown.
    """
    categories = ['TRUE', 'ALMOST', 'HALF', 'HARDLY', 'FALSE', 'MISLEADING', 'SATIRE']
    columns = {}
    for series_name, counts in ctrs.items():
        columns[series_name] = [counts.get(category, 0) for category in categories]
    frame = pd.DataFrame(columns, index=categories)
    frame.plot.barh()
    plt.show()


In [None]:
from collections import Counter


def label_distribution_ds(ds, name, cutoff=1):
    """Count the coarse labels of a dataset.

    Each record's ``cR_textualRating`` is normalised and mapped to a coarse
    label. The dataset name and the number of labels kept are printed.

    :param ds: iterable of record dicts.
    :param name: name printed alongside the result.
    :param cutoff: minimum count a label needs in order to be kept.
    :return: dict from label to count, most frequent first.
    """
    coarse_labels = (replace_label(process_label(record['cR_textualRating'])) for record in ds)
    ranked = Counter(coarse_labels).most_common()
    kept = {label: count for label, count in ranked if count >= cutoff}
    print('Dataset:', name)
    print(len(kept))
    return kept


# Collect the label distribution of every dataset except those whose path
# mentions 'fullfact' or 'bbc', then plot all of them in one chart.
ctrs = {}
for ds, f in zip(datasets, files):
    if 'fullfact' in f or 'bbc' in f:
        continue
    ctrs[f] = label_distribution_ds(ds, f)

barh_from_counter(ctrs)



In [None]:
def augment_labels(ds):
    """Add a coarse ``label`` entry to every record of ``ds``, in place.

    The label is derived from the record's ``cR_textualRating``.
    """
    for record in ds:
        raw_rating = record['cR_textualRating']
        record['label'] = replace_label(process_label(raw_rating))


# Label every dataset except those whose path mentions 'fullfact' or 'bbc'.
for ds, f in zip(datasets, files):
    if 'fullfact' in f or 'bbc' in f:
        continue
    augment_labels(ds)


In [None]:
import random


def does_output_make_sense(ds, name, k=5):
    """Print the raw rating and title of randomly sampled records for a manual spot check.

    :param ds: sequence of record dicts with ``cR_textualRating`` and ``cR_title``.
    :param name: dataset name printed as a header.
    :param k: number of records to sample; capped at ``len(ds)``.
    """
    print('Dataset:', name)
    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more than the dataset contains.
    sample_size = min(k, len(ds))
    for d in random.sample(ds, sample_size):
        print(' rating:', d['cR_textualRating'])
        print(' title:', d['cR_title'])
        print()
    print('----' * 20)


# Spot-check every dataset by printing 10 randomly chosen rating/title pairs.
for ds, f in zip(datasets, files):
    does_output_make_sense(ds, f, 10)


In [None]:
def augment_explanation(ds, concat=False):
    """Add an ``explanation`` entry to every record of ``ds``, in place.

    The explanation is the stringified ``cR_textualRating``. With
    ``concat=True`` the record's ``cR_title`` is appended, separated by ``'. '``.
    """
    for record in ds:
        rating = str(record['cR_textualRating'])
        record['explanation'] = '. '.join((rating, record['cR_title'])) if concat else rating


# Add explanations to every dataset. The title is appended to the rating only
# for files whose path contains 'climatefeedback' or 'factcheck'.
for ds, f in zip(datasets, files):
    augment_explanation(ds, concat=any(n in f for n in ['climatefeedback', 'factcheck']))


In [None]:
from handystuff.loaders import write_jsonl


def write_out_ds(ds, f):
    """Write ``ds`` as JSON lines next to its source file, under an ``augmented-`` name.

    The output path is ``f`` with every ``'data/'`` replaced by
    ``'data/augmented-'``; it is printed before the records are written.
    """
    target = f.replace('data/', 'data/augmented-')
    print(target)
    write_jsonl(ds, target)


# Write every dataset to its 'data/augmented-...' counterpart.
for ds, f in zip(datasets, files):
    write_out_ds(ds, f)


In [None]:
#@formatter:off
# Shell step: for each listed augmented file, `mlr` (presumably Miller — confirm) keeps only
# the fields text, text_article, label and explanation and writes the result to processed-<file> in data/.
# NOTE(review): the file list is hard-coded rather than derived from `files`.
# NOTE(review): IPython expands `$name` from the Python namespace before the shell runs, so a
# notebook variable named `i` would be substituted into this command — confirm none exists.
!cd data/ && for i in augmented-bbc.csv.jsonl augmented-climatefeedback.csv.jsonl augmented-factcheck.csv.jsonl augmented-fullfact.csv.jsonl augmented-nytimes.csv.jsonl augmented-politifact.csv.jsonl augmented-snopes.csv.jsonl augmented-washingtonpost.csv.jsonl; do cat $i | mlr --ijsonl --ojsonl cut -f text,text_article,label,explanation > processed-$i; done
#@formatter:on