In [1]:
import polars as pl
from tqdm import tqdm
from typing import Dict, Any

base = '../data/Monant/data/{}.csv'
relevant_cols = {
    "articles": ['id', 'title', 'body', 'source_id', 'published_at'],
    "claims": ['id', 'statement', 'description', 'rating', 'created_at'],
    "fact_checking_articles": ['id', 'claim', 'description', 'rating', 'source_id', 'published_at'],
    "sources": ['id', 'name'],
    "entity_annotations": ['id', 'annotation_type_id', 'entity_type', 'entity_id', 'value'],
    "relation_annotations": ['id', 'annotation_type_id', 'source_entity_type', 'source_entity_id', 'target_entity_type', 'target_entity_id', 'value']
}

data: Dict[str, pl.DataFrame] = {}
for entity, cols in tqdm(relevant_cols.items()):
    data[entity] = pl.read_csv(base.format(entity), columns=cols)

lookup: Dict[str, Dict[int, Dict[str, Any]]] = {
    name: {row["id"]: row for row in df.to_dicts()} for name, df in tqdm(data.items())
}

100%|██████████| 6/6 [00:08<00:00,  1.42s/it]
100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


In [2]:
print("Articles Columns:", ', '.join(data['articles'].columns), sep='\n', end='\n\n')
print("Claims Columns:", ', '.join(data['claims'].columns), sep='\n', end='\n\n')
print("Fact Checking Articles Columns:", ', '.join(data['fact_checking_articles'].columns), sep='\n', end='\n\n')
print("Sources Columns:", ', '.join(data['sources'].columns), sep='\n', end='\n\n')
print("Entity Annotations Columns:", ', '.join(data['entity_annotations'].columns), sep='\n', end='\n\n')
print("Relation Annotations Columns:", ', '.join(data['relation_annotations'].columns), sep='\n', end='\n\n')

Articles Columns:
id, title, body, source_id, published_at

Claims Columns:
id, statement, description, rating, created_at

Fact Checking Articles Columns:
id, claim, description, rating, source_id, published_at

Sources Columns:
id, name

Entity Annotations Columns:
id, annotation_type_id, entity_type, entity_id, value

Relation Annotations Columns:
id, annotation_type_id, source_entity_type, source_entity_id, target_entity_type, target_entity_id, value



In [3]:
dfs = {}

for annotation_type_id, group in data['relation_annotations'].group_by('annotation_type_id'):
    rs = []
    for row in tqdm(group.to_dicts(), desc=f"Group {annotation_type_id[0]}"):
        source_type = row.pop('source_entity_type')
        try:
            source = lookup[source_type][row.pop('source_entity_id')].copy()
        except KeyError: continue
        source = {f'{source_type}_{key}': v for key, v in source.items()}
        target_type = row.pop('target_entity_type')
        try:
            target = lookup[target_type][row.pop('target_entity_id')].copy()
        except KeyError: continue
        target = {f'{target_type}_{key}': v for key, v in target.items()}
        value_dict = row.pop('value').replace('null', 'None')
        value = eval(value_dict)
        if value: value = value['value']
        else: value = None
        new_row = {**row, **source, **target, 'value': value}

        rs.append(new_row)
    dfs[annotation_type_id[0]] = pl.DataFrame(rs)

for annotation_type_id, group in data['entity_annotations'].group_by('annotation_type_id'):
    rs = []
    if annotation_type_id[0] == 6: continue
    for row in tqdm(group.to_dicts(), desc=f"Group {annotation_type_id[0]}"):
        source_type = row.pop('entity_type')
        try:
            source = lookup[source_type][row.pop('entity_id')].copy()
        except KeyError: continue
        source = {f'{source_type}_{key}': v for key, v in source.items()}
        value_dict = row.pop('value').replace('null', 'None')
        value = eval(value_dict)
        if value: value = value['value']
        else: value = None
        new_row = {**row, **source, 'value': value}

        rs.append(new_row)
    dfs[annotation_type_id[0]] = pl.DataFrame(rs)

# 6 is a special case
rs = []
for row in tqdm(data['entity_annotations'].filter(pl.col('annotation_type_id') == 6).to_dicts(), desc=f"Group 6"):
    source_type = row.pop('entity_type')
    try:
        source = lookup[source_type][row.pop('entity_id')].copy()
    except KeyError: continue
    source = {f'{source_type}_{key}': v for key, v in source.items()}
    value_dict = row.pop('value').replace('null', 'None')
    value = eval(value_dict)['claims']
    for claim in value:
        try:
            target = lookup['claims'][claim.pop('claim_id')].copy()
        except KeyError: continue
        target = {f'claims_{key}': v for key, v in target.items()}
        new_row = {**row, **source, **target, **claim}
        rs.append(new_row)
dfs[6] = pl.DataFrame(rs)

Group 3: 100%|██████████| 416899/416899 [00:11<00:00, 35501.61it/s]
Group 4: 100%|██████████| 3292/3292 [00:00<00:00, 30167.20it/s]
Group 2: 100%|██████████| 417216/417216 [00:12<00:00, 33097.98it/s]
Group 1: 100%|██████████| 70/70 [00:00<00:00, 8223.20it/s]
Group 6: 100%|██████████| 110343/110343 [00:02<00:00, 48737.47it/s]


In [4]:
import os
os.makedirs('../data/processed', exist_ok=True)

for annotation_type, df in dfs.items():
    print(f"Writing {annotation_type}")
    df.write_csv(f'../data/processed/monant_{annotation_type}.csv')

Writing 3
Writing 4
Writing 2
Writing 1
Writing 6


In [48]:
# Preprocessing

# 1. Merge both for a single-call evaluation
df2 = dfs[2].rename({'value':'presence', 'id': '2_id'})
df3 = dfs[3].rename({'value':'stance', 'id': '3_id'})

df = df2.join(df3, on=["articles_id", "claims_id"], how="inner")
df = df.select([col for col in df.columns if not col.endswith("_right")])

# 2. Unify presence values
def map_presence(value):
    if value in ["yes", "present", "suggestive"]: return "yes"
    elif value in ["no", "not-present"]: return "no"
    return value

df = df.with_columns(df['presence'].map_elements(map_presence).alias('presence'))

# 3. Remove not-determined-yet presence and stance
df = df.filter(pl.col('presence') != 'not-determined-yet')
df = df.filter(pl.col('stance') != 'not-determined-yet')

df.write_csv('../data/processed/monant.csv')

  df = df.with_columns(df['presence'].map_elements(map_presence).alias('presence'))


In [None]:
# all rows where 'articles_id' and 'claims_id' are duplicated
pair_counts = df.group_by(["articles_id", "claims_id"]).count()
valid_pairs = pair_counts.filter(pair_counts["count"] > 1).select(["articles_id", "claims_id"])
df.join(valid_pairs, on=["articles_id", "claims_id"])

  pair_counts = df.group_by(["articles_id", "claims_id"]).count()


2_id,annotation_type_id,target_entity_type,articles_id,articles_title,articles_body,articles_source_id,articles_published_at,claims_id,claims_statement,claims_description,claims_rating,claims_created_at,presence,3_id,stance
i64,i64,str,i64,str,str,i64,str,i64,str,str,str,str,str,i64,str
594146,2,"""claims""",959847,"""Could Dirty Air Help Speed Alz…","""By Amy Norton HealthDay Report…",186,"""2020-12-03 08:00:00+01""",1872,"""Is air pollution linked to gre…","""<p><a href=""https://www.thegua…","""true""","""2019-12-13 14:29:38.661545+01""","""yes""",594230,"""supporting"""
594146,2,"""claims""",959847,"""Could Dirty Air Help Speed Alz…","""By Amy Norton HealthDay Report…",186,"""2020-12-03 08:00:00+01""",1872,"""Is air pollution linked to gre…","""<p><a href=""https://www.thegua…","""true""","""2019-12-13 14:29:38.661545+01""","""yes""",902823,"""supporting"""
594129,2,"""claims""",959847,"""Could Dirty Air Help Speed Alz…","""By Amy Norton HealthDay Report…",186,"""2020-12-03 08:00:00+01""",1870,"""Does air pollution cause Alzhe…","""<p>An article published in Sep…","""true""","""2019-12-13 14:29:38.422382+01""","""yes""",594213,"""supporting"""
594129,2,"""claims""",959847,"""Could Dirty Air Help Speed Alz…","""By Amy Norton HealthDay Report…",186,"""2020-12-03 08:00:00+01""",1870,"""Does air pollution cause Alzhe…","""<p>An article published in Sep…","""true""","""2019-12-13 14:29:38.422382+01""","""yes""",902822,"""supporting"""
595508,2,"""claims""",962392,"""Are Scientists Close to a 'Uni…","""Scientists say they may be get…",186,"""2020-12-09 08:00:00+01""",4593,"""Is the Flu Vaccine effective?""","""<p>I read an <a href=""https://…","""true""","""2019-12-13 14:35:49.362022+01""","""yes""",595689,"""neutral"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
571395,2,"""claims""",922893,"""FluMist Vs. Flu Shots for Kids…","""While getting a flu vaccine ea…",221,"""2020-10-18 11:18:30+02""",6292,"""The flu shot causes false posi…","""Claim that flu shot causes fal…","""false""","""2020-04-16 20:22:31.611459+02""","""yes""",571625,"""contradicting"""
575415,2,"""claims""",931121,"""VACCINES KILL: Study finds pos…",""" (Natural News)  An Oct. 1…",145,"""2020-11-03 00:00:00+01""",6279,"""flu vaccine increases risk of …","""Claim that flu vaccine increas…","""false""","""2020-04-16 20:22:29.90673+02""","""yes""",902808,"""supporting"""
575415,2,"""claims""",931121,"""VACCINES KILL: Study finds pos…",""" (Natural News)  An Oct. 1…",145,"""2020-11-03 00:00:00+01""",6279,"""flu vaccine increases risk of …","""Claim that flu vaccine increas…","""false""","""2020-04-16 20:22:29.90673+02""","""yes""",577309,"""contradicting"""
586475,2,"""claims""",946059,"""Aducanumab isn&#8217;t the sim…","""This year’s Clinical Trials on…",165,"""2019-12-20 04:45:59+01""",449,"""Is alzheimers genetic?""","""<p><br></p>""","""true""","""2019-12-13 14:26:32.150536+01""","""yes""",902755,"""supporting"""
