In [1]:
import datasets

# Show which version of the `datasets` library this notebook was run with.
print(f"Running on datasets {datasets.__version__}v")

Running on datasets 1.8.0v


In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import unicodedata
import re
import spacy

from datasets import load_dataset

from tqdm.notebook import tqdm
from IPython.display import clear_output

In [3]:
# Load the "regular" configuration of the "health_fact" dataset.
ds = load_dataset("health_fact", "regular")
# Wipe whatever the load step wrote to this cell's output area.
clear_output()

In [4]:
# Materialise the three dataset splits as pandas DataFrames (train, test, validation order).
train, test, val = (
    ds[split_name].to_pandas()
    for split_name in ('train', 'test', 'validation')
)

train.shape, test.shape, val.shape

((9832, 9), (1235, 9), (1225, 9))

In [5]:
def _keep_numeric_claim_ids(split_df):
    """Return the rows of ``split_df`` whose ``claim_id`` starts with a digit 0-9.

    The input frame is left untouched; the result is a new frame whose index
    is renumbered from 0. Rows with a missing ``claim_id`` are treated as
    non-numeric and dropped (``na=False``).
    """
    # `str.match` anchors at the start of the string, so this is a
    # "starts with an ASCII digit" test.
    starts_with_digit = split_df['claim_id'].str.match('[0-9]', na=False)
    return split_df[starts_with_digit].reset_index(drop=True)


# Rebind instead of mutating in place so the cell can be re-run safely.
train, test, val = (_keep_numeric_claim_ids(split) for split in (train, test, val))

train.shape, test.shape, val.shape

((9814, 9), (1235, 9), (1217, 9))

In [6]:
# Columns that are not needed for the claim-level CSVs.
cols_to_drop = ['date_published', 'explanation', 'fact_checkers', 'main_text', 'sources', 'subjects']

# For every split: drop the unused columns and expose the claim text as "sentence".
claim_train_df, claim_test_df, claim_val_df = (
    split.drop(columns=cols_to_drop).rename(columns={"claim": "sentence"})
    for split in (train, test, val)
)

# Write the frames out in the same order as before: train, validation, test.
for frame, csv_path in (
    (claim_train_df, '../data/claim_train_df.csv'),
    (claim_val_df, '../data/claim_val_df.csv'),
    (claim_test_df, '../data/claim_test_df.csv'),
):
    frame.to_csv(csv_path, index=False)

In [7]:
def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's 'html.parser' and return its text content."""
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def remove_accented_chars(text):
    """Reduce ``text`` to plain ASCII.

    The string is NFKD-normalised (so accented letters decompose into a base
    letter plus combining marks), then every character that cannot be encoded
    as ASCII is silently discarded.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_special_characters(text):
    """Strip every character that is not a letter, digit, whitespace or basic punctuation.

    Kept characters: ``a-z``, ``A-Z``, ``0-9``, whitespace and ``. , ! ? / : ; " '``.
    Everything else is removed (not replaced), so surrounding whitespace is
    left as it was.

    The upper-case range is ``A-Z``; a range of ``A-z`` would also admit the
    ASCII characters that sit between ``Z`` and ``a`` (``[ \\ ] ^ _`` and the
    backtick).
    """
    return re.sub(r'[^a-zA-Z0-9.,!?/:;\"\'\s]', '', text)

def remove_extra_whitespace_tabs(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends.

    Tabs and newlines count as whitespace, so multi-line input comes back as
    a single line.
    """
    # One `\s+` pattern is enough: leading and trailing runs each become a
    # single space that `.strip()` then removes.
    return re.sub(r'\s+', ' ', text).strip()

def preprocess(text):
    """Run ``text`` through the cleaning steps and return the cleaned string.

    Steps, applied in order: strip HTML tags, drop accented characters,
    drop special characters, normalise whitespace.
    """
    cleaning_steps = (
        remove_html_tags,
        remove_accented_chars,
        remove_special_characters,
        remove_extra_whitespace_tabs,
    )
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned

def create_dataframe(df, nlp=None):
    """Split each row's text into sentences and return one output row per sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``claim_id`` column and a ``sentence`` column holding the
        text to segment (a string per row).
    nlp : callable, optional
        A loaded spaCy pipeline used for sentence segmentation. When omitted,
        ``spacy.load('en')`` is called, as before. Passing a pipeline lets
        several calls share one loaded model.
        NOTE(review): the ``'en'`` shortcut name is a spaCy 2.x convention;
        on spaCy 3 pass a pipeline loaded from a full package name instead.

    Returns
    -------
    pandas.DataFrame
        Columns ``claim_id``, ``sent_id`` (0-based position of the sentence
        within its source row) and ``sentence`` (the sentence text as ``str``).
        Rows whose text yields no sentences contribute nothing; if no row
        yields a sentence the result is an empty frame with the same columns.
    """
    if nlp is None:
        nlp = spacy.load('en')

    # Collect plain tuples and build the frame once at the end; appending to a
    # DataFrame inside the loop re-copies all accumulated rows every iteration.
    records = []
    for claim_id, text in tqdm(zip(df['claim_id'], df['sentence']), total=len(df)):
        doc = nlp(text.strip())
        # Keep only the sentence text: holding on to the Span objects would
        # keep every parsed Doc alive for as long as the frame exists.
        records.extend(
            (claim_id, sent_id, sent.text)
            for sent_id, sent in enumerate(doc.sents)
        )
    return pd.DataFrame(records, columns=['claim_id', 'sent_id', 'sentence'])

def _main_text_frame(split_df):
    """Return a new frame for one split with metadata dropped and ``main_text`` cleaned.

    The input frame is not modified. The ``main_text`` column is renamed to
    ``sentence`` and every value is passed through ``preprocess``.
    """
    cols_to_drop = ['date_published', 'explanation', 'fact_checkers', 'claim', 'sources', 'subjects', 'label']
    frame = (
        split_df
        .drop(columns=cols_to_drop)
        .rename(columns={"main_text": "sentence"})
    )
    frame['sentence'] = frame['sentence'].apply(preprocess)
    return frame


def generate_main_text_dataframe(main_train, main_test, main_val):
    """Build sentence-level frames from the ``main_text`` of each split.

    Each input frame is reduced and cleaned on a copy, then handed to
    ``create_dataframe`` for sentence splitting. The inputs themselves are
    left unchanged, so the function can be called again on the same frames.

    Parameters
    ----------
    main_train, main_test, main_val : pandas.DataFrame
        Split frames containing ``main_text`` plus the metadata columns that
        get dropped (``date_published``, ``explanation``, ``fact_checkers``,
        ``claim``, ``sources``, ``subjects``, ``label``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df, val_df)`` as produced by ``create_dataframe``.
    """
    train_df, test_df, val_df = (
        create_dataframe(_main_text_frame(split))
        for split in (main_train, main_test, main_val)
    )
    return train_df, test_df, val_df

In [8]:
# Build the sentence-level main-text frames for all three splits
# (one progress bar is shown per split).
main_text_train_df, main_text_test_df, main_text_val_df = generate_main_text_dataframe(train, test, val)

HBox(children=(FloatProgress(value=0.0, max=9814.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1235.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1217.0), HTML(value='')))




In [29]:
# Persist the sentence-level frames as CSV (no index column), in train, test, validation order.
for frame, csv_path in (
    (main_text_train_df, '../data/main_text_train_df.csv'),
    (main_text_test_df, '../data/main_text_test_df.csv'),
    (main_text_val_df, '../data/main_text_val_df.csv'),
):
    frame.to_csv(csv_path, index=False)