In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# replace file path with custom one if necessary
df = pd.read_csv('preprocessed_df.csv')

In [3]:
entity_string = 'Covid'

In [4]:
# define set of synonyms and other keywords which will be searched for in the claim texts and entities 
# note: synonyms or keywords that contain the entitiy string, i.e. 'covid', don't need to be included,
# because they will be found while scanning for the entity string itself
synonyms = [
    "Corona", 
    'vaccin',
    'lockdown'
    'China Virus',
    'Wuhan Virus',
    'pandem',
    ]

In [5]:
# define words that are keyword itself or present in synonyms that might be included in some synonyms
minimal_words = ['Covid']
# discard the synonyms that contain one of the minimal words
new_synonyms = list(filter(lambda s: not any(mw.lower() in s.lower() for mw in minimal_words), synonyms))
# split synonyms into words
# note: ngrams comparison approach only needed for efficient check for multiple different synonyms. with only two synonyms it's okay to just check for substring matching
splitted_synonyms = list(map(lambda s: s.split(' '), synonyms))
# determine length of longest synonym to know how long ngrams will need to be
ngrams_length = max(list(map(lambda words_array: len(words_array), splitted_synonyms)))

In [6]:
def filter_df(row):
    # check if entity string is present in string of all entities
    if type(row.entity_strings) == str and entity_string.lower() in row.entity_strings.lower():
        return True
    # check if entity string is present in text
    elif entity_string.lower() in row.text.lower():
        return True
    # check if any synonym is present in string of all entities
    elif type(row.entity_strings) == str and any(synonym.lower() in row.entity_strings.lower() for synonym in new_synonyms):
        return True
    # check if any synonym is present in text
    elif any(synonym.lower() in row.text.lower() for synonym in new_synonyms):
        return True
    else:
        return False

In [7]:
covid_df = df[df.apply(filter_df, axis=1)]

In [8]:
len(covid_df)

5274

In [9]:
covid_df.to_csv('covid_df.csv')