In [0]:
import pandas as pd
import re

### Read in files

In [0]:
# Characters stripped from the raw CSV headers. The character class also
# contains ",", so commas are removed together with "#", "@" and "&".
HEADER_JUNK = '[#,@,&]'

# regex=True is passed explicitly: from pandas 2.0 on, str.replace treats the
# pattern as a literal string by default, which would leave the headers
# untouched and break the renames/merges that follow.
evidence = pd.read_csv("evidence.csv")
evidence.columns = evidence.columns.str.replace(HEADER_JUNK, '', regex=True)

# Rename the issue key/label columns so they line up with the evidence frame.
issues = pd.read_csv("issues.csv")
issues.columns = issues.columns.str.replace(HEADER_JUNK, '', regex=True)
issues = issues.rename(columns={"id": "issueIds", "nameTranslated.en": "issueName"})

# Same treatment for themes.
themes = pd.read_csv("themes.csv")
themes.columns = themes.columns.str.replace(HEADER_JUNK, '', regex=True)
themes = themes.rename(columns={"id": "themeIds", "nameTranslated.en": "themeName"})

In [0]:
# Display the evidence column labels (after the header character stripping).
evidence.columns

### Merge files

In [0]:
# Left join: keep every evidence row and attach the issue name via issueIds.
df = pd.merge(
    evidence,
    issues[['issueName', 'issueIds']],
    on=['issueIds'],
    how="left",
)

In [0]:
# Left join: keep every row of df and attach the theme name via themeIds.
df = pd.merge(
    df,
    themes[['themeName', 'themeIds']],
    on=['themeIds'],
    how="left",
)

In [0]:
# function to clean text
def clean_text(df, text):
    """
    Normalise a free-text column in place and return the dataframe.

    The column is lowercased and then passed through an ordered list of
    regex substitutions (URLs, backslash fragments, newlines, picture and
    blog/map links, hashtags, "&amp;", apostrophes, remaining non-word
    characters, whitespace runs).

    df = dataframe; its `text` column is overwritten with the cleaned values
    text (string) = column name containing text

    Returns the same dataframe object that was passed in.

    Missing values are left as missing instead of raising, because the
    substitutions go through the pandas `.str` accessor rather than calling
    `re.sub` on every cell.
    """
    # Order matters: links must be removed before punctuation is stripped,
    # otherwise their pieces would survive as ordinary words.
    # Patterns are compiled so pandas always applies them with Python's `re`
    # semantics (Unicode-aware \w), whatever string dtype backs the column.
    substitutions = [
        # URLs
        (re.compile(r"https?://\S*"), " "),
        # URL cutoffs: a backslash and whatever is attached to it. The
        # original non-raw pattern '\\[^\s]*' compiled to a regex that could
        # never match, so this step silently did nothing.
        (re.compile(r"\\\S*"), " "),
        # newlines
        (re.compile(r"\n"), " "),
        # picture URLs (dots escaped so they only match literal dots)
        (re.compile(r"pic\.twitter\.com/\S*"), " "),
        # blog/map type links
        (re.compile(r"blog/maps/info/\S*"), " "),
        # hashtags
        (re.compile(r"#\w*"), ""),
        # HTML-escaped ampersands
        (re.compile(r"&amp;"), ""),
        # single quotation marks: straight and typographic. The original had
        # the straight-quote line twice; the second was presumably meant for
        # the curly apostrophe -- NOTE(review): confirm.
        (re.compile("['\u2018\u2019]"), ""),
        # anything that is not a word character (\d is already part of \w)
        (re.compile(r"\W"), " "),
        # collapse every whitespace run to one space. The original deleted
        # long runs outright, which glued neighbouring words together.
        (re.compile(r"\s+"), " "),
    ]

    cleaned = df[text].str.lower()
    for pattern, replacement in substitutions:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    df[text] = cleaned
    return df

In [0]:
# Clean the "text" column; clean_text overwrites the column and returns df.
df = clean_text(df, "text")

In [0]:
# Preview the first rows of the merged, cleaned frame.
df.head()

In [0]:
# Count missing values in the text column. The original indexed the pandas
# module (`pd['text']`), which raises TypeError; the dataframe `df` was meant.
df['text'].isna().sum()

In [0]:
# Show the text column. The original indexed the pandas module
# (`pd['text']`), which raises TypeError; the dataframe `df` was meant.
df['text']

In [0]:
# Write df to amp.csv, leaving the index out of the file.
df.to_csv("amp.csv", index = False)

In [0]:
# Load covid_lies.csv from the ucinlp/covid19-data GitHub repository (master branch) into cl.
cl = pd.read_csv("https://raw.githubusercontent.com/ucinlp/covid19-data/master/covid_lies.csv")