In [1]:
import pandas as pd
import re

### Read in files

In [2]:
evidence = pd.read_csv("evidence.csv")
evidence.columns  = evidence.columns.str.replace('[#,@,&]', '')

issues = pd.read_csv("issues.csv")
issues.columns  = issues.columns.str.replace('[#,@,&]', '')
issues.rename(columns = {"id": "issueIds", "nameTranslated.en": "issueName"}, inplace = True)

themes = pd.read_csv("themes.csv")
themes.columns  = themes.columns.str.replace('[#,@,&]', '')
themes.rename(columns = {"id": "themeIds", "nameTranslated.en": "themeName"}, inplace = True)

  evidence.columns  = evidence.columns.str.replace('[#,@,&]', '')
  issues.columns  = issues.columns.str.replace('[#,@,&]', '')
  themes.columns  = themes.columns.str.replace('[#,@,&]', '')


In [4]:
evidence.columns

Index(['date', 'attachments', 'textTranslated', 'detectedLanguage',
       'likeCount', 'countries', 'authorId', 'url', 'platform', 'commentCount',
       'socialMediaPostType', 'shareCount', 'createdAt', 'submittedLanguage',
       'themeIds', 'issueIds', 'text', 'updatedAt', 'id', 'index', 'type'],
      dtype='object')

### Merge files

In [96]:
df = evidence.merge(issues[['issueName', 'issueIds']], how = "left", on = ['issueIds'])

In [98]:
df = df.merge(themes[['themeName', 'themeIds']], how = "left", on = ['themeIds'])

In [99]:
# function to clean text
def clean_text(df, text):
    """
    Clean text column
    df = dataframe
    text (string) = column name containing text
    """
    # lowercase text
    df[text] = df[text].str.lower()
 
    # remove URLs
    df[text] = df[text].map(lambda x: re.sub('http[s]?:\/\/[^\s]*', ' ', x))
 
    # remove URL cutoffs
    df[text] = df[text].map(lambda x: re.sub('\\[^\s]*', ' ', x))
 
    # remove spaces
    df[text] = df[text].map(lambda x: re.sub('\n', ' ', x))
 
    # remove picture URLs
    df[text] = df[text].map(lambda x: re.sub('pic.twitter.com\/[^\s]*', ' ', x))
 
    # remove blog/map type
    df[text] = df[text].map(lambda x: re.sub('blog\/maps\/info\/[^\s]*', ' ', x))
 
    # remove hashtags =
    df[text] = df[text].map(lambda x: re.sub("\#[\w]*", "", x))
 
    # remove and signs
    df[text] = df[text].map(lambda x: re.sub("\&amp;", "", x))
 
    # remove single quotations
    df[text] = df[text].map(lambda x: re.sub("'", "", x))
    df[text] = df[text].map(lambda x: re.sub("'", "", x))
 
    # remove characters that are not word characters or digits
    df[text] = df[text].map(lambda x: re.sub("[^\w\d]", " ", x))
 
    # remove all characters that are not letters
    #df[text] = df[text].map(lambda x: re.sub("[^a-zA-Z]", " ", x))
 
    # remove multiple spaces
    df[text] = df[text].map(lambda x: re.sub("\s{2,6}", " ", x))
 
    # drop duplicate rows
    #df.drop_duplicates(subset='text', keep='first', inplace=True)
 
    # remove multiple spaces
    df[text] = df[text].map(lambda x: re.sub("\s{3,20}", "", x))
 
    return df

In [102]:
df = clean_text(df, "text")

In [103]:
df.head()

Unnamed: 0,date,attachments,textTranslated,detectedLanguage,likeCount,countries,authorId,url,platform,commentCount,...,submittedLanguage,themeIds,issueIds,text,updatedAt,id,index,type,issueName,themeName
0,2023-06-03,2023/06/08/evidence/749a5972-c537-4f93-80de-ad...,[object Object],en,76.0,NG,rhysoneill@gmail.com,https://www.facebook.com/instablog9ja/posts/pf...,Facebook,9.0,...,en,e83rfYgBnMmOXZCbaYt2,g80Cf4gBnMmOXZCbX4tn,actor jamie foxx reportedly par lyzed and bl n...,2023-06-08T16:14:34.393Z,749a5972-c537-4f93-80de-adb0c73527a2,prod-evidence-v1,_doc,COVID-19,Vaccine Side Effects
1,2023-06-05,,[object Object],en,2.0,KE,rhysoneill@gmail.com,https://www.facebook.com/robertalai/posts/pfbi...,Facebook,,...,en,fs31fogBnMmOXZCbnosl,g80Cf4gBnMmOXZCbX4tn,use alot of fresh ginger when it strikes your ...,2023-06-08T17:13:15.540Z,11b135fd-d34f-4bc5-9e51-b4ed28f22a22,prod-evidence-v1,_doc,COVID-19,Home Remedies
2,2023-06-07,,[object Object],en,1.0,ZA,rhysoneill@gmail.com,https://www.facebook.com/permalink.php?story_f...,Facebook,,...,en,,g80Cf4gBnMmOXZCbX4tn,vaccine kills government pushed a unapproved v...,2023-06-08T18:24:00.164Z,9f635c18-49c4-4d26-ada9-5d6ae2706f1c,prod-evidence-v1,_doc,COVID-19,
3,2023-05-17,2023/06/08/evidence/af0d9542-930b-4101-a786-20...,[object Object],en,,"ZA,Africa",rhysoneill@gmail.com,https://t.me/SAAwakened/138313,Telegram,,...,en,es3rfYgBnMmOXZCbaYt2,g80Cf4gBnMmOXZCbX4tn,reminder share share share stop medical col...,2023-06-08T14:30:29.975Z,af0d9542-930b-4101-a786-202710a13f26,prod-evidence-v1,_doc,COVID-19,Corruption
4,2023-06-08,2023/06/08/evidence/085206d6-bbb6-460e-b96e-98...,[object Object],en,,ZA,rhysoneill@gmail.com,,WhatsApp,,...,en,e83rfYgBnMmOXZCbaYt2,,the south african government has established a...,2023-06-08T14:19:09.429Z,ea9ee102-2d4f-48f3-b1f4-13b2cb4714ba,prod-evidence-v1,_doc,,Vaccine Side Effects


In [89]:
pd['text'].isna().sum()

0

In [90]:
pd['text']

0     Actor Jamie Foxx reportedly ‘par@lyzed and bl+...
1     Use alot of fresh ginger,when it strikes your ...
2     VACCINE KILLS!!\nGOVERNMENT PUSHED A UNAPPROVE...
3         💥REMINDER💥 \n\n   ‼️SHARE - SHARE - SHARE‼...
4     The South African government has established a...
                            ...                        
57    I'll never ever forgive big pharma, the WHO, o...
58    Oh give it up. Children are not at risk for CO...
59    Before the introduction of its vaccine, measle...
60    Thekeksociety: NOBEL PRIZE WINNER FOUND DEAD A...
61    Twelve Infants Perish in Clinical Trial for RS...
Name: text, Length: 62, dtype: object

In [104]:
df.to_csv("amp.csv", index = False)

In [2]:
cl = pd.read_csv("https://raw.githubusercontent.com/ucinlp/covid19-data/master/covid_lies.csv")