In [66]:
import pandas as pd
import json
import requests

In [67]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from sklearn.feature_extraction import text 

## Read in data

In [68]:
# read in data 
amp = pd.read_csv("amp_viv.csv")

# fill NAs with manual labels
amp['issueName'] = amp['issueName'].fillna(amp['manuarl_issueName'])

# check NA values
amp['issueName'].isna().sum()

7

In [69]:
amp.columns

Index(['date', 'attachments', 'textTranslated', 'detectedLanguage',
       'likeCount', 'countries', 'authorId', 'url', 'platform', 'commentCount',
       'socialMediaPostType', 'shareCount', 'createdAt', 'submittedLanguage',
       'themeIds', 'issueIds', 'text', 'updatedAt', 'id', 'index', 'type',
       'issueName', 'themeName', 'manuarl_issueName', 'manual_themeName',
       'Unnamed: 25', 'Unnamed: 26'],
      dtype='object')

In [70]:
# read in covidlies data
cl = pd.read_csv("https://raw.githubusercontent.com/ucinlp/covid19-data/master/covid_lies.csv")

In [9]:
cl["misconception_id"].value_counts()

3     286
25    192
10    188
15    145
2     100
     ... 
13     99
6      99
4      99
39     98
38     90
Name: misconception_id, Length: 62, dtype: int64

In [6]:
cl.rename(columns={"misconception": "text" }, inplace = True)
cl["issueName"] = ""
cl["themeName"] = ""

### read in misconception labels

In [72]:
# function to read in json file from url
def read_jsonl_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for successful response (status code 200)

        json_list = []
        for line in response.iter_lines():
            # Decode each line from bytes to string
            line = line.decode("utf-8")
            if line.strip():  # Check if the line is not empty
                # Parse each line as a JSON object and add it to the list
                json_list.append(json.loads(line))
        
        return json_list
    except requests.exceptions.RequestException as e:
        print("Error occurred while fetching data:", e)
        return None
    except json.JSONDecodeError as e:
        print("Error occurred while parsing JSON data:", e)
        return None

In [73]:
# read in data file
url = "https://raw.githubusercontent.com/ucinlp/covid19-data/master/misconceptions.jsonl"
mis_data = read_jsonl_from_url(url)

In [74]:
# convert to dataframe
mis_df = pd.DataFrame(mis_data)

# unlist cateogry column
mis_df["category"] = mis_df["category"].apply(lambda x: "".join(x))

In [88]:
mis_df["category"].value_counts()

Medical Misinformation    41
Miscellaneous              9
Government                 6
Conspiracy Theories        2
Combative Efforts          2
Statistics                 1
Accidental leakage         1
Name: category, dtype: int64

### Combine misinformation labels with covid lies data

In [38]:
mis_cl = cl.merge(mis_df, how = "left", left_on = "misconception_id", right_on = "id")

In [41]:
mis_cl["category"].value_counts()

Medical Misinformation    4309
Miscellaneous              899
Government                 599
Conspiracy Theories        385
Combative Efforts          200
Accidental leakage         100
Statistics                  99
Name: category, dtype: int64

## Combine datasets

In [44]:
df = pd.concat([amp[["text", "issueName", "themeName"]], cl[["text", "issueName", "themeName"]]])

In [45]:
df.shape

(6653, 3)

## Build model to predict issue

In [57]:
#amp.dropna(subset = ["themeName"], inplace = True)

In [47]:
# Set X and Y variables
X = mis_cl["text"]
y = mis_cl["category"]

In [48]:
# Split into train and test set
X_train, X_test, y_train, y_test= train_test_split(X, y,
                                                    stratify = y,
                                                   random_state = 42)

In [52]:
# for future if we want to add more stop words
# ('tvec', TfidfVectorizer(ngram_range = (1,2), stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_words)))


# set up pipeline
pipe = Pipeline([
            ('tvec', TfidfVectorizer(ngram_range = (1,5), stop_words = list(text.ENGLISH_STOP_WORDS))),
            ("bag", BaggingClassifier(random_state = 42))

])

# param options
params = {
    "bag__n_estimators": [100], # default is 10
   # "bag__max_features": [ 1, 5, 10 , 30] # default is 1
}

# run gridsearch
gs = GridSearchCV(pipe, params, cv=5, n_jobs= 3)

In [53]:
# run model
gs.fit(X_train, y_train)

In [54]:
model = gs.best_estimator_


In [55]:
model.score(X_test, y_test)


1.0

### Predict on amp data

In [56]:
X_new = amp["text"]

In [57]:
amp["predictedCategory"] = model.predict(X_new)

In [59]:
amp["predictedCategory"].value_counts()

Medical Misinformation    57
Government                 3
Miscellaneous              1
Accidental leakage         1
Name: predictedCategory, dtype: int64

In [58]:
amp.head()

Unnamed: 0,date,attachments,textTranslated,detectedLanguage,likeCount,countries,authorId,url,platform,commentCount,...,id,index,type,issueName,themeName,manuarl_issueName,manual_themeName,Unnamed: 25,Unnamed: 26,predictedCategory
0,2023-06-08,,[object Object],en,15.0,KE,rhysoneill@gmail.com,https://www.facebook.com/CitizenTVKe/posts/pfb...,Facebook,4.0,...,d5172bb6-80a2-4f6d-b4f2-0c1be243aab1,prod-evidence-v1,_doc,COVID-19,Bio-weapon,,,,,Medical Misinformation
1,2023-06-08,,[object Object],en,19.0,KE,rhysoneill@gmail.com,https://www.facebook.com/CitizenTVKe/posts/pfb...,Facebook,5.0,...,c59961c7-bc4d-4549-8ff9-c29700709b85,prod-evidence-v1,_doc,COVID-19,Bio-weapon,,,,,Medical Misinformation
2,2023-06-08,,[object Object],en,3.0,KE,rhysoneill@gmail.com,https://www.facebook.com/CitizenTVKe/posts/pfb...,Facebook,,...,7b2a7e36-5ab2-4369-906e-f5d7b0269362,prod-evidence-v1,_doc,COVID-19,Bio-weapon,,,,,Medical Misinformation
3,2023-05-14,2023/06/08/evidence/766cd12b-1428-454f-ac14-00...,[object Object],en,43.0,TZ,rhysoneill@gmail.com,https://www.youtube.com/watch?v=Pz0hvkLYP1c,YouTube,14.0,...,766cd12b-1428-454f-ac14-008dfae4442d,prod-evidence-v1,_doc,COVID-19,Case Reporting,,,,,Medical Misinformation
4,2023-06-26,,[object Object],sw,,TZ,rhysoneill@gmail.com,http://twitter.com/TotolaMamNtilie/statuses/16...,Twitter,,...,a5dd6cc0-b55a-4598-8eec-e49fd5a16245,prod-evidence-v1,_doc,,Case Reporting,,,,,Medical Misinformation


### Randomly sample 100 Covid lies data to label

In [126]:
cl.shape

(6591, 4)

In [127]:
random = cl.sample(n=6000, random_state = 42)

In [128]:
random.shape

(6000, 4)

In [132]:
cl_list = list(cl["misconception"].unique())

In [133]:
len(cl_list)

63

In [129]:
random_df = random.drop_duplicates(subset = ["misconception"])

In [130]:
random_df.shape

(63, 4)

In [134]:
random_df.to_csv("covid_lies_sample.csv", index = False)