In [0]:
import pandas as pd
import json
import requests

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from sklearn.feature_extraction import text 

## Read in data

In [0]:
# Load the AMP dataset from the local CSV export
amp = pd.read_csv("amp_viv.csv")

# Where issueName is missing, fall back to the manually assigned label.
# NOTE(review): "manuarl_issueName" looks like a misspelling of "manual_issueName";
# confirm against the CSV header before changing it.
amp["issueName"] = amp["issueName"].fillna(value=amp["manuarl_issueName"])

# Number of rows that still have no issueName after the fallback
amp["issueName"].isna().sum()

In [0]:
# Show the column labels available in the AMP frame
amp.columns

In [0]:
# Read the CovidLies dataset straight from the ucinlp/covid19-data GitHub repository
# (requires network access when the cell runs)
cl = pd.read_csv("https://raw.githubusercontent.com/ucinlp/covid19-data/master/covid_lies.csv")

In [0]:
# Number of rows per misconception id
cl["misconception_id"].value_counts()

In [0]:
# Align the CovidLies frame with the AMP schema: the misconception statement becomes
# "text", and the two label columns are added as empty strings.
# After this cell `cl` no longer has a "misconception" column.
cl = (
    cl
    .rename(columns={"misconception": "text"})
    .assign(issueName="", themeName="")
)

### Read in misconception labels

In [0]:
# function to read in a JSON Lines file from a url
def read_jsonl_from_url(url, timeout=30):
    """Download a JSON Lines document and parse it into a list of objects.

    Parameters
    ----------
    url : str
        Address of the JSONL resource; passed to ``requests.get``.
    timeout : float or tuple, optional
        Seconds to wait for the server (default 30). Without a timeout,
        ``requests.get`` can block indefinitely on an unresponsive host.

    Returns
    -------
    list or None
        One parsed JSON value per non-blank line, in file order.
        ``None`` if the request fails (connection error, timeout, or an
        error status) or if a line is not valid UTF-8 / JSON; in those
        cases the error is printed instead of raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into an exception

        json_list = []
        for raw_line in response.iter_lines():
            # iter_lines yields bytes; decode before testing for blank lines
            line = raw_line.decode("utf-8")
            if line.strip():
                # Each non-blank line is an independent JSON document
                json_list.append(json.loads(line))

        return json_list
    except requests.exceptions.RequestException as e:
        print("Error occurred while fetching data:", e)
        return None
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A line that cannot be decoded is reported the same way as one that cannot be parsed
        print("Error occurred while parsing JSON data:", e)
        return None

In [0]:
# Download and parse the misconception definitions (JSON Lines).
# read_jsonl_from_url returns None when the fetch or the parse fails.
url = "https://raw.githubusercontent.com/ucinlp/covid19-data/master/misconceptions.jsonl"
mis_data = read_jsonl_from_url(url)

In [0]:
# Build a DataFrame with one row per parsed JSONL record
mis_df = pd.DataFrame(data=mis_data)

# Unlist the category column by concatenating each entry's pieces into one string.
# Presumably every entry is a list of strings; TODO confirm. A list with several
# items would be joined with no separator between them.
mis_df["category"] = mis_df["category"].map(lambda parts: "".join(parts))

In [0]:
# Number of misconception definitions per category
mis_df["category"].value_counts()

### Combine misinformation labels with covid lies data

In [0]:
# Attach the misconception metadata to every CovidLies row.
# Left join: rows of `cl` without a matching id are kept, with missing metadata.
mis_cl = pd.merge(
    cl,
    mis_df,
    how="left",
    left_on="misconception_id",
    right_on="id",
)

In [0]:
# Number of CovidLies rows per category after the merge (NaN is excluded by default)
mis_cl["category"].value_counts()

## Combine datasets

In [0]:
# Stack the AMP rows on top of the CovidLies rows, keeping only the shared columns.
# The original row indices of both frames are retained.
df = pd.concat(
    [
        amp.loc[:, ["text", "issueName", "themeName"]],
        cl.loc[:, ["text", "issueName", "themeName"]],
    ]
)

In [0]:
# (rows, columns) of the combined frame
df.shape

## Build model to predict misconception category

In [0]:
#amp.dropna(subset = ["themeName"], inplace = True)

In [0]:
# Set X and Y variables: the text is the single feature, the category is the target.
# NOTE(review): mis_cl comes from a left merge, so rows without a matching
# misconception id would carry a NaN category; confirm there are none before splitting.
# NOTE(review): if the same text is repeated across many rows, identical samples can
# land in both the train and the test split and inflate the test score; confirm.
X = mis_cl["text"]
y = mis_cl["category"]

In [0]:
# Hold out a test set; stratifying on y keeps the class proportions in both parts,
# and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [0]:
# To extend the stop-word list later, swap the vectorizer step for:
# ('tvec', TfidfVectorizer(ngram_range = (1,2), stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_words)))


# Two-step pipeline: TF-IDF over 1- to 5-grams with sklearn's English stop words
# removed, followed by a bagging ensemble.
pipe = Pipeline(
    steps=[
        (
            "tvec",
            TfidfVectorizer(
                ngram_range=(1, 5),
                stop_words=list(text.ENGLISH_STOP_WORDS),
            ),
        ),
        ("bag", BaggingClassifier(random_state=42)),
    ]
)

# Hyper-parameter grid (a single candidate at the moment)
params = {
    "bag__n_estimators": [100],  # default is 10
    # "bag__max_features": [ 1, 5, 10 , 30] # default is 1
}

# Grid search over `params` with cv=5, using 3 parallel jobs
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=5, n_jobs=3)

In [0]:
# Run the grid search: fits the pipeline on the training split for each candidate in `params`
gs.fit(X_train, y_train)

In [0]:
# Keep the pipeline selected by the grid search for scoring and prediction
model = gs.best_estimator_


In [0]:
# Score the selected pipeline on the held-out test split
model.score(X_test, y_test)


### Predict on amp data

In [0]:
# Text of the AMP rows, to be classified by the model trained above
X_new = amp["text"]

In [0]:
# Store the predicted category for every AMP row in a new column on `amp`
amp["predictedCategory"] = model.predict(X_new)

In [0]:
# Distribution of predicted categories across the AMP rows
amp["predictedCategory"].value_counts()

In [0]:
# Preview the first rows, including the new predictedCategory column
amp.head()

### Randomly sample Covid lies data to label (6,000 rows, then one row per misconception)

In [0]:
# (rows, columns) of the CovidLies frame before sampling
cl.shape

In [0]:
# Draw 6000 rows from `cl`, seeded so the sample is repeatable.
# NOTE(review): the name `random` would shadow the standard-library module if it is ever imported.
random = cl.sample(n=6000, random_state = 42)

In [0]:
# (rows, columns) of the sampled frame
random.shape

In [0]:
# Distinct misconception statements in the full CovidLies frame.
# The "misconception" column was renamed to "text" earlier in the notebook, so it
# must be read under its new name; the old name raises KeyError on a top-to-bottom run.
cl_list = list(cl["text"].unique())

In [0]:
# How many distinct values ended up in cl_list
len(cl_list)

In [0]:
# Keep the first sampled row for each distinct misconception statement.
# The statement is stored in "text" (the "misconception" column was renamed earlier
# in the notebook), so de-duplicating on the old name raises KeyError.
random_df = random.drop_duplicates(subset = ["text"])

In [0]:
# (rows, columns) of the de-duplicated sample
random_df.shape

In [0]:
# Write the de-duplicated sample to the working directory, without the index column
random_df.to_csv("covid_lies_sample.csv", index = False)