In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option("display.max_columns" , 50)

### Read in data

In [2]:
df = pd.read_csv("amp_labels_viv.csv")

### read in covid_lies data

In [3]:
cl = pd.read_csv("covid_lies_sample.csv")

In [4]:
# Align the text column name with the other frame so the two can be stacked.
cl = cl.rename(columns={"misconception": "text"})

In [5]:
# combine with AMP data
# NOTE: this rebinds `df` to the concatenation of itself and `cl`, so running
# the cell a second time appends the `cl` rows again. Re-run the cell that
# loads `df` first if this one has to be repeated.
df = pd.concat([df, cl], ignore_index=True)

In [6]:
def combine_non_na(row):
    """Collect the non-missing entries of ``row`` into a list, in order.

    Iterates over ``row`` and keeps every value for which ``pd.notna`` is
    true; the result may be an empty list.
    """
    kept = []
    for value in row:
        if pd.notna(value):
            kept.append(value)
    return kept

In [7]:
# combine labels: one list per row holding whichever theme columns are filled in
label_columns = ["themeName", "manual_themeName", "manual_themeName2"]
df["labels"] = df[label_columns].apply(combine_non_na, axis=1)

## Find which labels are most common

In [8]:
df_counts = df["labels"].explode().value_counts()

In [9]:
df_counts.to_frame().head(8).T

Unnamed: 0,Vaccine Side Effects,Conspiracy,Vaccine Efficacy,Home Remedies,Case Reporting,Bio-weapon,Vaccine ingredients,Government
labels,19,14,7,6,6,5,3,3


## create an identifier to split if row has label or not

In [10]:
# Every row already holds a list here (combine_non_na always returns one,
# possibly empty), so this is only a safeguard: a missing entry would become ""
# and therefore have length zero in the check that follows.
df["labels"] = df["labels"].fillna("")

In [11]:
# Rows whose label list is non-empty become "labeled"; all others stay "unlabeled".
mask = df["labels"].map(len) > 0
df["split"] = "unlabeled"
df.loc[mask, "split"] = "labeled"

In [12]:
df["split"].value_counts()

unlabeled    65
labeled      60
Name: split, dtype: int64

### drop duplicates

In [13]:
# Remember the row count so the share of dropped duplicate texts can be reported.
len_before = df.shape[0]
df = df.drop_duplicates(subset="text")
removed_share = (len_before - df.shape[0]) / len_before
print(f"Removed {removed_share:.2%} duplicates.")

Removed 0.00% duplicates.


# Create training sets

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer

MultiLabelBinarizer takes a list of label names and creates a vector with zeros for absent labels and ones for present labels. We can test this by fitting MultiLabelBinarizer on all_labels to learn the mapping from label name to ID

In [15]:
all_labels = list(df["themeName"].unique())

In [16]:
# Extend the label vocabulary with the manually assigned themes. Both manual
# columns are scanned because the "labels" column is built from themeName,
# manual_themeName AND manual_themeName2; leaving the last one out would hide
# some labels from the binarizer fitted on all_labels.
for column in ("manual_themeName", "manual_themeName2"):
    for theme in df[column].unique():
        if theme not in all_labels:
            all_labels.append(theme)

In [17]:
all_labels = [x for x in all_labels if str(x) != 'nan']

In [18]:
# Fit on a single "sample" that carries every known label, so the binarizer
# learns the full label-name -> column mapping.
mlb = MultiLabelBinarizer()
mlb.fit([all_labels])
# Sanity check: each inner list is one sample's labels; the output has one row per sample.
mlb.transform([["Bio-weapon", "Vaccine Side Effects"], ["Home Remedies"]])

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

### Create splits iteratively 

In [19]:
# keep only the columns needed downstream, on a fresh 0..n-1 index
df_clean = df.loc[:, ["text", "labels", "split"]].reset_index(drop=True).copy()
split_col = df_clean["split"]
keep_cols = ["text", "labels"]

# unsupervised set: rows flagged "unlabeled"
df_unsup = df_clean.loc[split_col.eq("unlabeled"), keep_cols]

# supervised set: rows flagged "labeled"
df_sup = df_clean.loc[split_col.eq("labeled"), keep_cols]

### Create a dataset so all these splits are in one

In [20]:
from datasets import Dataset, DatasetDict

# Bundle both frames into one DatasetDict; the index is reset so it is not
# carried along as an extra column.
ds = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame.reset_index(drop=True))
        for split_name, frame in (("sup", df_sup), ("unsup", df_unsup))
    }
)


In [21]:
ds

DatasetDict({
    sup: Dataset({
        features: ['text', 'labels'],
        num_rows: 60
    })
    unsup: Dataset({
        features: ['text', 'labels'],
        num_rows: 65
    })
})

### Create training slices to investigate what's the right balance of supervised to unsupervised data needed

# Use embeddings as a lookup table

In [22]:
import torch
from transformers import AutoTokenizer, AutoModel

In [23]:
model_ckpt = "miguelvictor/python-gpt2-large"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Assign a padding token as soon as the tokenizer exists: every tokenizer call
# in this notebook uses padding=True, and when the cells run top to bottom the
# first such call happens before the later cell that sets pad_token.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModel.from_pretrained(model_ckpt)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings.
        attention_mask: Tensor marking real tokens with 1 and masked tokens
            with 0; it is broadcast over the embedding dimension.

    Returns:
        Tensor with one averaged embedding vector per sequence.
    """
    hidden_states = model_output[0]
    # Stretch the mask to the shape of the embeddings so it can be used as weights.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * weights).sum(dim=1)
    # Clamp the token count so a fully masked sequence does not divide by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

def embed_text(examples):
    """Embed the text(s) in ``examples["text"]`` as mean-pooled model outputs.

    Args:
        examples: Mapping with a "text" entry holding the text(s) to embed.

    Returns:
        dict with a single key "embedding" holding the pooled vectors as a
        NumPy array.
    """
    encoded = tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**encoded)
    embeddings = mean_pooling(outputs, encoded["attention_mask"])
    return {"embedding": embeddings.cpu().numpy()}

Some weights of the model checkpoint at miguelvictor/python-gpt2-large were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Figuring out the type for variables

In [26]:
inputs = tokenizer(ds["sup"]["text"], padding=True, truncation=True,
                       max_length=128, return_tensors="pt")

In [30]:
type(inputs["attention_mask"])

torch.Tensor

In [27]:
with torch.no_grad():
    model_output = model(**inputs)

In [28]:
type(model_output[0])

torch.Tensor

### Get embedding for each split

In [24]:
# The tokenizer calls in this notebook use padding=True, which needs a padding
# token; reuse the end-of-sequence token for that purpose.
tokenizer.pad_token = tokenizer.eos_token


In [134]:
embs_train = ds["sup"].map(embed_text, batched=True, batch_size=16)

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [65]:
embs_test = ds["unsup"].map(embed_text, batched=True, batch_size=16)

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

## Write to pickle to save

In [67]:
import pickle

In [136]:
# Serialize both embedded datasets. "wb" (rather than append) guarantees each
# file holds exactly one pickle, and the with-blocks flush and close the
# handles as soon as the data has been written.
with open("amp_embs_labels", "wb") as embs_train_file:
    pickle.dump(embs_train, embs_train_file)
with open("amp_embs_test", "wb") as embs_test_file:
    pickle.dump(embs_test, embs_test_file)

In [139]:
embs_train

Dataset({
    features: ['text', 'labels', 'embedding'],
    num_rows: 60
})

### For when we want to load data back in

In [3]:
embs_train_file = open("amp_embs_labels", "rb")
embs_test_file = open("amp_embs_test", "rb")

In [4]:
# NOTE(security): pickle.load can execute arbitrary code, so only load files
# that were produced by this notebook.
# Using the open handles as context managers closes both files after loading.
with embs_train_file, embs_test_file:
    embs_train = pickle.load(embs_train_file)
    embs_test = pickle.load(embs_test_file)

## Install Faiss

In [44]:
pip install faiss-gpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import faiss

## Remove the FAISS embedding so we can compare to NB

In [71]:
test_queries = np.array(embs_test["embedding"], dtype=np.float32)

In [135]:
embs_train.add_faiss_index("embedding")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'labels', 'embedding'],
    num_rows: 60
})

In [72]:
_, samples = embs_train.get_nearest_examples_batch("embedding", test_queries, k = 4)

In [76]:
len(samples)

65

In [79]:
len(y_pred)

65

In [83]:
samples[0]["text"]

['rt djlange lisa_iannattone thats a misleading headline statistically healthy children have more chance of dying from a lightning strike than covid you must know this shame on you',
 'hi alai im a practitioner at a certain govt hospital in kisumu please hide my identity over the past 3weeks i have diagonised dozen of patients with similar symptoms akin to that of covid 19 but has mutated to a new strain im sorry to say that covid 19 is back and the govt is silent',
 'so they want to bring down museveni for opposing homosexuality in uganda one day one time russia and china will bring down america history is looming',
 'the us now wants to finish him because he rejected the ushoga system god will protect museveni']

In [84]:
samples[0]["labels"]

[['Disease Risk'],
 ['Variants', 'Case Reporting'],
 ['Bio-weapon'],
 ['Bio-weapon']]

In [88]:
def get_sample_preds(sample, top_k=2):
    """Return the label lists of the first ``top_k`` retrieved examples.

    Args:
        sample: Mapping with a "labels" entry that holds one label list per
            retrieved example, in retrieval order.
        top_k: Number of leading examples whose labels are kept. Defaults to
            2, the value that used to be hard-coded.

    Returns:
        The first ``top_k`` entries of ``sample["labels"]``.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")
    return sample["labels"][:top_k]

In [89]:
# One prediction per query: the labels picked from that query's retrieved examples.
y_pred = list(map(get_sample_preds, samples))

In [90]:
y_pred[64]

[['Vaccine Efficacy'],
 ['Vaccine Side Effects', 'Conspiracy', 'Vaccine ingredient']]

In [97]:
predictions = pd.DataFrame({"text": embs_test["text"],
             "themeName": y_pred})

In [98]:
predictions

Unnamed: 0,text,themeName
0,i agree with draseemmalhotra all should be rem...,"[[Disease Risk], [Variants, Case Reporting]]"
1,lhistoire du covid était une gigantesque opéra...,"[[Conspiracy], [Vaccine Efficacy]]"
2,Fennel tea cures coronavirus.,"[[Vaccine Side Effects, Conspiracy, Vaccine in..."
3,Actor Keith Lawrence Middlebrook has a cure fo...,"[[Vaccine Side Effects, Conspiracy, Vaccine in..."
4,A decade-old map published by the World Popula...,"[[Conspiracy, Bio-weapon], [Case Reporting]]"
...,...,...
60,Using namaste as a greeting prevents transmiss...,"[[Conspiracy], [Disease Risk]]"
61,Cocaine cures coronavirus.,"[[Vaccine Efficacy], [Vaccine Side Effects, Co..."
62,Africans are more resistant to coronavirus.,"[[Conspiracy], [Case Reporting]]"
63,Methanol consumption cures or protects against...,"[[Case Reporting], [Vaccine Side Effects]]"


In [99]:
predictions.to_csv("few_shot_predictions.csv", index = False)

# Try getting prediction for single input

In [100]:
sample_text = "CBD oil is a cure for COVID-19."

In [107]:
def embed_single_text(text):
    """Embed a single piece of text with the same pipeline as ``embed_text``.

    Args:
        text: The text to embed.

    Returns:
        dict with a single key "embedding" holding the pooled vector(s) as a
        NumPy array, exactly as ``embed_text`` returns them.
    """
    # embed_text only reads examples["text"], so wrapping the input avoids
    # keeping a second copy of the tokenization and pooling code.
    return embed_text({"text": text})

In [108]:
embs_sample = embed_single_text(sample_text)

In [132]:
scores, sample = embs_train.get_nearest_examples_batch("embedding", embs_sample["embedding"], k = 4)

In [133]:
sample[0]["labels"]

[['Disease Risk'],
 ['Case Reporting'],
 ['Vaccine Side Effects'],
 ['Vaccine Efficacy']]