In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option("display.max_columns" , 50)

### Read in data

In [0]:
# Load the AMP label file; later cells read its "text", "themeName",
# "manual_themeName" and "manual_themeName2" columns.
df = pd.read_csv("amp_labels_viv.csv")

### Read in covid_lies data

In [0]:
# Load the covid_lies sample; its "misconception" column is renamed to "text"
# in the next cell so it lines up with the AMP frame.
cl = pd.read_csv("covid_lies_sample.csv")

In [0]:
# Align the covid_lies column name with the "text" column used everywhere else.
cl = cl.rename(columns={"misconception": "text"})

In [0]:
# Append the covid_lies rows to the AMP rows; ignore_index=True renumbers the
# combined frame from 0.
# NOTE(review): this rebinds `df`, so re-running the cell appends `cl` a second time.
df = pd.concat([df, cl], ignore_index=True)

In [0]:
def combine_non_na(row):
    """Collect the non-missing entries of ``row`` into a list, keeping their order.

    Args:
        row: Any iterable of scalar values (e.g. one DataFrame row).

    Returns:
        list: The values for which ``pd.notna`` is true.
    """
    kept = []
    for value in row:
        if pd.notna(value):
            kept.append(value)
    return kept

In [0]:
# Merge the three theme columns into one list of labels per row,
# skipping missing values.
label_columns = ["themeName", "manual_themeName", "manual_themeName2"]
df["labels"] = df[label_columns].apply(combine_non_na, axis=1)

## Find which labels are most common

In [0]:
# explode() gives every label in a row's list its own row, so value_counts()
# tallies individual labels rather than whole label lists.
df_counts = df["labels"].explode().value_counts()

In [0]:
df_counts.to_frame().head(8).T

## Create an identifier marking whether each row has a label or not

In [0]:
# NOTE(review): "labels" was built with combine_non_na, which always returns a
# list, so there are presumably no missing values left for fillna to replace —
# confirm whether this step is still needed.
df["labels"] = df["labels"].fillna("")

In [0]:
# A row counts as "labeled" when its label list is non-empty.
mask = df["labels"].map(len).gt(0)
df["split"] = "unlabeled"
df.loc[mask, "split"] = "labeled"

In [0]:
df["split"].value_counts()

### drop duplicates

In [0]:
# Drop rows whose "text" repeats an earlier row and report the share removed.
len_before = len(df)
df = df.drop_duplicates(subset=["text"])
removed_share = (len_before - len(df)) / len_before
print(f"Removed {removed_share:.2%} duplicates.")

# Create training sets

In [0]:
from sklearn.preprocessing import MultiLabelBinarizer

`MultiLabelBinarizer` takes a list of label names and creates a vector with zeros for absent labels and ones for present labels. We can test this by fitting `MultiLabelBinarizer` on `all_labels` to learn the mapping from label name to ID.

In [0]:
all_labels = list(df["themeName"].unique())

In [0]:
# The "labels" column is assembled from themeName, manual_themeName and
# manual_themeName2, so the label vocabulary has to cover both manual columns;
# otherwise labels that only occur in manual_themeName2 are unknown to the
# binarizer fitted on all_labels.
for column in ("manual_themeName", "manual_themeName2"):
    for theme in df[column].unique():
        if theme not in all_labels:
            all_labels.append(theme)

In [0]:
# Drop missing values from the vocabulary. pd.notna matches the check used in
# combine_non_na: it also removes None and does not discard a label whose text
# happens to be the string "nan".
all_labels = [x for x in all_labels if pd.notna(x)]

In [0]:
# Fit on a single "sample" that holds every known label so the binarizer
# learns the whole label vocabulary at once.
mlb = MultiLabelBinarizer()
mlb.fit([all_labels])
# Sanity check on two hand-written label sets.
# NOTE(review): assumes these three label names occur in all_labels — confirm.
mlb.transform([["Bio-weapon", "Vaccine Side Effects"], ["Home Remedies"]])

### Create splits iteratively

In [0]:
# Keep only the modelling columns, on a fresh 0..n-1 index.
df_clean = df.loc[:, ["text", "labels", "split"]].reset_index(drop=True).copy()

# Rows flagged "labeled" form the supervised set ...
df_sup = df_clean.loc[df_clean["split"].eq("labeled"), ["text", "labels"]]

# ... and rows flagged "unlabeled" form the unsupervised set.
df_unsup = df_clean.loc[df_clean["split"].eq("unlabeled"), ["text", "labels"]]

### Create a dataset so all these splits are in one

In [0]:
from datasets import Dataset, DatasetDict

# Bundle both frames into one DatasetDict; each frame gets a fresh index
# before being converted.
ds = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in (("sup", df_sup), ("unsup", df_unsup))
})


In [0]:
ds

### Create training slices to investigate the right balance of supervised to unsupervised data

# Use embeddings as a lookup table

In [0]:
import torch
from transformers import AutoTokenizer, AutoModel

In [0]:
# Checkpoint used for both tokenization and embedding extraction.
model_ckpt = "miguelvictor/python-gpt2-large"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Every tokenizer call in this notebook uses padding=True, so the tokenizer
# needs a pad token before its first use. Reuse the end-of-sequence token here,
# right after loading, so the notebook runs top to bottom on a fresh kernel.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModel.from_pretrained(model_ckpt)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (presumably shaped (batch, tokens, hidden) —
            confirm against the model).
        attention_mask: Tensor that is 1 for real tokens and 0 for padding; it
            is broadcast over the last dimension of the embeddings.

    Returns:
        torch.Tensor: One pooled vector per sequence.
    """
    hidden_states = model_output[0]
    # Stretch the mask over the hidden dimension so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * weights).sum(dim=1)
    # Clamp the token count so a fully masked row does not divide by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

def embed_text(examples):
    """Tokenize ``examples["text"]`` and return mean-pooled model embeddings.

    Args:
        examples: Mapping with a "text" entry that is handed to the tokenizer
            (a batch of texts when used with ``Dataset.map(batched=True)``).

    Returns:
        dict: ``{"embedding": ndarray}`` with the pooled vectors as a NumPy array.
    """
    encoded = tokenizer(
        examples["text"],
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        outputs = model(**encoded)
    sentence_vectors = mean_pooling(outputs, encoded["attention_mask"])
    return {"embedding": sentence_vectors.cpu().numpy()}

## Figuring out the type for variables

In [0]:
# The following cells only inspect the Python types of the tokenizer and model
# outputs, so a handful of rows is enough. Encoding the whole supervised split
# here would push it through the model as one huge batch in the forward-pass
# cell below.
inputs = tokenizer(ds["sup"][:8]["text"], padding=True, truncation=True,
                   max_length=128, return_tensors="pt")

In [0]:
type(inputs["attention_mask"])

In [0]:
with torch.no_grad():
    model_output = model(**inputs)

In [0]:
type(model_output[0])

### Get embedding for each split

In [0]:
# Reuse the end-of-sequence token for padding so tokenizer calls made with
# padding=True have a pad token available.
# NOTE(review): presumably needed because this GPT-2 checkpoint ships without a
# pad token — confirm; it has to run before any padding=True tokenizer call.
tokenizer.pad_token = tokenizer.eos_token


In [0]:
# Embed the labeled ("sup") split in batches of 16; map() adds the
# "embedding" entry returned by embed_text to each row.
embs_train = ds["sup"].map(embed_text, batched=True, batch_size=16)

In [0]:
# Embed the unlabeled ("unsup") split the same way; these rows are the ones
# that receive predictions later on.
embs_test = ds["unsup"].map(embed_text, batched=True, batch_size=16)

## Write to pickle to save

In [0]:
import pickle

In [0]:
# Persist both embedded datasets. The files are opened with "wb" rather than
# "ab": appending would stack a new pickle behind any older one, and
# pickle.load only reads the first object in a file. The with-blocks close each
# file so the data is flushed before it is read back.
with open("amp_embs_labels", "wb") as embs_train_file:
    pickle.dump(embs_train, embs_train_file)
with open("amp_embs_test", "wb") as embs_test_file:
    pickle.dump(embs_test, embs_test_file)

In [0]:
embs_train

### For when we want to load data back in

In [0]:
# Reopen both pickle files in binary read mode; the handles are consumed by the
# pickle.load cell below.
embs_train_file = open("amp_embs_labels", "rb")
embs_test_file = open("amp_embs_test", "rb")

In [0]:
# Read the datasets back and close both handles afterwards, even if unpickling
# fails. Re-run the cell that opens the files before re-running this one.
# NOTE: pickle.load can execute arbitrary code, so only load files written by
# this notebook.
with embs_train_file, embs_test_file:
    embs_train = pickle.load(embs_train_file)
    embs_test = pickle.load(embs_test_file)

## Install Faiss

In [0]:
pip install faiss-gpu

In [0]:
import faiss

## Remove the faiss embedding so we can compare to NB

In [0]:
# Collect the embeddings of the unlabeled rows into one float32 array; it is
# used as the batch of query vectors for the nearest-neighbour search.
test_queries = np.array(embs_test["embedding"], dtype=np.float32)

In [0]:
# Attach a FAISS index over the "embedding" column of the labeled set so it can
# be searched by vector.
embs_train.add_faiss_index("embedding")

In [0]:
# For every query vector fetch the k = 4 nearest labeled examples; the first
# return value (the scores) is not needed here.
_, samples = embs_train.get_nearest_examples_batch("embedding", test_queries, k = 4)

In [0]:
len(samples)

In [0]:
# `y_pred` is only created in a later cell, so referencing it here raises a
# NameError on a fresh kernel. Compare against the number of query vectors
# instead: one neighbour set should come back per query.
len(test_queries)

In [0]:
samples[0]["text"]

In [0]:
samples[0]["labels"]

In [0]:
def get_sample_preds(sample, n_neighbors=2):
    """Return the label entries of the first ``n_neighbors`` retrieved examples.

    Args:
        sample: Mapping with a "labels" sequence holding one entry per
            retrieved example, in the order the search returned them.
        n_neighbors: How many leading entries to keep. Defaults to 2, the value
            that used to be hard-coded.

    Returns:
        The first ``n_neighbors`` items of ``sample["labels"]`` (fewer if the
        sequence is shorter).
    """
    return sample["labels"][:n_neighbors]

In [0]:
# One prediction per query: the label entries of its leading retrieved examples.
y_pred = [get_sample_preds(s) for s in samples]

In [0]:
y_pred[64]

In [0]:
# Pair each unlabeled text with its prediction. Each "themeName" cell holds
# whatever get_sample_preds returned for that row (a slice of the neighbours'
# label entries), not a single label string.
predictions = pd.DataFrame({"text": embs_test["text"],
             "themeName": y_pred})

In [0]:
predictions

In [0]:
predictions.to_csv("few_shot_predictions.csv", index = False)

# Try getting a prediction for a single input

In [0]:
sample_text = "CBD oil is a cure for COVID-19."

In [0]:
def embed_single_text(text):
    """Embed one piece of text with the same pipeline as ``embed_text``.

    The body used to be a line-for-line copy of ``embed_text``; delegating keeps
    the tokenizer settings and pooling in a single place.

    Args:
        text: The text to embed; it is passed to the tokenizer unchanged.

    Returns:
        dict: ``{"embedding": ndarray}`` with the pooled embedding as a NumPy array.
    """
    return embed_text({"text": text})

In [0]:
embs_sample = embed_single_text(sample_text)

In [0]:
# Look up the k = 4 nearest labeled examples for the single embedded text; the
# batch API is used, so the results are indexed by query position (0 here).
scores, sample = embs_train.get_nearest_examples_batch("embedding", embs_sample["embedding"], k = 4)

In [0]:
sample[0]["labels"]