In [2]:
pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import faiss
import numpy as np
import pickle
import torch
from transformers import AutoTokenizer, AutoModel

# Use embeddings as a lookup table

In [2]:
# Checkpoint shared by the tokenizer and the model below.
model_ckpt = "miguelvictor/python-gpt2-large"

# Tokenizer setup: the end-of-sequence token doubles as the padding token
# (presumably because this tokenizer has no pad token of its own -- confirm).
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tokenizer.pad_token = tokenizer.eos_token

# AutoModel loads the checkpoint as a GPT2Model; the checkpoint's
# `lm_head.weight` is not used (see the load message printed below).
model = AutoModel.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/792 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


Downloading:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

Some weights of the model checkpoint at miguelvictor/python-gpt2-large were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked tokens.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (presumably shaped (batch, seq_len, hidden) --
            confirm against the model in use).
        attention_mask: Tensor marking real tokens with 1 and masked tokens
            with 0; it is broadcast over the last embedding dimension.

    Returns:
        One vector per sequence: the sum of the unmasked token embeddings
        divided by the number of unmasked tokens.
    """
    hidden_states = model_output[0]
    # Stretch the mask over the embedding dimension so it can be applied
    # element-wise to the token embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    # The lower bound keeps the division defined when every token is masked.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

## Load the training embeddings

In [3]:
import pickle

In [4]:
# Load the embeddings of the labeled data from disk. The `with` block closes
# the file handle once unpickling is done.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load "amp_embs_labels" when it comes from a trusted source.
with open("amp_embs_labels", "rb") as embs_train_file:
    embs_train = pickle.load(embs_train_file)

# Add a faiss index on the "embedding" column to enable nearest-neighbor lookups.
embs_train.add_faiss_index("embedding")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'labels', 'embedding'],
    num_rows: 60
})

In [7]:
# Confirm what kind of object was stored in the pickle file.
type(embs_train)

datasets.arrow_dataset.Dataset

# Try getting a prediction for a single input

In [5]:
# Example claim used to try out the nearest-neighbor label lookup.
sample_text = "CBD oil is a cure for COVID-19."

In [6]:
def embed_single_text(text):
    """Embed `text` with the notebook's tokenizer and model.

    The input is tokenized (truncated to at most 128 tokens), run through the
    model without gradient tracking, and mean-pooled over its unmasked tokens.

    Args:
        text: Input handed to the tokenizer.

    Returns:
        A dict with a single key, "embedding", holding the pooled output as a
        NumPy array.
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        outputs = model(**encoded)
    sentence_vector = mean_pooling(outputs, encoded["attention_mask"])
    # Move to CPU before converting to NumPy.
    return {"embedding": sentence_vector.cpu().numpy()}

In [7]:
def get_predicted_labels(text, k=4):
    """Return the labels of the training examples nearest to `text`.

    The text is embedded with `embed_single_text` and looked up in the faiss
    index on the "embedding" column of `embs_train`.

    Args:
        text: Input passed on to `embed_single_text`.
        k: Number of nearest training examples to retrieve. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        The "labels" entries of the retrieved examples for the query.
    """
    query = embed_single_text(text)["embedding"]
    # Only the matched examples are needed; the retrieval scores are discarded.
    _, neighbours = embs_train.get_nearest_examples_batch("embedding", query, k=k)
    # A single query is submitted, so the first result holds its neighbours.
    return neighbours[0]["labels"]

In [8]:
# Look up the labels of the training examples nearest to the sample text.
predicted_themes = get_predicted_labels(sample_text)

In [9]:
# Display the labels retrieved for the sample text.
predicted_themes

[['Disease Risk'],
 ['Case Reporting'],
 ['Vaccine Side Effects'],
 ['Vaccine Efficacy']]

In [10]:
# Try a second claim and display the labels retrieved for it.
get_predicted_labels("Vitamin C is all you need to cure Covid")

[['Vaccine Side Effects'], ['Conspiracy'], ['Disease Risk'], ['Home Remedy']]

#### This method skews towards labels that are already well represented. If the classes are unbalanced, predictions will likely cluster on the labels that have the most examples in our training dataset.

# Check label counts in training data

In [29]:
import pandas as pd

In [31]:
# Read the labeled data from CSV so the label distribution can be inspected.
df = pd.read_csv("amp_labels_viv.csv")

In [33]:
# Count the rows per value of the `themeName` column.
df["themeName"].value_counts()

Vaccine Side Effects               19
Conspiracy                          7
Vaccine Efficacy                    7
Home Remedies                       6
Bio-weapon                          4
Case Reporting                      4
Treatment                           2
Variants                            2
Media Bias                          2
Traditional/Religious Practices     1
Stigmatization                      1
Scientific disbelief                1
Government                          1
Corruption                          1
Disease Risk                        1
Home Remedy                         1
Name: themeName, dtype: int64

In [34]:
# Count the rows per value of the `manual_themeName` column.
df["manual_themeName"].value_counts()

Conspiracy         7
Government         2
Case Reporting     2
Bio-weapon         1
Pharma distrust    1
Racism             1
Name: manual_themeName, dtype: int64