# Zero-shot classification

## [DeBerta-v3-base-mnli-fever-anli](https://huggingface.co/MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli?candidateLabels=covid-19%2C+vaccine+efficacy%2C+vaccine+side+effects%2C+measles%2C+cholera%2C+home+remedies%2C+conspiracy&multiClass=true&text=actor+jamie+foxx+reportedly+par+lyzed+and+bl+nd+due+to+covid+19+vaccine+american+actor+jamie+foxx+has+reportedly+s+ffered+partial+par+lysis+following+a+covid+19+vaccine+complication+the+vaccine+according+to+hollywood+journalist+a+j+benza+resulted+in+a+bl+d+clot+in+his+brain+that+led+to+him+being+partially+par+lyzed+and+bl+nd+in+addition+to+a+series+of+other+complications+benza+claims+the+news+was+confirmed+by+a+source+close+to+foxx+jamie+had+a+bl+d+clot+in+his+brain+after+he+got+the+sh+t+he+did+not+want+the+sh+t+but+the+movie+he+was+on+he+was+pressured+to+get+it+confessed+the+journalist+who+formally+worked+as+a+columnist+for+the+new+york+daily+news+during+an+appearance+on+dr+drew+pinsky+s+online+show+ask+dr+drew+the+bl+d+clot+in+the+brain+caused+him+at+that+point+to+be+partially+par+lyzed+and+bl+nd+benza+alleged+as+he+insisted+his+insider+was+someone+in+the+room+with+first+hand+knowledge+of+foxx+s+hospitalization+foxx+was+admitted+to+the+hospital+after+s+ffering+a+medical+complication+on+thursday+april+11+according+to+his+daughter+corinne+foxx+luckily+due+to+quick+action+and+great+care+he+is+already+on+his+way+to+recovery+we+know+how+beloved+he+is+and+appreciate+your+prayers+the+family+asks+for+privacy+during+this+time+she+said+the+55+year+old+s+medical+woes+began+while+filming+his+latest+movie+back+in+action+the+movie+also+starring+cameron+diaz+is+said+to+have+moved+forward+with+production+without+foxx+using+a+body+double+in+his+place)

In [0]:
import mlflow
from mlflow.entities.run import Run
from mlflow.tracking.client import MlflowClient, ModelVersion
from mlflow.utils import mlflow_tags
import pandas as pd
import re
import json
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score, jaccard_score
from collections import defaultdict
# Load the project configuration and make the project's helper module importable.
with open('project_config.json','r') as fp: 
    project_config = json.load(fp)
 
# The config entry names the directory that holds the project module; putting it
# on sys.path lets the star-import below resolve.
module_path = os.path.join(project_config['project_module_relative_path'])
sys.path.append(module_path)
 
# NOTE(review): the star-import hides which helpers come from data_processing -
# confirm which names this notebook actually relies on.
from data_processing import *

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns" , 50)

In [0]:
from transformers import pipeline

# Module-level zero-shot classification pipeline; classify_themes calls it once
# per text with the candidate labels of the experiment being run.
classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
)

### Read in data

In [0]:
# Read in the data export (vaccine variant) and keep only the labeled split.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading - only
# point it at trusted files.
df = pd.read_pickle("./model_training_data_vax.pkl")
# .copy() makes the filtered rows an independent frame, so the column assignment
# below does not operate on a slice of the full export (SettingWithCopyWarning).
df = df[df["split"] == "labeled"].copy()
# Fall back to the English translation where the original text is missing.
df["text"] = df["text"].fillna(df["textTranslated.en"])
# Rows with no text at all cannot be classified.
df = df.dropna(subset=["text"])

In [0]:
def map_themes(themes, theme_dict):
    """Translate child theme labels into their parent themes.

    Args:
        themes: list of child labels; ``None`` entries are skipped. Any
            non-list input yields ``None``.
        theme_dict: mapping of parent theme -> container of child labels. The
            membership test is ``label in container``, so a list value needs
            an exact element match while a string value is a substring test.

    Returns:
        A list with one parent per matched label (the first parent in
        ``theme_dict`` order whose container holds the label), in input
        order, or ``None`` when nothing matched.
    """
    if not isinstance(themes, list):
        return None

    no_match = object()  # sentinel so a matched key is never confused with "not found"
    parents = []
    for label in themes:
        if label is None:
            continue
        parent = next(
            (key for key, members in theme_dict.items() if label in members),
            no_match,
        )
        if parent is not no_match:
            parents.append(parent)
    return parents or None

## Write function to classify text and return a df

In [0]:
def classify_themes(df, theme_labels):
    """Run the zero-shot classifier over every row of ``df``.

    Each row's ``text`` is scored against ``theme_labels`` with
    ``multi_label=True`` using the module-level ``classifier``.

    Args:
        df: frame with ``id``, ``text`` and ``themeIdsReviewedParent`` columns.
        theme_labels: candidate labels passed to the classifier.

    Returns:
        DataFrame with columns ``id``, ``text``, ``themeIds`` (the
        classifier's ``labels``), ``themeConfidence`` (its ``scores``) and
        ``themeIdsReviewedParent`` (left-joined from ``df`` on ``id``).
    """
    records = []
    for _, record in df.iterrows():
        prediction = classifier(record['text'], theme_labels, multi_label=True)
        # Re-expose the classifier output under the column names used downstream.
        prediction.update(
            themeIds=prediction['labels'],
            themeConfidence=prediction['scores'],
            id=record["id"],
            text=record["text"],
        )
        records.append(prediction)

    predictions = pd.DataFrame(records)[['id', 'text', 'themeIds', 'themeConfidence']]
    reviewed = df[["id", "themeIdsReviewedParent"]]
    return pd.merge(predictions, reviewed, how="left", on="id")

In [0]:
# Distinct parent themes present in the data, with None entries removed.
theme_labels = [
    label
    for label in df['themeIdsParent'].explode().unique().tolist()
    if label is not None
]

In [0]:
def mlb_transform(exp_type, df, theme_dict):
    """Score predicted parent themes against the reviewed parent themes.

    Note that ``df`` is modified in place: for non-"simplified" experiment
    types ``themeIds`` is overwritten with parent themes (via ``map_themes``),
    and rows missing either the prediction column or
    ``themeIdsReviewedParent`` are dropped. Because ``map_themes`` returns
    ``None`` when nothing maps, rows without any mapped prediction are
    dropped rather than scored as "no theme".

    Args:
        exp_type: experiment name. A name containing "vax" adds the
            ``vaccines`` class; "simplified" / "simplified_vax" score the
            existing ``themeIdsParent`` column instead of ``themeIds``.
        df: frame holding the prediction column and ``themeIdsReviewedParent``.
        theme_dict: parent -> child-label mapping handed to ``map_themes``.

    Returns:
        Tuple ``(macro_f1, micro_f1, weighted_precision, weighted_recall,
        class_names, per_class_jaccard)``.
    """
    themes = [
        'prevention-treatment-approved',
        'prevention-treatment-alternative',
        'illness-cause',
        'intervention-capacity',
        'conspiracy-corruption',
    ]
    if "vax" in exp_type:
        themes.append('vaccines')

    mlb = MultiLabelBinarizer()
    mlb.fit([themes])

    if exp_type in ("simplified", "simplified_vax"):
        pred_col = "themeIdsParent"
    else:
        # Roll the predicted child labels up to their parent themes.
        df["themeIds"] = df["themeIds"].apply(
            lambda child_ids: map_themes(child_ids, theme_dict)
        )
        pred_col = "themeIds"

    df.dropna(subset=[pred_col, "themeIdsReviewedParent"], inplace=True)

    y_pred = mlb.transform(df[pred_col])
    y_true = mlb.transform(df["themeIdsReviewedParent"])

    return (
        f1_score(y_true, y_pred, average='macro'),
        f1_score(y_true, y_pred, average='micro'),
        precision_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
        mlb.classes_,
        jaccard_score(y_true, y_pred, average=None),
    )

In [0]:
# get theme threshold
def filter_threshold(df, threshold):
    """Blank out predictions whose confidence is not above ``threshold``.

    For every row, ``themeConfidence`` and ``themeIds`` are walked in
    lockstep (stopping at the shorter of the two); entries whose confidence
    is not strictly greater than ``threshold`` are replaced by ``None`` in
    both lists. The frame is updated in place.

    Args:
        df: frame with list-valued ``themeConfidence`` and ``themeIds`` columns.
        threshold: minimum (exclusive) confidence to keep a prediction.

    Returns:
        The same ``df`` object, after modification.
    """
    for row_label, record in df.iterrows():
        # zip() truncates to the shorter list, matching the min-length rule.
        scored = list(zip(record["themeConfidence"], record["themeIds"]))
        df.at[row_label, "themeConfidence"] = [
            score if score > threshold else None for score, _ in scored
        ]
        df.at[row_label, "themeIds"] = [
            label if score > threshold else None for score, label in scored
        ]

    return df

In [0]:
def filter_rows_with_empty_column(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` value has length zero.

    Args:
        df: frame to filter.
        column_name: column holding sized values (e.g. lists).

    Returns:
        A DataFrame containing only the rows where ``len(value) == 0``.
    """
    is_empty = df[column_name].map(len).eq(0)
    return df[is_empty]

# Write Experiment

In [0]:
def run_taxonomy_experiment(exp_type, theme_labels, theme_dict, threshold=0.79):
    """Score one taxonomy variant and log the result as an MLflow run.

    Reads the module-level ``df``. For the "simplified" / "simplified_vax"
    types the frame is scored as-is; for every other type the texts are first
    classified against ``theme_labels`` and predictions at or below
    ``threshold`` are blanked out. ``mlb_transform`` then filters the scored
    frame in place - for the simplified types that frame is the module-level
    ``df`` itself.

    Args:
        exp_type: experiment name; logged as the ``experiment_type`` param
            and used by ``mlb_transform`` to pick classes and columns.
        theme_labels: candidate labels for the zero-shot classifier (unused
            for the simplified types).
        theme_dict: parent -> child-label mapping used to roll predictions
            up to parent themes (unused for the simplified types).
        threshold: confidence a prediction must exceed to be kept; defaults
            to the previously hard-coded 0.79.

    Logged metrics: macro/micro F1, weighted precision and recall, one
    Jaccard score per class (keyed by class name), and ``<theme>-true``
    entries holding each reviewed parent theme's share of all reviewed
    theme assignments.
    """
    exp_name = "/Users/vpeng@rockfound.org/taxonomy_experiment"
    mlflow.set_experiment(exp_name)
    run_name = "simplified_taxonomy_run"

    # The context manager ends the run even when classification or scoring
    # raises, so a failed experiment no longer leaves an active run behind.
    with mlflow.start_run(run_name=run_name, nested=False):
        # get themes
        if exp_type in ("simplified", "simplified_vax"):
            themes_df = df
        else:
            themes_df = classify_themes(df, theme_labels)
            themes_df = filter_threshold(themes_df, threshold)

        # get scores
        (
            macro_score,
            micro_score,
            precision,
            recall,
            class_names,
            jaccard_scores,
        ) = mlb_transform(exp_type, themes_df, theme_dict)
        jaccard_dict = dict(zip(class_names, jaccard_scores))

        # Computed after mlb_transform on purpose: it drops unscored rows in
        # place, so the distribution describes exactly the rows that were scored.
        true_values = (
            themes_df["themeIdsReviewedParent"]
            .explode()
            .value_counts(normalize=True)
            .to_dict()
        )
        true_dict = {key + "-true": value for key, value in true_values.items()}

        metrics = {
            "macro_f1": macro_score,
            "micro_f1": micro_score,
            "precision": precision,
            "recall": recall,
        }
        metrics.update(jaccard_dict)
        metrics.update(true_dict)

        mlflow.log_params({"experiment_type": exp_type})
        mlflow.log_metrics(metrics)

In [0]:
# Parent theme -> child labels that roll up to it, as consumed by map_themes.
# Each value is a one-element list whose single string is a full comma-joined
# candidate label; map_themes does a list-membership test, so the string must
# equal the classifier label exactly (the entries in `child_labels`).
theme_dict = {
    "conspiracy-corruption": ["bioweapon, conspiracy, corruption, media-bias, medical-exploitation"],
    "illness-cause": ["stigmatization, case-reporting, symptoms-severity, variants"],
    "intervention-capacity":["public health capacity"],
    "prevention-treatment-alternative": ["alternative cures, religious practices"],
    "prevention-treatment-approved": ["prevention, collective prevention, individual treatment, vaccine efficacy, vaccine side effects"]
}

In [0]:
# Close any run left active by an earlier cell, then log the "simplified"
# experiment, which scores the existing themeIdsParent column directly
# (no classifier call, so the labels and mapping are not consulted).
mlflow.end_run()
run_taxonomy_experiment(
    exp_type="simplified",
    theme_labels=theme_labels,
    theme_dict=theme_dict,
)

In [0]:
# Display the parent-theme labels derived from the data.
theme_labels

In [0]:
# Close any run left active by an earlier cell, then log the "simplified_vax"
# experiment (existing themeIdsParent column, scored with the extra
# `vaccines` class).
mlflow.end_run()
# The simplified path never reads the child->parent mapping, so pass the
# already-defined `theme_dict`: `theme_dict_vax` is only created in a later
# cell, and referencing it here raises NameError on a top-to-bottom run.
run_taxonomy_experiment("simplified_vax", theme_labels, theme_dict)

# For when you need to troubleshoot

In [0]:
# Read in the data export (non-vaccine variant) and keep only the labeled split.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading - only
# point it at trusted files.
df = pd.read_pickle("./model_training_data.pkl")
# .copy() makes the filtered rows an independent frame, so the column assignment
# below does not operate on a slice of the full export (SettingWithCopyWarning).
df = df[df["split"] == "labeled"].copy()
# Fall back to the English translation where the original text is missing.
df["text"] = df["text"].fillna(df["textTranslated.en"])
# Rows with no text at all cannot be classified.
df = df.dropna(subset=["text"])
print(df.shape)

In [0]:
#themes_df = df
# Parent themes of the non-vaccine taxonomy (same five classes mlb_transform
# uses when the experiment name does not contain "vax").
themes = [
    'prevention-treatment-approved',
    'prevention-treatment-alternative',
    'illness-cause',
    'intervention-capacity',
    'conspiracy-corruption',
]

In [0]:
# Fit a binarizer on the single sample `themes`, so its classes are exactly
# the parent themes listed there.
mlb = MultiLabelBinarizer()
mlb.fit([themes])

In [0]:
# `pred_col` only exists as a local variable inside mlb_transform, so using it
# here raised NameError. "themeIdsParent" is the column mlb_transform scores
# for the "simplified" experiment types.
pred_col = "themeIdsParent"
# Mirror mlb_transform's missing-value filter, but on a separate frame so that
# troubleshooting does not alter `df`; rows with a missing label list cannot
# be binarized.
scored_df = df.dropna(subset=[pred_col, "themeIdsReviewedParent"])
y_pred = mlb.transform(scored_df[pred_col])
y_true = mlb.transform(scored_df["themeIdsReviewedParent"])

In [0]:
# Score the "simplified" setup directly, without logging anything to MLflow.
# Note that mlb_transform drops unscored rows from `df` in place.
(
    macro_score,
    micro_score,
    precision,
    recall,
    class_names,
    jaccard_scores,
) = mlb_transform(exp_type="simplified", df=df, theme_dict=theme_dict)

# Run experiment with child themes

In [0]:
# Candidate labels for the child-theme experiment: one comma-joined string of
# child themes per parent theme. Each string must stay identical to the
# matching entry in `theme_dict`, which map_themes uses to map it back.
child_labels = [
    "alternative cures, religious practices",
    "bioweapon, conspiracy, corruption, media-bias, medical-exploitation",
    "stigmatization, case-reporting, symptoms-severity, variants",
    "public health capacity",
    "prevention, collective prevention, individual treatment, vaccine efficacy, vaccine side effects",
]

In [0]:
#no_predictions = filter_rows_with_empty_column(test2, "themeIds")

In [0]:
# Close any run left active by an earlier cell, then classify against the
# child-theme labels and score the predictions after mapping them back to
# parent themes through `theme_dict`.
mlflow.end_run()
run_taxonomy_experiment(
    exp_type="child_labels",
    theme_labels=child_labels,
    theme_dict=theme_dict,
)

#### Child themes - Vaccines

In [0]:
# Candidate labels for the child-theme experiment with vaccines split out into
# their own label. Each string must stay identical to the matching entry in
# `theme_dict_vax`, which map_themes uses to map it back to a parent theme.
child_labels_vax = [
    "alternative cures, religious practices",
    "bioweapon, conspiracy, corruption, media-bias, medical-exploitation",
    "stigmatization, case-reporting, symptoms-severity, variants",
    "public health capacity",
    "prevention, collective prevention, individual treatment",
    "vaccine efficacy, vaccine side effects",
]

# Parent theme -> child labels for the vaccine taxonomy (adds a separate
# "vaccines" parent). Values are one-element lists; map_themes does a
# list-membership test, so each string must equal the corresponding entry of
# `child_labels_vax` exactly.
theme_dict_vax = {
    "conspiracy-corruption": ["bioweapon, conspiracy, corruption, media-bias, medical-exploitation"],
    "illness-cause": ["stigmatization, case-reporting, symptoms-severity, variants"],
    "intervention-capacity":["public health capacity"],
    "prevention-treatment-alternative": ["alternative cures, religious practices"],
    "prevention-treatment-approved": ["prevention, collective prevention, individual treatment"],
    "vaccines": ["vaccine efficacy, vaccine side effects"]
}

In [0]:
# Close any run left active by an earlier cell, then run the child-theme
# experiment with the vaccine taxonomy ("vax" in the name makes mlb_transform
# score the extra `vaccines` class).
mlflow.end_run()
run_taxonomy_experiment(
    exp_type="child_labels_vax",
    theme_labels=child_labels_vax,
    theme_dict=theme_dict_vax,
)

# Run experiment with extra words

In [0]:
# Candidate labels enriched with extra descriptive words, one string per parent
# theme. Each string must stay identical to the matching value in
# `theme_dict_extra`, which map_themes uses to map it back.
# NOTE(review): the last label ends in "vaccine efficacy vaccines" - possibly a
# missing comma; if changed, change `theme_dict_extra` identically.
extra_labels = [
    "alternative cures, herbal remedies, home remedies, healers and healing, religious belief, religious leader, cultural practices",
    "biological weapon, chemical agent, nefarious plots, economic exploitation, profiteering, extortion, media slant and bias, fake news, medical exploitation, experimental treatments, expired medicine, guinea pigs",
    "stigmatization, case reporting, symptoms severity, disease variants, disease genetic modifications",
    "capacity of public health system (hospitals, doctors, governments, aid)",
    "collective prevention, lockdowns, travel bans, travel restrictions, individual prevention, non-pharmaceutical interventions, quarantine, face masks, hand washing, vaccine side effects, vaccine efficacy vaccines",
]

In [0]:
# Parent theme -> enriched label text. Values are plain strings (not lists), so
# map_themes' `theme in values` check is a substring test; a predicted label
# maps back only if it occurs verbatim inside the value (the entries of
# `extra_labels` are these same strings).
# NOTE(review): "vaccine efficacy vaccines" in the last value may be missing a
# comma; if changed, change `extra_labels` identically.
theme_dict_extra = {
    "conspiracy-corruption": "biological weapon, chemical agent, nefarious plots, economic exploitation, profiteering, extortion, media slant and bias, fake news, medical exploitation, experimental treatments, expired medicine, guinea pigs",
    "illness-cause": "stigmatization, case reporting, symptoms severity, disease variants, disease genetic modifications",
    "intervention-capacity":"capacity of public health system (hospitals, doctors, governments, aid)",
    "prevention-treatment-alternative": "alternative cures, herbal remedies, home remedies, healers and healing, religious belief, religious leader, cultural practices",
    "prevention-treatment-approved": "collective prevention, lockdowns, travel bans, travel restrictions, individual prevention, non-pharmaceutical interventions, quarantine, face masks, hand washing, vaccine side effects, vaccine efficacy vaccines"
}

In [0]:
# Close any run left active by an earlier cell, then run the experiment that
# uses the enriched labels and maps them back through `theme_dict_extra`.
mlflow.end_run()
run_taxonomy_experiment(
    exp_type="extra_labels",
    theme_labels=extra_labels,
    theme_dict=theme_dict_extra,
)

# Run experiment with new key words

In [0]:
# Candidate labels built from key words, one string per parent theme. Each
# string must stay identical to the matching value in `theme_dict_key`, which
# map_themes uses to map it back.
# NOTE(review): "exhortion" looks like a typo for "extortion"; if corrected,
# correct `theme_dict_key` identically.
key_words = [
    "herbal remedies, home remedies, healers, religious practices, religious beliefs",
    "conspiracy, corruption, bioweapon, exploitation, exhortion, profiteering, fake news, bias",
    "case reporting, disease symptoms, disease transmission, disease severity, disease lethality, disease variants, stigmatization",
    "hospital capacity, government capacity, interventions restrictions, lockdowns",
    "disease treatment, medicine, vaccines, vaccine safety, vaccine side effects, vaccine efficacy",
]

In [0]:
# Parent theme -> key-word label text. Values are plain strings, so map_themes'
# `theme in values` check is a substring test; the entries of `key_words` are
# these same strings.
# NOTE(review): "exhortion" looks like a typo for "extortion"; if corrected,
# correct `key_words` identically or the mapping stops matching.
theme_dict_key = {
    "prevention-treatment-alternative": "herbal remedies, home remedies, healers, religious practices, religious beliefs",
    "conspiracy-corruption": "conspiracy, corruption, bioweapon, exploitation, exhortion, profiteering, fake news, bias",
    "illness-cause": "case reporting, disease symptoms, disease transmission, disease severity, disease lethality, disease variants, stigmatization",
    "intervention-capacity":"hospital capacity, government capacity, interventions restrictions, lockdowns",
    "prevention-treatment-approved": "disease treatment, medicine, vaccines, vaccine safety, vaccine side effects, vaccine efficacy"
}

In [0]:
# Close any run left active by an earlier cell, then run the experiment that
# uses the key-word labels and maps them back through `theme_dict_key`.
mlflow.end_run()
run_taxonomy_experiment(
    exp_type="new_key_words",
    theme_labels=key_words,
    theme_dict=theme_dict_key,
)

#### New key words - vax

In [0]:
# Key-word candidate labels with vaccines split out into their own label. Each
# string must stay identical to the matching value in `theme_dict_key_vax`.
# NOTE(review): "exhortion" looks like a typo for "extortion"; if corrected,
# correct `theme_dict_key_vax` identically.
key_words_vax = [
    "herbal remedies, home remedies, healers, religious practices, religious beliefs",
    "conspiracy, corruption, bioweapon, exploitation, exhortion, profiteering, fake news, bias",
    "case reporting, disease symptoms, disease transmission, disease severity, disease lethality, disease variants, stigmatization",
    "hospital capacity, government capacity, interventions restrictions, lockdowns",
    "disease treatment, medicine",
    "vaccines, vaccine safety, vaccine side effects, vaccine efficacy",
]

# Parent theme -> key-word label text for the vaccine taxonomy (separate
# "vaccines" parent). Values are plain strings, so map_themes' `theme in values`
# check is a substring test; the entries of `key_words_vax` are these same
# strings.
# NOTE(review): "exhortion" looks like a typo for "extortion"; if corrected,
# correct `key_words_vax` identically or the mapping stops matching.
theme_dict_key_vax = {
    "prevention-treatment-alternative": "herbal remedies, home remedies, healers, religious practices, religious beliefs",
    "conspiracy-corruption": "conspiracy, corruption, bioweapon, exploitation, exhortion, profiteering, fake news, bias",
    "illness-cause": "case reporting, disease symptoms, disease transmission, disease severity, disease lethality, disease variants, stigmatization",
    "intervention-capacity":"hospital capacity, government capacity, interventions restrictions, lockdowns",
    "prevention-treatment-approved": "disease treatment, medicine",
    "vaccines": "vaccines, vaccine safety, vaccine side effects, vaccine efficacy"
}

In [0]:
# Close any run left active by an earlier cell, then run the key-word
# experiment with the vaccine taxonomy ("vax" in the name makes mlb_transform
# score the extra `vaccines` class).
mlflow.end_run()
run_taxonomy_experiment(
    exp_type="new_key_words_vax",
    theme_labels=key_words_vax,
    theme_dict=theme_dict_key_vax,
)

# Run experiment with natural-language label descriptions

In [0]:
# Candidate labels phrased as full natural-language descriptions, one per
# parent theme. map_themes maps a predicted label back only if it occurs
# verbatim inside the matching value of `theme_dict_nlp`.
# NOTE(review): the labels contain apparent typos ("symptions", "tranmissions",
# "safet", "proscribed"); if corrected, correct `theme_dict_nlp` identically.
nlp_words = [
    "Discussion of herbal remedies, alternative cures, religious practices, healers, and other home cures for disease prevention and treatment",
    "Discussion of conspiracy, corruption, exploitation, profiteering, fake news, bias and other expressions of distrust in public health",
    "Discussion of disease outbreaks, symptions, severity, tranmissions, variants and stigmatization",
    "Discussion of hospital capacity, government capacity, interventions, restrictions, lockdowns, and general ability of the public health system to meet needs",
    "Discussion of medical treatment for diseases including medicine and procedures proscribed by a medical professional. Discussion of vaccines including vaccine safet, side effects, and efficacy.",
]

In [0]:
# Parent theme -> natural-language label description. Values are plain strings,
# so map_themes' `theme in values` check is a substring test: a predicted label
# maps back to its parent only if it occurs verbatim inside the value.
#
# The "prevention-treatment-approved" value now ends with the same trailing
# period as the corresponding classifier label in `nlp_words`. Without it the
# label (which ends in "efficacy.") was never a substring of the value, so that
# parent theme could never be predicted in the "nlp_words" experiment.
#
# NOTE(review): the descriptions contain apparent typos ("symptions",
# "tranmissions", "safet", "proscribed"); if corrected, correct `nlp_words`
# identically or the mapping stops matching.
theme_dict_nlp = {
    "prevention-treatment-alternative": "Discussion of herbal remedies, alternative cures, religious practices, healers, and other home cures for disease prevention and treatment",
    "conspiracy-corruption": "Discussion of conspiracy, corruption, exploitation, profiteering, fake news, bias and other expressions of distrust in public health",
    "illness-cause": "Discussion of disease outbreaks, symptions, severity, tranmissions, variants and stigmatization",
    "intervention-capacity": "Discussion of hospital capacity, government capacity, interventions, restrictions, lockdowns, and general ability of the public health system to meet needs",
    "prevention-treatment-approved": "Discussion of medical treatment for diseases including medicine and procedures proscribed by a medical professional. Discussion of vaccines including vaccine safet, side effects, and efficacy.",
}

In [0]:
# Close any run left active by an earlier cell, then run the experiment that
# uses the natural-language label descriptions and maps them back through
# `theme_dict_nlp`.
mlflow.end_run()
run_taxonomy_experiment(
    exp_type="nlp_words",
    theme_labels=nlp_words,
    theme_dict=theme_dict_nlp,
)

#### Natural-language label descriptions - Vaccines

In [0]:
# Natural-language candidate labels with vaccines split out into their own
# description. Each string must stay identical to the matching value in
# `theme_dict_nlp_vax`, which map_themes uses to map it back.
# NOTE(review): the labels contain apparent typos ("symptions", "tranmissions",
# "safet", "proscribed"); if corrected, correct `theme_dict_nlp_vax` identically.
nlp_words_vax = [
    "Discussion of herbal remedies, alternative cures, religious practices, healers, and other home cures for disease prevention and treatment",
    "Discussion of conspiracy, corruption, exploitation, profiteering, fake news, bias and other expressions of distrust in public health",
    "Discussion of disease outbreaks, symptions, severity, tranmissions, variants and stigmatization",
    "Discussion of hospital capacity, government capacity, interventions, restrictions, lockdowns, and general ability of the public health system to meet needs",
    "Discussion of medical treatment for diseases including medicine and procedures proscribed by a medical professional",
    "Discussion of vaccines including vaccine safet, side effects, and efficacy",
]

In [0]:
# Parent theme -> natural-language label description for the vaccine taxonomy
# (separate "vaccines" parent). Values are plain strings, so map_themes'
# `theme in values` check is a substring test; the entries of `nlp_words_vax`
# are these same strings.
# NOTE(review): the descriptions contain apparent typos ("symptions",
# "tranmissions", "safet", "proscribed"); if corrected, correct `nlp_words_vax`
# identically or the mapping stops matching.
theme_dict_nlp_vax = {
    "prevention-treatment-alternative": "Discussion of herbal remedies, alternative cures, religious practices, healers, and other home cures for disease prevention and treatment",
    "conspiracy-corruption": "Discussion of conspiracy, corruption, exploitation, profiteering, fake news, bias and other expressions of distrust in public health",
    "illness-cause": "Discussion of disease outbreaks, symptions, severity, tranmissions, variants and stigmatization",
    "intervention-capacity":"Discussion of hospital capacity, government capacity, interventions, restrictions, lockdowns, and general ability of the public health system to meet needs",
    "prevention-treatment-approved": "Discussion of medical treatment for diseases including medicine and procedures proscribed by a medical professional",
    "vaccines": "Discussion of vaccines including vaccine safet, side effects, and efficacy"
}

In [0]:
# Close any run left active by an earlier cell, then run the natural-language
# experiment with the vaccine taxonomy ("vax" in the name makes mlb_transform
# score the extra `vaccines` class).
mlflow.end_run()
run_taxonomy_experiment(
    exp_type="nlp_words_vax",
    theme_labels=nlp_words_vax,
    theme_dict=theme_dict_nlp_vax,
)