In [1]:
import pandas as pd
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModel
from scipy.stats import qmc
import torch
from nltk.tokenize import word_tokenize
import nltk
import sobol_seq
from vllm import LLM, SamplingParams

  from tqdm.autonotebook import tqdm, trange


## Processing the Medical Abstracts dataset

In [2]:
# Read the test CSV of the Medical Abstracts TC data and preview its first rows.
df_test = pd.read_csv(filepath_or_buffer="../data/raw/Medical_Abstracts_TC/medical_tc_test.csv")
df_test.head(n=5)

Unnamed: 0,condition_label,medical_abstract
0,3,Obstructive sleep apnea following topical orop...
1,5,Neutrophil function and pyogenic infections in...
2,5,A phase II study of combined methotrexate and ...
3,1,Flow cytometric DNA analysis of parathyroid tu...
4,4,Paraneoplastic vasculitic neuropathy: a treata...


In [3]:
# Read the lookup table that maps each numeric condition_label to its condition_name.
df_label = pd.read_csv(filepath_or_buffer="../data/raw/Medical_Abstracts_TC/medical_tc_labels.csv")
df_label.head(n=5)

Unnamed: 0,condition_label,condition_name
0,1,neoplasms
1,2,digestive system diseases
2,3,nervous system diseases
3,4,cardiovascular diseases
4,5,general pathological conditions


In [4]:
## joining both the datasets
# validate='many_to_one' makes pandas raise if a condition_label occurs more than
# once in df_label; without it a duplicated label would silently duplicate abstracts.
df_test_merged = pd.merge(
    df_test,
    df_label,
    on='condition_label',
    how='left',
    validate='many_to_one',
)
print(df_test_merged.shape)

# The numeric id is no longer needed once the readable name is attached.
# drop() returns a new frame, so this cell can be re-run without side effects.
df_test_label = df_test_merged.drop(columns=['condition_label'])
print(df_test_label.shape)
print(df_test_label.columns)

# Work on the first 100 rows only (presumably to keep the run small - confirm).
# .copy() detaches the subset so later column assignments write to an
# independent frame instead of a slice of the merged data.
df_test_label = df_test_label.head(100).copy()

(2888, 3)
(2888, 2)
Index(['medical_abstract', 'condition_name'], dtype='object')


In [5]:
# Fetch the tokenizer resources requested by this notebook before tokenizing.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Token counter used to measure abstract length
def count_tokens(text):
    """Return how many tokens NLTK's ``word_tokenize`` produces for ``text``."""
    return len(word_tokenize(text))

# Store the per-abstract token count in a new column
df_test_label['abstract_token_count'] = df_test_label['medical_abstract'].map(count_tokens)

# Show the resulting frame
print(df_test_label)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/nyarrabolu_umass_edu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/nyarrabolu_umass_edu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                     medical_abstract  \
0   Obstructive sleep apnea following topical orop...   
1   Neutrophil function and pyogenic infections in...   
2   A phase II study of combined methotrexate and ...   
3   Flow cytometric DNA analysis of parathyroid tu...   
4   Paraneoplastic vasculitic neuropathy: a treata...   
..                                                ...   
95  Transient ischaemic attacks in young patients:...   
96  Incidence of perioperative myocardial ischemia...   
97  Crohn's disease in the city of Derby, 1951-85....   
98  Caffeine and cardiac arrhythmias. PURPOSE: To ...   
99  The surgical treatment of atrial fibrillation....   

                     condition_name  abstract_token_count  
0           nervous system diseases                   302  
1   general pathological conditions                   285  
2   general pathological conditions                   300  
3                         neoplasms                   315  
4           car

In [6]:
# Persist the processed frame. The target directory is created first because
# DataFrame.to_csv does not create missing parent directories.
processed_csv_path = "../data/processed/Medical_Abstract_Processed.csv"
os.makedirs(os.path.dirname(processed_csv_path), exist_ok=True)
df_test_label.to_csv(processed_csv_path)

In [7]:
# Keep only the abstracts whose token count is at most 100.
short_abstract_mask = df_test_label["abstract_token_count"].le(100)
df_test_label_modified = df_test_label.loc[short_abstract_mask]

# Report which condition names survive the filter and how many rows remain.
print(df_test_label_modified["condition_name"].unique())
print(df_test_label_modified.shape)

['cardiovascular diseases' 'general pathological conditions'
 'nervous system diseases' 'neoplasms']
(12, 3)


## K-shot implementation

In [8]:
# Distinct condition names present in the filtered subset.
pd.unique(df_test_label_modified["condition_name"])

array(['cardiovascular diseases', 'general pathological conditions',
       'nervous system diseases', 'neoplasms'], dtype=object)

In [9]:
def sobol_sampling_embedding(df: pd.DataFrame, k: int, model_name: str) -> list:
    """
    Select k (text, label) pairs from ``df`` at row positions taken from a 1-D Sobol sequence.

    The abstracts are encoded with the SentenceTransformer named by ``model_name``,
    but only the *number* of embeddings is used: every Sobol point is multiplied
    by that count and truncated to an integer row position. The embedding values
    themselves do not influence which rows are picked.

    Args:
        df (pd.DataFrame): DataFrame containing `medical_abstract` and `condition_name` columns.
        k (int): The number of samples needed.
        model_name (str): Name or path handed to ``SentenceTransformer``.

    Returns:
        list: A list of k tuples in the format (text, label), in Sobol order.
        The same row can appear more than once (always the case when k
        exceeds the number of rows).

    Raises:
        ValueError: If ``k`` is negative, or if ``df`` has no rows while k > 0.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k == 0:
        # Nothing to sample; skip loading the embedding model altogether.
        return []
    if len(df) == 0:
        raise ValueError("Cannot sample from an empty DataFrame")

    model = SentenceTransformer(model_name)

    # Step 1: Generate embeddings for the 'medical_abstract' column
    texts = df['medical_abstract'].tolist()
    labels = df['condition_name'].tolist()
    embeddings = model.encode(texts, convert_to_tensor=True)
    # NOTE(review): only the embedding count is consumed below - confirm whether
    # the sampling was meant to operate in embedding space.
    n_rows = len(embeddings)

    # Step 2: Sobol sequence sampling to pick indices
    sobol_points = sobol_seq.i4_sobol_generate(1, k).flatten()
    selected_indices = (sobol_points * n_rows).astype(int)
    # Sobol points are expected to lie in [0, 1); the clamp only guards against
    # a point equal to 1.0 mapping one position past the last row.
    selected_indices = np.minimum(selected_indices, n_rows - 1)

    # Step 3: Select k samples based on the Sobol indices
    return [(texts[position], labels[position]) for position in selected_indices]

In [10]:
data = df_test_label_modified.copy()
# Keep the gold label under an explicit name; process_batch reads this column.
data['actual_condition_name'] = data['condition_name']
condition_names = data["condition_name"].unique()
print(condition_names)

condition_names_str = ", ".join(condition_names)

# Adjacent string literals are concatenated verbatim, so every fragment ends
# with an explicit space; otherwise words fuse together ("indicatingwhether").
prompt = (
    "For Medical Abstract condition name Classification: Your task is to identify the condition name "
    "on a given medical abstract text and provide strictly a single word indicating "
    f"whether the condition is one of the following: {condition_names_str}. "
    "The medical abstract may contain any language or style of writing. "
    "Please ensure that your analysis takes into account the overall "
    "tone and context of the text. Your response should be concise and clear, providing a single word "
    "that accurately reflects the condition name of the given medical abstract. "
    "If there are multiple condition names present in the text, please choose the one that best "
    "represents the medical abstract conveyed by the author. Please note "
    "that your analysis should take into account all relevant factors, such as tone, language use, "
    "and content. Your response should also be flexible enough to allow for various types "
    "of input texts. "
)

# Same layout as the few-shot examples built by generate_specific_string:
# a single newline before "Medical_Abstract" (the previous "\nn" leaked a stray "n").
data['main_prompt'] = data['medical_abstract'].map(
    lambda abstract: f"Prompt: {prompt}\nMedical_Abstract : {abstract}\nCondition_Name:"
)

# NOTE(review): the k-shot examples are drawn from the same frame that is later
# scored, so an evaluated abstract can appear among the demonstrations.
medical_abstract_and_condition_name = sobol_sampling_embedding(
    data, k=5, model_name="sentence-transformers/all-mpnet-base-v2"
)
print(medical_abstract_and_condition_name)

def generate_specific_string(medical_abstract_and_condition_name, instruction=None):
    """Render (abstract, label) pairs as solved few-shot examples.

    Args:
        medical_abstract_and_condition_name: Iterable of (medical_abstract,
            condition_name) tuples.
        instruction: Instruction text placed after "Prompt: " in every example.
            When omitted, the module-level ``prompt`` is used, which is what
            the function always did before this parameter existed.

    Returns:
        str: The examples joined one after another, each ending in a newline,
        with leading/trailing whitespace of the whole string stripped. An empty
        input yields "".
    """
    if instruction is None:
        instruction = prompt
    examples = (
        f"Prompt: {instruction}\nMedical_Abstract : {medical_abstract}\nCondition_Name: {condition_name}\n"
        for medical_abstract, condition_name in medical_abstract_and_condition_name
    )
    return "".join(examples).strip()
# Render the sampled pairs once and show the resulting few-shot preamble.
specific_string = generate_specific_string(medical_abstract_and_condition_name)
print(specific_string)

# Prefix every query prompt with the preamble, separated by one newline.
k_shot_prefix = specific_string + "\n"
data['k_shot_prompt'] = data['main_prompt'].radd(k_shot_prefix)


['cardiovascular diseases' 'general pathological conditions'
 'nervous system diseases' 'neoplasms']
[('Ampullary tumor caused by metastatic renal cell carcinoma. In this paper we report the case of a renal cell carcinoma (RCC) metastatic to the ampullary region. The patient presented with severe anemia due to blood loss from the ampullary tumor 11 years after nephrectomy for the primary renal cancer. The diagnosis was established by means of endoscopy and biopsy. ', 'neoplasms'), ('Dental extraction for patients on oral anticoagulant therapy. Dental extraction in patients receiving long-term oral anticoagulant therapy is a controversial issue. Continuation of anticoagulation exposes the patient to serious hemorrhage, whereas cessation of therapy increases the risk of thromboembolism. Forty patients treated by coumarin underwent 63 tooth extractions, without a change in the therapeutic protocol of anticoagulation. The biologic adhesive Beriplast was used successfully to achieve local h

In [11]:
# Build the vLLM engine once; process_batch reads this module-level `llm`.
llm_engine_options = dict(
    model="meta-llama/Llama-3.2-3B-Instruct",
    dtype="half",
    max_model_len=4096,
)
llm = LLM(**llm_engine_options)

def process_batch(data_batch, model, sampling_params=None, prompt_column='main_prompt'):
    """Generate one completion per row of ``data_batch`` and pair it with the gold label.

    Args:
        data_batch (pd.DataFrame): Must contain ``prompt_column`` and
            'actual_condition_name'.
        model: Either an engine object exposing ``generate(prompts, sampling_params)``
            or anything else (e.g. a model-name string), in which case the
            module-level ``llm`` engine is used. Existing calls that pass a
            name therefore keep running on ``llm``.
        sampling_params: Sampling settings handed to the engine. Defaults to
            ``SamplingParams(temperature=0.7)``, the previously hard-coded value.
        prompt_column (str): Column holding the prompts to send; defaults to
            'main_prompt'. Pass 'k_shot_prompt' to run the few-shot prompts.

    Returns:
        pd.DataFrame: Columns 'prompt', 'predicted_label' (whitespace-stripped
        text of the first completion) and 'actual_condition_name', one row per
        input row. The columns are present even when the batch is empty.
    """
    result_columns = ['prompt', 'predicted_label', 'actual_condition_name']

    prompts = data_batch[prompt_column].tolist()
    actual_labels = data_batch['actual_condition_name'].tolist()
    if not prompts:
        # Keep the schema so downstream column access does not fail on an empty batch.
        return pd.DataFrame(columns=result_columns)

    engine = model if hasattr(model, "generate") else llm
    if sampling_params is None:
        sampling_params = SamplingParams(temperature=0.7)

    outputs = engine.generate(prompts, sampling_params)

    prediction_df_rows = []
    # Assumes the engine returns outputs in prompt order; the gold labels are
    # matched to the outputs purely by position.
    for index, (output, actual_label) in enumerate(zip(outputs, actual_labels)):
        prediction_df_rows.append({
            'prompt': output.prompt,
            'predicted_label': output.outputs[0].text.strip(),
            'actual_condition_name': actual_label,
        })

        # Print progress after every 100 data points
        if (index + 1) % 100 == 0:
            print(f"Processed {index + 1} data points.")

    return pd.DataFrame(prediction_df_rows, columns=result_columns)


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO 10-30 04:56:29 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='meta-llama/Llama-3.2-3B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llama-3.2-3B-Instruct, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_strea

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

INFO 10-30 04:56:30 model_runner.py:1056] Starting to load model meta-llama/Llama-3.2-3B-Instruct...
INFO 10-30 04:56:31 weight_utils.py:243] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 10-30 04:58:45 model_runner.py:1067] Loading model weights took 6.0160 GB
INFO 10-30 04:58:46 gpu_executor.py:122] # GPU blocks: 37370, # CPU blocks: 2340
INFO 10-30 04:58:46 gpu_executor.py:126] Maximum concurrency for 4096 tokens per request: 145.98x
INFO 10-30 04:58:49 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-30 04:58:49 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-30 04:58:59 model_runner.py:1523] Graph capturing finished in 11 secs.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
#### need to do inference here
prediction_data = process_batch(data, model="meta-llama/Llama-3.2-3B-Instruct")  ##model="unsloth/Llama-3.2-3B-Instruct"

# Lower-case and trim both label columns, echoing each one after normalisation.
for label_column, caption in (
    ('predicted_label', "Predicted Label:"),
    ('actual_condition_name', "Actual Label:"),
):
    prediction_data[label_column] = prediction_data[label_column].str.lower().str.strip()
    print(caption, prediction_data[label_column])

# 1 where the normalised prediction equals the normalised gold label, else 0.
# NOTE(review): this is an exact string comparison, so any generation containing
# more than the bare label counts as a miss.
is_exact_match = prediction_data['predicted_label'] == prediction_data['actual_condition_name']
prediction_data['Match'] = is_exact_match.astype(int)

accuracy = prediction_data['Match'].sum() / len(prediction_data)
print(f"Accuracy of the model on Medical_Abstract: {accuracy:.2%}")

Processed prompts:   0%|          | 0/12 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 12/12 [00:00<00:00, 48.95it/s, est. speed input: 13811.62 toks/s, output: 783.83 toks/s]

Predicted Label: 0     please select one of the following: cardiovasc...
1     please respond in one of the following ways:\n...
2     ?\n## step 1: identify the main topic of the m...
3     please provide a single word indicating whethe...
4     ?\n\nafter carefully reading and analyzing the...
5     please provide a single word indicating whethe...
6     _\n\nplease provide a single word that indicat...
7     ?\n\nbased on the content of the medical abstr...
8     ? \n\n## step 1: identify the main topic of th...
9     ___________\nclassification:_____________\n\na...
10    _____________\n\n## step 1:  analyze the medic...
11    _______________________________________\n\nana...
Name: predicted_label, dtype: object
Actual Label: 0             cardiovascular diseases
1     general pathological conditions
2             nervous system diseases
3     general pathological conditions
4     general pathological conditions
5     general pathological conditions
6                           neop


