In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import ParameterGrid
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator
from accelerate.utils import gather_object
from statistics import mean
import transformers
import torch, time, json
import pandas as pd
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:40"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

accelerator = Accelerator()

# load a base model and tokenizer
access_token = #your token here

model_name = 'meta-llama/Meta-Llama-3.1-8B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token, cache_dir='./')
model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token, cache_dir='./',
                                             device_map='auto',
                                             torch_dtype=torch.bfloat16)

In [2]:
def compute_metrics(predictions, references, labels=None, pos_label=1, average="weighted", sample_weight=None, zero_division='warn'):
        f1 = f1_score(
            references, predictions, labels=labels, pos_label=pos_label, average=average, sample_weight=sample_weight
        )
        p = precision_score(
            references, predictions, labels=labels, pos_label=pos_label, average=average, sample_weight=sample_weight,
            zero_division=zero_division
        )
        r = recall_score(
            references, predictions, labels=labels, pos_label=pos_label, average=average, sample_weight=sample_weight,
            zero_division=zero_division
        )
        c = classification_report(
            references, predictions, labels=labels
        )
        print(c)
        return {"f1": float(f1) if f1.size == 1 else f1,
                "precision": float(p) if p.size == 1 else p,
                "recall": float(r) if r.size == 1 else r}

def train_sentence_selection(df, n_neutral, n_positive, n_negative, label, seed):
    # saving column name given PT or MD label
    label = f"{label}_label"
    # randomly selecting 1-2 sentences per label
    neutral_sentences = df[df[label] == "neutral"].sample(n_neutral, replace=False, random_state=seed)
    positive_sentences = df[df[label] == "positive"].sample(n_positive, replace=False, random_state=seed)
    negative_sentences = df[df[label] == "negative"].sample(n_negative, replace=False, random_state=seed)
    all_sentences = pd.concat([neutral_sentences,
                               positive_sentences,
                               negative_sentences], ignore_index = True)
    return all_sentences

### Zero-shot approach
Below is code to evaluate the model with a zero-shot approach.
Update the right dataset and file path.

In [2]:
## zero shot approach
accelerator.wait_for_everyone()
start=time.time()

dataset = "80" #example, evaluating the model on sentences with at least 80% agreement

test = pd.read_csv(f"yourpath/data/test_{dataset}.csv")

# converting test sentences to json format
json_test_sentences = test["language"].to_json()

# creating context prompt

SYSTEM_PROMPT = """
<|start_header_id|>system<|end_header_id|>
You are a doctor familair with medical jargon that writes many clinical notes about patients.<|eot_id|>
"""

PROMPTS = []

for sentence in test["language"]:
    
    USER_PROMPT_1 = f"""
    <|start_header_id|>user<|end_header_id|>
    Your task is to analyze the sentiment of a sentences you wrote about a patient.
    For each sentence, what is your attitude towards the patient you wrote about?
    Answer the question by assigning a sentiment score of negative, neutral, or positive for the sentence.
    Only output your sentiment score in JSON format for this sentence:
    {{"0":\"{sentence}\"}}<|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """
    
    PROMPT = [f"""
    {SYSTEM_PROMPT}
    {USER_PROMPT_1}
    """]
    PROMPTS.append(PROMPT)


with accelerator.split_between_processes(PROMPTS) as prompts:
    # store output of generations in dict   
    results=dict(outputs=[])

    # have each GPU do inference, prompt by prompt
    for prompt in prompts:
        prompt_tokenized=tokenizer(prompt, return_tensors="pt").to("cuda")
        output_tokenized = model.generate(**prompt_tokenized, max_new_tokens=10, do_sample=True, temperature=0.001, pad_token_id=tokenizer.eos_token_id)[0]

        # remove prompt from output
        output_tokenized=output_tokenized[len(prompt_tokenized["input_ids"][0]):]

        # store outputs and number of tokens in result{}
        results["outputs"].append( tokenizer.decode(output_tokenized) )
#         results["num_tokens"].append(len(output_tokenized))

        
    results=[ results ] # transform to list, otherwise gather_object() will not collect correctly

    # collect results from all the GPUs
results_gathered=gather_object(results)
# print(results_gathered)

results = results_gathered[0]['outputs']
parsed_results = []
for label in results:
    label = label.strip().replace('<|eot_id|>', '')
    parsed_results.append(label)
    
parsed_data = [json.loads(item) for item in parsed_results]
df = pd.DataFrame(parsed_data)
df.columns = ['Model_label']

results = compute_metrics(df['Model_label'], test["MD_label"])
print(results)

error_analysis = pd.concat([test[["idx", "language", "MD_PT_label", "MD_label"]], df['Model_label']], axis=1)
error_analysis = error_analysis.rename(columns={'Model_label':"pred", "MD_label":"true"})
mask = error_analysis["pred"] == error_analysis["true"]
error_analysis = error_analysis[~ mask]
error_analysis.to_csv(f'MD_error_analysis_zero_shot_validation_{dataset}.csv', index=False)
print('-' * 100)
print('\n\n')


NameError: name 'accelerator' is not defined

In [6]:
## best CLI performance on test sentences
accelerator.wait_for_everyone()
start=time.time()

dataset = "60"

train = pd.read_csv(f"/sc/arion/projects/mscic1/psych_nlp/sentiment_analysis/MD_task/LLAMA3/best_CLI_sentences_{dataset}.csv")
test = pd.read_csv(f"/sc/arion/projects/mscic1/psych_nlp/sentiment_analysis/data/validation_sentences.csv")

#context sentences in json format
json_train_sentences = train["language"].to_json()
json_train_labels = train["MD_label"].to_json()

# converting test sentences to json format
json_test_sentences = test["language"].to_json()

# creating context prompt

SYSTEM_PROMPT = """
<|start_header_id|>system<|end_header_id|>
You are a doctor familair with medical jargon that writes many clinical notes about patients.<|eot_id|>
"""

USER_PROMPT_1 = f"""
<|start_header_id|>user<|end_header_id|>
Your task is to analyze the sentiment of a series of sentences you wrote about patients.
For each sentence, what is your attitude towards the patient you wrote about?
Answer the question by assigning a sentiment score of negative, neutral, or positive for each sentence.
Output your anser in JSON format. Don’t add explanation beyond the JSON.
Below are some example sentences in JSON format:
{json_train_sentences}<|eot_id|>
"""

ASSISTANT_PROMPT = f"""
<|start_header_id|>assistant<|end_header_id|>
f{json_train_labels}<|eot_id|>
"""

PROMPTS = []

for sentence in test["language"]:
    
    USER_PROMPT_2 = f"""
    <|start_header_id|>user<|end_header_id|>
    Complete the same task with this sentence and only return your sentiment score in JSON format:\n"{sentence}"<|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
    """
    
    PROMPT = [f"""
    {SYSTEM_PROMPT}
    {USER_PROMPT_1}
    {ASSISTANT_PROMPT}
    {USER_PROMPT_2}
    """]
    PROMPTS.append(PROMPT)


with accelerator.split_between_processes(PROMPTS) as prompts:
    # store output of generations in dict   
    results=dict(outputs=[])

    # have each GPU do inference, prompt by prompt
    for prompt in prompts:
        prompt_tokenized=tokenizer(prompt, return_tensors="pt").to("cuda")
        output_tokenized = model.generate(**prompt_tokenized, max_new_tokens=10, do_sample=True, temperature=0.001, pad_token_id=tokenizer.eos_token_id)[0]

        # remove prompt from output
        output_tokenized=output_tokenized[len(prompt_tokenized["input_ids"][0]):]

        # store outputs and number of tokens in result{}
        results["outputs"].append( tokenizer.decode(output_tokenized) )
#         results["num_tokens"].append(len(output_tokenized))

        
    results=[ results ] # transform to list, otherwise gather_object() will not collect correctly

    # collect results from all the GPUs
results_gathered=gather_object(results)

# if accelerator.is_main_process:
#     timediff=time.time()-start
#     num_tokens=sum([r["num_tokens"] for r in results_gathered ])

#     print(f"tokens/sec: {num_tokens//timediff}, time {timediff}, total tokens {num_tokens}, total prompts {len(PROMPT)}")

results = results_gathered[0]['outputs']
parsed_results = []
for label in results:
    label = label.strip().replace('<|eot_id|>', '')
    parsed_results.append(label)
    
parsed_data = [json.loads(item) for item in parsed_results]
df = pd.DataFrame(parsed_data)
df.columns = ['Model_label']

results = compute_metrics(df['Model_label'], test["MD_label"])
print(results)

error_analysis = pd.concat([test[["idx", "language", "MD_PT_label", "MD_label"]], df['Model_label']], axis=1)
# print(error_analysis)
error_analysis = error_analysis.rename(columns={'Model_label':"pred", "MD_label":"true"})
mask = error_analysis["pred"] == error_analysis["true"]
error_analysis = error_analysis[~ mask]
error_analysis.to_csv(f'error_analysis_test_{dataset}.csv', index=False)
print('-' * 100)
print('\n\n')

              precision    recall  f1-score   support

    negative       0.83      1.00      0.91        10
     neutral       0.00      0.00      0.00         3
    positive       0.67      1.00      0.80         2

    accuracy                           0.80        15
   macro avg       0.50      0.67      0.57        15
weighted avg       0.64      0.80      0.71        15

{'f1': 0.7127272727272727, 'precision': 0.6444444444444445, 'recall': 0.8}
----------------------------------------------------------------------------------------------------





  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
results_gathered

[{'outputs': [' {\n  "sentiment": "positive"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "positive"\n}',
   ' {\n  "sentiment": "positive"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "neutral"\n}',
   ' {\n  "sentiment": "neutral"\n}',
   ' {\n  "sentiment": "neutral"\n}',
   ' {\n  "sentiment": "positive"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "neutral"\n}',
   ' {\n  "sentiment": "positive"\n}',
   ' {\n  "sentiment": "positive"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "positive"\n}',
   ' {\n  "sentiment": "positive"\n}',
   ' {\n  "sentiment": "neutral"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "negative"\n}',
   ' {\n  "sentiment": "positive"\n}',
   ' {\n  "sentimen

In [3]:
# CLI experiment

accelerator.wait_for_everyone()
start=time.time()
# model.generation_config.temperature=None

params = {
    'seed': [42],
    'n_neutral_sentences': [0, 1, 2, 3, 4, 5, 6],
    'n_positive_sentences': [0, 1, 2],
    'n_negative_sentences': [0, 1, 2, 3]
}

dataset = "60"
metrics_file = f'MD_context_metrics_{dataset}.csv'
if os.path.isfile(metrics_file):
    f = open(metrics_file, 'a')
else:
    f = open(metrics_file, 'w')
    f.write('seed,n_neutral_sentences,n_positive_sentences,n_negative_sentences,f1,precision,recall\n')

best_model = []
best_f1 = 0.0
best_comb, best_results = None, None
for comb in list(ParameterGrid(params)):
    print(f"Testing with: {comb['n_negative_sentences']} negative, {comb['n_neutral_sentences']} neutral, {comb['n_positive_sentences']} positive.")
    train = pd.read_csv(f"/sc/arion/projects/mscic1/psych_nlp/sentiment_analysis/data/train_{dataset}.csv")
    test = pd.read_csv(f"/sc/arion/projects/mscic1/psych_nlp/sentiment_analysis/data/test_{dataset}.csv")
    # randomly selecting context sentences in json format
    train_sentences = train_sentence_selection(train,
                                               comb['n_neutral_sentences'],
                                               comb['n_positive_sentences'],
                                               comb['n_negative_sentences'],
                                               "MD",
                                               comb['seed'])
    json_train_sentences = train_sentences["language"].to_json()
    json_train_labels = train_sentences["MD_label"].to_json()

    # converting test sentences to json format
    json_test_sentences = test["language"].to_json()
    sentence_count = test['language'].size

    SYSTEM_PROMPT = """
    <|start_header_id|>system<|end_header_id|>
    You are a doctor familair with medical jargon that writes many clinical notes about patients.<|eot_id|>
    """

    USER_PROMPT_1 = f"""
    <|start_header_id|>user<|end_header_id|>
    Your task is to analyze the sentiment of a sentence you wrote about a patient.
    For each sentence, what is your attitude towards the patient you wrote about?
    Answer the question by assigning a sentiment score of negative, neutral, or positive for the sentence.
    Output your anser in JSON format. Don’t add explanation beyond the JSON.
    Below are some example sentences in JSON format:
    {json_train_sentences}<|eot_id|>
    """

    ASSISTANT_PROMPT = f"""
    <|start_header_id|>assistant<|end_header_id|>
    f{json_train_labels}<|eot_id|>
    """

    PROMPTS = []

    for sentence in test["language"]:

        USER_PROMPT_2 = f"""
        <|start_header_id|>user<|end_header_id|>
        Complete the same task with this sentence and only return your sentiment score in JSON format:\n"{sentence}"<|eot_id|>
        <|start_header_id|>assistant<|end_header_id|>
        """

        PROMPT = [f"""
        {SYSTEM_PROMPT}
        {USER_PROMPT_1}
        {ASSISTANT_PROMPT}
        {USER_PROMPT_2}
        """]
        PROMPTS.append(PROMPT)


    with accelerator.split_between_processes(PROMPTS) as prompts:
        # store output of generations in dict   
        results=dict(outputs=[])

        # have each GPU do inference, prompt by prompt
        for prompt in prompts:
            prompt_tokenized=tokenizer(prompt, return_tensors="pt").to("cuda")
            output_tokenized = model.generate(**prompt_tokenized, max_new_tokens=10, do_sample=True, temperature=0.001, pad_token_id=tokenizer.eos_token_id)[0]

            # remove prompt from output
            output_tokenized=output_tokenized[len(prompt_tokenized["input_ids"][0]):]

            # store outputs and number of tokens in result{}
            results["outputs"].append( tokenizer.decode(output_tokenized) )
    #         results["num_tokens"].append(len(output_tokenized))


        results=[ results ] # transform to list, otherwise gather_object() will not collect correctly

    # collect results from all the GPUs
    results_gathered=gather_object(results)

    # if accelerator.is_main_process:
    #     timediff=time.time()-start
    #     num_tokens=sum([r["num_tokens"] for r in results_gathered ])

    #     print(f"tokens/sec: {num_tokens//timediff}, time {timediff}, total tokens {num_tokens}, total prompts {len(PROMPT)}")

    results = results_gathered[0]['outputs']
    parsed_results = []
    for label in results:
        label = label.strip().replace('<|eot_id|>', '')
        parsed_results.append(label)

    parsed_data = [json.loads(item) for item in parsed_results]
    df = pd.DataFrame(parsed_data)
    df.columns = ['Model_label']

    results = compute_metrics(df['Model_label'], test["MD_label"])
    print(results)

    
    # saving results to metrics sheet
    v = [comb['seed'], comb['n_neutral_sentences'], comb['n_positive_sentences'], comb['n_negative_sentences'],
         results['f1'], results['precision'], results['recall']]
    f.write(','.join([str(el) for el in v]) + '\n')

    if results['f1'] > best_f1:
        best_f1 = results['f1']
        best_comb = comb
        best_results = results
        best_CLI_sentences = train_sentences
        best_CLI_sentences.to_csv(f'best_CLI_sentences_{dataset}.csv', index=False)
        error_analysis = pd.concat([test[["idx", "language", "MD_PT_label", "MD_label"]], df['Model_label']], axis=1)
    #   print(error_analysis)
        error_analysis = error_analysis.rename(columns={'Model_label':"pred", "MD_label":"true"})
        mask = error_analysis["pred"] == error_analysis["true"]
        error_analysis = error_analysis[~ mask]
        error_analysis.to_csv(f'error_analysis_validation_ICL_{dataset}.csv', index=False)
    print('-' * 100)
    print('\n\n')
    print('-' * 100)
    print('\n\n')

    if best_comb is not None:
        print(f'Best combination of context sentences: {best_comb}')
        print('\n')
        print(f'Best results: {best_results}')
    torch.cuda.empty_cache()
f.close()

Testing with: 0 negative, 0 neutral, 0 positive.
              precision    recall  f1-score   support

    negative       0.75      1.00      0.86         9
     neutral       1.00      0.67      0.80        15
    positive       0.67      1.00      0.80         4

    accuracy                           0.82        28
   macro avg       0.81      0.89      0.82        28
weighted avg       0.87      0.82      0.82        28

{'f1': 0.8183673469387756, 'precision': 0.8720238095238095, 'recall': 0.8214285714285714}
----------------------------------------------------------------------------------------------------



----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 0, 'n_neutral_sentences': 0, 'n_positive_sentences': 0, 'seed': 42}


Best results: {'f1': 0.8183673469387756, 'precision': 0.8720238095238095, 'recall': 0.8214285714285714}
Testing with: 0 negative, 0 neutra