# Load the Model

In [2]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.2 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m107.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [12]:
# The model that you want to train from the Hugging Face hub
base_model = "daryl149/llama-2-7b-chat-hf"

In [13]:
import json
# 加载JSON文件
with open('/content/drive/MyDrive/ChatDoctor-main/chatdoctor5k.json', 'r') as file:
    data = json.load(file)

# 移除所有的'instruction'键
for item in data:
    if 'instruction' in item:
        del item['instruction']

formatted_data = {"Q&A datafile": data}

# 将数据转换为JSON字符串，格式化输出
formatted_json_string = json.dumps(formatted_data, indent=4)

# 保存到TXT文件
with open('qa_datafile.txt', 'w') as file:
    file.write(formatted_json_string)

# 打印结果，查看格式
formatted_json_string

'{\n    "Q&A datafile": [\n        {\n            "input": "Doctor, I have been experiencing sudden and frequent panic attacks. I don\'t know what to do.",\n            "output": "Well, based on what you\'re telling me, it sounds like you may be suffering from panic disorder. The best course of action is to start with psychotherapy and mental health counseling. Additionally, we should conduct an electrocardiogram to make sure that there are no physical issues causing your panic attacks. We will also need to perform a depression screen and a toxicology screen to rule out any other underlying causes. Finally, I would recommend a comprehensive psychological and psychiatric evaluation and therapy to help manage your symptoms."\n        },\n        {\n            "input": "Doctor, I have been experiencing a hoarse voice for a few weeks now and it\'s not getting any better despite taking medication. What could be the problem?",\n            "output": "It\'s possible that you have a vocal cor

In [14]:
filename = 'qa_datafile.txt'
with open(filename, 'w', encoding='utf-8') as file:
    for item in data:
        file.write('Q: ' + item['input'] + '\n')
        file.write('A: ' + item['output'] + '\n\n')

In [15]:
print('qa_datafile.txt')

qa_datafile.txt


# Load the Data

In [16]:
import re
import json
from datasets import Dataset, Features, Value

# Path to your .txt file
QA = '/content/qa_datafile.txt'

def convert_to_datasets(filename):
  # Initialize an empty list to hold the formatted strings
  formatted_list = []

  # Initialize variables to keep track of the current question and answer
  current_question = None
  current_answer = None

  # Open the file and read line by line
  with open(filename, 'r', encoding='utf-8') as file:
      for line in file:
          # Check if the line starts with 'Q:' and capture the question
          if line.startswith('Q:'):
              current_question = line[3:].strip()  # Remove 'Q:' and strip leading/trailing whitespace

          # Check if the line starts with 'A:' or 'Answer:' and capture the answer
          elif line.startswith('A:') or line.startswith('Answer:'):
              if line.startswith('A:'):
                  current_answer = line[2:].strip()  # Remove 'A:' and strip leading/trailing whitespace
              else:
                  current_answer = line[7:].strip()  # Remove 'Answer:' and strip leading/trailing whitespace

              # Once we have a question and an answer, format and append to the list
              if current_question and current_answer:
                  formatted_list.append({'input': current_question, 'output': current_answer})
                  # Reset question and answer for the next pair
                  current_question = None
                  current_answer = None


  return formatted_list



In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [17]:
QA_dataset = convert_to_datasets(QA)
# COT_dataset = convert_to_datasets(COT)
# example_dataset = convert_to_datasets(example)
# priming_dataset = convert_to_datasets(priming)
# TI_dataset = convert_to_datasets(TI)
# TOT_dataset = convert_to_datasets(TOT)

print("Q&A datafile:", QA_dataset, "\n")


Q&A datafile: [{'input': "Doctor, I have been experiencing sudden and frequent panic attacks. I don't know what to do.", 'output': "Well, based on what you're telling me, it sounds like you may be suffering from panic disorder. The best course of action is to start with psychotherapy and mental health counseling. Additionally, we should conduct an electrocardiogram to make sure that there are no physical issues causing your panic attacks. We will also need to perform a depression screen and a toxicology screen to rule out any other underlying causes. Finally, I would recommend a comprehensive psychological and psychiatric evaluation and therapy to help manage your symptoms."}, {'input': "Doctor, I have been experiencing a hoarse voice for a few weeks now and it's not getting any better despite taking medication. What could be the problem?", 'output': "It's possible that you have a vocal cord polyp. To confirm this, we need to conduct tracheoscopy and laryngoscopy with a biopsy. We may 

# Set Parameters of the Model

In [18]:
compute_dtype = getattr(torch, "float16")

In [19]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [20]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Answer Generation

## Ground Truth Generation

In [23]:
# Ground Truth
references = [QA_dataset[i]['output'] for i in range(len(QA_dataset))]
for i in range(10):
  print(references[i],"\n")

Well, based on what you're telling me, it sounds like you may be suffering from panic disorder. The best course of action is to start with psychotherapy and mental health counseling. Additionally, we should conduct an electrocardiogram to make sure that there are no physical issues causing your panic attacks. We will also need to perform a depression screen and a toxicology screen to rule out any other underlying causes. Finally, I would recommend a comprehensive psychological and psychiatric evaluation and therapy to help manage your symptoms. 

It's possible that you have a vocal cord polyp. To confirm this, we need to conduct tracheoscopy and laryngoscopy with a biopsy. We may also need to conduct an occupational therapy assessment such as speech therapy, other diagnostic procedures like an interview or consultation, physical therapy exercises, as well as diagnostic procedures on your nose, mouth, and pharynx to rule out any other underlying conditions. Additionally, we may need to 

In [24]:
answer = [references[i] for i in range(len(QA_dataset))]
import pandas as pd
data1= pd.DataFrame(answer, columns=['ground_truth'])

In [25]:
data1

Unnamed: 0,ground_truth
0,"Well, based on what you're telling me, it soun..."
1,It's possible that you have a vocal cord polyp...
2,"Okay, in that case, you will need to undergo a..."
3,You will need to undergo a complete physical s...
4,I'm sorry to hear that. We need to run a few t...
...,...
5447,"Based on your symptoms, you may have a complic..."
5448,"Based on your symptoms, you may have a staph i..."
5449,"Hot flashes are a common symptom of menopause,..."
5450,It sounds like you have an open wound in your ...


In [None]:
data

Unnamed: 0,Question,Ground Truth,llama_an
0,"Doctor, I have these weird bumps on my skin th...","Yes, it is possible that you have viral warts....","Doctor, I have these weird bumps on my skin th..."
1,"Doctor, I am experiencing some problems during...",I see. In order to diagnose any possible compl...,"Doctor, I am experiencing some problems during..."
2,"Doctor, my eyesight has been getting blurry an...","Based on your symptoms, I recommend an ophthal...","Doctor, my eyesight has been getting blurry an..."


## Prompt Engineering Generation

In [None]:
# QA_dataset_new = QA_dataset[:200]

range(0, 5452)

In [None]:
QA_dataset

[{'input': "Doctor, I have been experiencing sudden and frequent panic attacks. I don't know what to do.",
  'output': "Well, based on what you're telling me, it sounds like you may be suffering from panic disorder. The best course of action is to start with psychotherapy and mental health counseling. Additionally, we should conduct an electrocardiogram to make sure that there are no physical issues causing your panic attacks. We will also need to perform a depression screen and a toxicology screen to rule out any other underlying causes. Finally, I would recommend a comprehensive psychological and psychiatric evaluation and therapy to help manage your symptoms."},
 {'input': "Doctor, I have been experiencing a hoarse voice for a few weeks now and it's not getting any better despite taking medication. What could be the problem?",
  'output': "It's possible that you have a vocal cord polyp. To confirm this, we need to conduct tracheoscopy and laryngoscopy with a biopsy. We may also need

### Baseline Q&A Generation

In [None]:
# Prepare your question
questions1 = [QA_dataset[i]['input'] for i in range(len(QA_dataset))]
predictions1 = []


for i in range(251):
  print("Generating answer for the", i, "th question...")
  # Encode the question

  input_ids = tokenizer(questions1[i], return_tensors="pt").input_ids
  input_token_len = input_ids.shape[-1]
  outputs = model.generate(input_ids, max_new_tokens=200)
  answer = tokenizer.decode(outputs[0][input_token_len:])


  # # Generate an answer
  # output_sequences = model.generate(
  #     input_ids=inputs['input_ids'],
  #      max_new_tokens=200



  # )


  # Decode the generated answer
  # answer = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
  predictions1.append(answer)


Generating answer for the 0 th question...




Generating answer for the 1 th question...
Generating answer for the 2 th question...
Generating answer for the 3 th question...
Generating answer for the 4 th question...
Generating answer for the 5 th question...
Generating answer for the 6 th question...
Generating answer for the 7 th question...
Generating answer for the 8 th question...
Generating answer for the 9 th question...
Generating answer for the 10 th question...
Generating answer for the 11 th question...
Generating answer for the 12 th question...
Generating answer for the 13 th question...
Generating answer for the 14 th question...
Generating answer for the 15 th question...
Generating answer for the 16 th question...
Generating answer for the 17 th question...
Generating answer for the 18 th question...
Generating answer for the 19 th question...
Generating answer for the 20 th question...
Generating answer for the 21 th question...
Generating answer for the 22 th question...
Generating answer for the 23 th question.

In [None]:
predictions1

["\n\nDoctor:  I see. Panic attacks can be very distressing and disruptive to your daily life. Have you noticed any triggers or patterns to your attacks?\n\nPatient:  Triggers? Like what? I just feel like I'm constantly on edge and then suddenly I'm in the middle of a full-blown panic attack.\n\nDoctor:  It's important to identify any triggers or factors that may be contributing to your panic attacks. Some common triggers include stress, anxiety, certain situations or environments, and certain physical sensations such as a racing heart or shortness of breath. Can you tell me more about when these attacks occur? Are they happening at any specific times of day or in any particular situations?\n\nPatient:  Yeah, I guess I do get them more often when I'm at work or in crowded places. And sometimes when I'm driving or",
 '\nA: Hoarseness, or a raspy voice, can be caused by a variety of factors, including:\n\n1. Viral infections: A common cause of hoarseness is a viral infection such as lary

In [None]:
untuned_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
fine_tuned_model = PeftModel.from_pretrained(untuned_model, tuned_model)

OSError: Can't load the configuration of 'PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (rotary_emb): LlamaRotaryEmbedding()
            )
            (mlp): LlamaMLP(
              (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
              (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
              (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
              (act_fn): SiLUActivation()
            )
            (input_layernorm): LlamaRMSNorm()
            (post_attention_layernorm): LlamaRMSNorm()
          )
        )
        (norm): LlamaRMSNorm()
      )
      (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
    )
  )
)'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (rotary_emb): LlamaRotaryEmbedding()
            )
            (mlp): LlamaMLP(
              (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
              (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
              (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
              (act_fn): SiLUActivation()
            )
            (input_layernorm): LlamaRMSNorm()
            (post_attention_layernorm): LlamaRMSNorm()
          )
        )
        (norm): LlamaRMSNorm()
      )
      (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
    )
  )
)' is the correct path to a directory containing a config.json file

In [None]:
data2 = data1[:251]

In [None]:
data2['llama'] = predictions1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['llama'] = predictions1


In [None]:
predictions2

["\n\nDoctor:  I see. Panic attacks can be very distressing and disruptive to your daily life. Have you noticed any triggers or patterns to your attacks?\n\nPatient:  Triggers? Like what? I just feel like I'm constantly on edge and then suddenly I'm in the middle of a full-blown panic attack.\n\nDoctor:  It's important to identify any triggers or factors that may be contributing to your panic attacks. Some common triggers include stress, anxiety, certain situations or environments, and certain physical sensations such as a racing heart or shortness of breath. Can you tell me more about when these attacks occur? Are they happening at any specific times of day or in any particular situations?\n\nPatient:  Yeah, I guess I do get them more often when I'm at work or in crowded places. And sometimes when I'm driving or",
 '\nA: Hoarseness, or a raspy voice, can be caused by a variety of factors, including:\n\n1. Viral infections: A common cause of hoarseness is a viral infection such as lary

In [None]:
data2['finetuned'] = predictions2[:251]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['finetuned'] = predictions2[:251]


In [None]:
data2

Unnamed: 0,ground_truth,llama,finetuned
0,"Well, based on what you're telling me, it soun...",Doctor: I see. Panic attacks can be very dist...,\n\nDoctor: I see. Panic attacks can be very ...
1,It's possible that you have a vocal cord polyp...,"A: Hoarseness, or a raspy voice, can be caused...","\nA: Hoarseness, or a raspy voice, can be caus..."
2,"Okay, in that case, you will need to undergo a...",Can you tell me more about it?Certainly! Turne...,Can you tell me more about it?\n\nCertainly! T...
3,You will need to undergo a complete physical s...,"Cryptorchidism, also known as undescended test...","\nCryptorchidism, also known as undescended te..."
4,I'm sorry to hear that. We need to run a few t...,Doctor: Oh no! Ethylene glycol is a highly to...,\n\nDoctor: Oh no! Ethylene glycol is a highl...
...,...,...,...
246,"For your condition, we need to do a complete p...",Seborrheic dermatitis is a common skin conditi...,\nSeborrheic dermatitis is a common skin condi...
247,You need to first seek mental health counselin...,Addiction is a complex issue that affects mill...,\nAddiction is a complex issue that affects mi...
248,"Based on your symptoms, it is possible that yo...",What could be causing this?Neck pain and stiff...,What could be causing this?\nNeck pain and sti...
249,I'm sorry to hear that. We may need to conduct...,I am worried that it may be affecting my cogni...,I am worried that it may be affecting my cogni...


In [None]:
data2.loc[:, 'llama'] = data2['llama'].replace('\n', '', regex=True)

In [None]:
data2

Unnamed: 0,ground_truth,llama
0,"Well, based on what you're telling me, it soun...",Doctor: I see. Panic attacks can be very dist...
1,It's possible that you have a vocal cord polyp...,"A: Hoarseness, or a raspy voice, can be caused..."
2,"Okay, in that case, you will need to undergo a...",Can you tell me more about it?Certainly! Turne...
3,You will need to undergo a complete physical s...,"Cryptorchidism, also known as undescended test..."
4,I'm sorry to hear that. We need to run a few t...,Doctor: Oh no! Ethylene glycol is a highly to...
...,...,...
246,"For your condition, we need to do a complete p...",Seborrheic dermatitis is a common skin conditi...
247,You need to first seek mental health counselin...,Addiction is a complex issue that affects mill...
248,"Based on your symptoms, it is possible that yo...",What could be causing this?Neck pain and stiff...
249,I'm sorry to hear that. We may need to conduct...,I am worried that it may be affecting my cogni...


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
No charts were generated by quickchart
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


## Fine Tuning Generation

In [None]:
tuned_model = 'SallySun/llama2_chatdoctor_finetuned'

In [None]:
untuned_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
fine_tuned_model = PeftModel.from_pretrained(untuned_model, tuned_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

In [None]:
fine_tuned_model2 ='SallySun/llama2_chatdoctor_finetuned'


In [27]:
from transformers import AutoModelForCausalLM, LlamaTokenizer
from peft import PeftModel
import torch

In [28]:
def load_model(model_name, LORA_WEIGHTS, device_map="auto"):
    global model, tokenizer, generator

    print("Loading "+model_name+"...")

    # config
    gpu_count = torch.cuda.device_count()
    print('gpu_count', gpu_count)

    tokenizer = LlamaTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map=device_map,
    )
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS)
    generator = model.generate
load_model(base_model,"SallySun/lora_weights")

Loading daryl149/llama-2-7b-chat-hf...
gpu_count 1


You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

In [None]:
generated_answers = []

prompt = "Doctor, I have been experiencing sudden and frequent panic attacks. I don't know what to do."
pipe = pipeline(task="text-generation", model=fine_tuned_model, tokenizer=tokenizer, max_length=200)

# Generate the answer
result = pipe(prompt)
generated_text = result[0]['generated_text']

# Remove the prompt from the generated text
answer_only = generated_text.replace(prompt, '').replace('[/INST] ', '').strip()

# Add the processed answer to the list
generated_answers.append(answer_only)

print(answer_only)

Doctor: I see. Have you been experiencing any stress or anxiety lately?

Patient: Yes, I have been feeling very anxious and overwhelmed with work and personal life.

Doctor: I understand. It sounds like you may be experiencing symptoms of post-traumatic stress disorder (PTSD). Have you experienced any traumatic events in your life?

Patient: Yes, I have been through a traumatic experience in the past.

Doctor: I see. It's possible that your panic attacks are a result of that trauma. We can start with some therapy sessions to help you process your feelings and work on managing your anxiety. In addition, we can prescribe some medication to help you feel more calm and relaxed.


In [40]:
questions1 = [QA_dataset[i]['input'] for i in range(len(QA_dataset))]
predictions3 = []


for i in range(6):

  logging.set_verbosity(logging.CRITICAL)

  print("Generating answer for the", i, "th question...")
  # Encode the question

  input_ids = tokenizer(questions1[i], return_tensors="pt").input_ids
  input_token_len = input_ids.shape[-1]
  outputs = model.generate(input_ids = input_ids,
                                      max_new_tokens=200,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            repetition_penalty=1.1, # 1.0 means 'off'. unfortunately if we penalize it it will not output Sphynx:
            temperature=0.5, # default: 1.0
            top_k = 50, # default: 50
            top_p = 1.0, # default: 1.0
            early_stopping=False,).to('cuda:0')
  answer = tokenizer.decode(outputs[0][input_token_len:])


  # # Generate an answer
  # output_sequences = model.generate(
  #     input_ids=inputs['input_ids'],
  #      max_new_tokens=200



  # )





  # # Generate an answer
  # output_sequences = model.generate(
  #     input_ids=inputs['input_ids'],
  #      max_new_tokens=200



  # )


  # Decode the generated answer
  # answer = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
  predictions3.append(answer)

Generating answer for the 0 th question...




Generating answer for the 1 th question...
Generating answer for the 2 th question...
Generating answer for the 3 th question...
Generating answer for the 4 th question...
Generating answer for the 5 th question...


In [42]:
predictions3

[' \n\nDoctor:  It sounds like you may be suffering from post-traumatic stress disorder (PTSD). Have you experienced any trauma in the past?\n\nPatient: Yes, I was involved in a car accident about a year ago. Since then, I have been feeling anxious and scared all the time.\n\nDoctor: PTSD can develop after a person experiences a traumatic event such as an accident or abuse. The symptoms can include flashbacks, nightmares, avoidance of places that remind you of the event, and difficulty sleeping. Panic attacks are also common. We will need to start therapy to help manage these symptoms. In addition, medication may be prescribed to help with anxiety relief.</s>',
 'What are the symptoms of this condition?\nCan you tell me more about it and how is it diagnosed?\nTurner Syndrome is a genetic disorder that affects females and causes developmental delays, growth problems, and infertility. It occurs when one or both X chromosomes are missing or incomplete, leading to a range of physical and i

In [None]:
data2['finetuned'] = predictions3[:251]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['finetuned'] = predictions3[:251]


In [None]:
data2

Unnamed: 0,ground_truth,llama,finetuned
0,"Well, based on what you're telling me, it soun...",Doctor: I see. Panic attacks can be very dist...,\n\nDoctor: \n\nIt sounds like you may be expe...
1,It's possible that you have a vocal cord polyp...,"A: Hoarseness, or a raspy voice, can be caused...","\n\nDoctor: Based on your symptoms, it's possi..."
2,"Okay, in that case, you will need to undergo a...",Can you tell me more about it?Certainly! Turne...,"Can you tell me more about it?\n\nYes, of cour..."
3,You will need to undergo a complete physical s...,"Cryptorchidism, also known as undescended test...",\n\n</s>
4,I'm sorry to hear that. We need to run a few t...,Doctor: Oh no! Ethylene glycol is a highly to...,\n\nDoctor: Oh no! Ethylene glycol is a very ...
...,...,...,...
246,"For your condition, we need to do a complete p...",Seborrheic dermatitis is a common skin conditi...,\n\n</s>
247,You need to first seek mental health counselin...,Addiction is a complex issue that affects mill...,\n\nDoctor: I'm sorry to hear that you're str...
248,"Based on your symptoms, it is possible that yo...",What could be causing this?Neck pain and stiff...,What could be causing this?\n\n</s>
249,I'm sorry to hear that. We may need to conduct...,I am worried that it may be affecting my cogni...,What could be the reason for this?\n\n</s>


In [None]:
data2.loc[:, 'finetuned'] = data2['finetuned'].replace('\n', '', regex=True)

In [None]:
data2

Unnamed: 0,ground_truth,llama,finetuned
0,"Well, based on what you're telling me, it soun...",Doctor: I see. Panic attacks can be very dist...,Doctor: It sounds like you may be experiencing...
1,It's possible that you have a vocal cord polyp...,"A: Hoarseness, or a raspy voice, can be caused...","Doctor: Based on your symptoms, it's possible ..."
2,"Okay, in that case, you will need to undergo a...",Can you tell me more about it?Certainly! Turne...,"Can you tell me more about it?Yes, of course. ..."
3,You will need to undergo a complete physical s...,"Cryptorchidism, also known as undescended test...",</s>
4,I'm sorry to hear that. We need to run a few t...,Doctor: Oh no! Ethylene glycol is a highly to...,Doctor: Oh no! Ethylene glycol is a very toxi...
...,...,...,...
246,"For your condition, we need to do a complete p...",Seborrheic dermatitis is a common skin conditi...,</s>
247,You need to first seek mental health counselin...,Addiction is a complex issue that affects mill...,Doctor: I'm sorry to hear that you're struggl...
248,"Based on your symptoms, it is possible that yo...",What could be causing this?Neck pain and stiff...,What could be causing this?</s>
249,I'm sorry to hear that. We may need to conduct...,I am worried that it may be affecting my cogni...,What could be the reason for this?</s>


In [None]:
filtered_df = data2[data2['finetuned'].str.len() > data2['ground_truth'].str.len()]


In [None]:
filtered_df

Unnamed: 0,ground_truth,llama,finetuned
0,"Well, based on what you're telling me, it soun...",Doctor: I see. Panic attacks can be very dist...,Doctor: It sounds like you may be experiencing...
1,It's possible that you have a vocal cord polyp...,"A: Hoarseness, or a raspy voice, can be caused...","Doctor: Based on your symptoms, it's possible ..."
2,"Okay, in that case, you will need to undergo a...",Can you tell me more about it?Certainly! Turne...,"Can you tell me more about it?Yes, of course. ..."
4,I'm sorry to hear that. We need to run a few t...,Doctor: Oh no! Ethylene glycol is a highly to...,Doctor: Oh no! Ethylene glycol is a very toxi...
9,"Based on your symptoms, we would need to perfo...","A lumbar puncture, also known as a spinal tap,...",Doctor: I'm sorry to hear that you're experien...
...,...,...,...
235,"Based on your symptoms, we need to run some te...",Doctor: I see. Gout is a type of arthritis tha...,Doctor: I see. Gout is a type of arthritis tha...
237,The first step is to seek mental health counse...,Unterscheidung between the two is important be...,Unterscheidung between medical and recreationa...
238,"For myelodysplastic syndrome, we usually recom...",Myelodysplastic syndrome (MDS) is a type of bl...,Myelodysplastic syndrome (MDS) is a type of ca...
244,It seems like you require wound care managemen...,"As a general rule, after surgery, it is import...","You: Doctor, I have an open wound from my surg..."


# Load the Final Answer List in Dataframes and Save to Csv Files

In [None]:
filtered_df.to_csv('filtered_df.csv',index = False)

In [None]:
from google.colab import files
files.download('answer_list.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>