In [1]:
import datasets
import pandas as pd

from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

  from .autonotebook import tqdm as notebook_tqdm


## Init Dataset

In [2]:
df = pd.DataFrame(columns=['question', 'answer'])
df

Unnamed: 0,question,answer


## Mine Data

In [3]:
mediqa = datasets.load_dataset('bigbio/mediqa_qa')
mediqa

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train_live_qa_med: Dataset({
        features: ['QUESTION'],
        num_rows: 104
    })
    train_alexa: Dataset({
        features: ['QUESTION'],
        num_rows: 104
    })
    validation: Dataset({
        features: ['QUESTION'],
        num_rows: 25
    })
    test: Dataset({
        features: ['QUESTION'],
        num_rows: 150
    })
})

In [4]:
question_dicts = pd.DataFrame(mediqa['train_live_qa_med'])['QUESTION'].tolist() \
  + pd.DataFrame(mediqa['train_alexa'])['QUESTION'].tolist() \
  + pd.DataFrame(mediqa['validation'])['QUESTION'].tolist()

mediqa_df = pd.DataFrame(columns=['question', 'answer'])

for question_dict in question_dicts:
  question = question_dict['QuestionText']
  answer = None

  for answer_dict in question_dict['AnswerList']:
    # https://github.com/abachaa/MEDIQA2019/blob/master/MEDIQA_Task3_QA/Task3_README.txt
    if answer_dict['Answer']['ReferenceScore'] == 4:
      answer = answer_dict['Answer']['AnswerText']
      break

  if answer == None:
    continue

  row = {'question': question, 'answer': answer}
  mediqa_df = pd.concat([mediqa_df, pd.DataFrame([row])], ignore_index=True)

mediqa_df

Unnamed: 0,question,answer
0,Noonan syndrome. What are the references with ...,Noonan syndrome: Noonan syndrome is a genetic ...
1,vdrl positive. vdrl positive patients please t...,VDRL test (What Abnormal Results Mean): A posi...
2,"Macular Degeneration. I am a non-smoker , reti...",Dry macular degeneration: Dry macular degenera...
3,molar pregnancy.. is conception a requirement ...,Molar pregnancy: A molar pregnancy - also know...
4,vasculitis. Yes my wife has been dianosed with...,Vasculitis (Treatment): Treatment of vasculiti...
...,...,...
158,I want more information on Hypertension and fi...,Fibromyalgia: - Fibromyalgia is a long-lasting...
159,Jaundice. Can older infants get jaundice? What...,Newborn jaundice - what to ask your doctor: Ne...
160,latest information. WANT TO KNOW LATEST DEVELO...,Lung Cancer: The lungs are a pair of cone-shap...
161,"lupus. Hi, I want to know about Lupus and its ...",Lupus: Lupus is an autoimmune disease that ca...


In [5]:
llm = Ollama(temperature=0, model='llama3')

In [6]:
is_neurobiology_question_schema = ResponseSchema(
  name='is_neurobiology_question',
  description='Is this question related to neurobiology? Answer true if yes, false if not or unknown.',
)

response_schemas = [is_neurobiology_question_schema]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [7]:
format_instructions = output_parser.get_format_instructions()
template_string = """
Given the following question and its answer, determine if it's related to neurobiology.
Answer true if yes, false if not or unknown. And nothing more.

{format_instructions}

question: {question}
answer: {answer}
"""

prompt = PromptTemplate(
  template=template_string,
  input_variables=['question', 'answer'],
  partial_variables={'format_instructions': format_instructions},
)

In [8]:
def json_fix_parser(response):
  response = response.replace('```json', '')
  response = response.replace('```', '')
  return response

In [9]:
chain = prompt | llm | json_fix_parser | output_parser

In [10]:
index_to_column = {
  0: 'opa',
  1: 'opb',
  2: 'opc',
  3: 'opd',
}

def extract_qa_from_medmcqa(row):
  question = row['question']
  correct_answer_index = row['cop']

  if correct_answer_index in index_to_column:
    answer = row[index_to_column[correct_answer_index]]
  else:
    answer = None

  return question, answer

In [11]:
for index, row in mediqa_df.iterrows():
  try:
    df_row = {'question': row['question'], 'answer': row['answer']}
    response = chain.invoke(df_row)

    if response['is_neurobiology_question']:
      df = pd.concat([df, pd.DataFrame([df_row])], ignore_index=True)
  except Exception as e:
    print(e)
    pass

df

Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)
Got invalid JSON object. Error: Expect

Unnamed: 0,question,answer
0,SSPE. My son is 33years of age and did not hav...,Subacute sclerosing panencephalitis: Subacute ...
1,Homozygout MTHFR A1298C Health Issues and long...,MTHFR gene variant (Inheritance): Because each...
2,What is Stroke?,Stroke: A stroke occurs when the blood supply ...
3,What causes Stroke?,Ischemic Stroke (Summary): Summary A stroke is...
4,What are the symptoms of Stroke?,What are the symptoms of Stroke?: The signs an...
5,What are the treatments of Stroke?,Stroke (Treatment): A stroke is a medical emer...
6,What is Dementia?,Dementia (WHAT IS DEMENTIA?): Dementia is the ...
7,What causes Dementia?,What causes Dementia?: Dementia usually occurs...
8,What are the symptoms of Dementia?,Dementia (Symptoms): Dementia symptoms include...
9,How to diagnose Dementia?,Dementia (Diagnosis): Diagnosing dementia and ...


In [12]:
df.to_csv('mediqa.csv', index=False)