In [2]:
import datasets
import pandas as pd

from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

  from .autonotebook import tqdm as notebook_tqdm


## Init Dataset

In [3]:
# Empty accumulator for the mined pairs: one row per kept question/answer.
qa_columns = ['question', 'answer']
df = pd.DataFrame(columns=qa_columns)
df

Unnamed: 0,question,answer


## Mine Data

In [4]:
# Load every available split of the MedMCQA dataset by its hub identifier.
medmcqa = datasets.load_dataset(path='openlifescienceai/medmcqa')
medmcqa

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 182822
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 6150
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 4183
    })
})

In [5]:
# Materialise each split as its own DataFrame (same order as before:
# train, test, validation) ...
medmcqa_train_df, medmcqa_test_df, medmcqa_validation_df = (
  pd.DataFrame(medmcqa[split_name])
  for split_name in ('train', 'test', 'validation')
)

# ... then stack them into a single frame with a fresh 0..n-1 index.
medmcqa_df = pd.concat(
  [medmcqa_train_df, medmcqa_test_df, medmcqa_validation_df],
  ignore_index=True,
)
medmcqa_df

Unnamed: 0,id,question,opa,opb,opc,opd,cop,choice_type,exp,subject_name,topic_name
0,e9ad821a-c438-4965-9f77-760819dfa155,Chronic urethral obstruction due to benign pri...,Hyperplasia,Hyperophy,Atrophy,Dyplasia,2,single,Chronic urethral obstruction because of urinar...,Anatomy,Urinary tract
1,e3d3c4e1-4fb2-45e7-9f88-247cc8f373b3,Which vitamin is supplied from only animal sou...,Vitamin C,Vitamin B7,Vitamin B12,Vitamin D,2,single,Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. ...,Biochemistry,Vitamins and Minerals
2,5c38bea6-787a-44a9-b2df-88f4218ab914,All of the following are surgical options for ...,Adjustable gastric banding,Biliopancreatic diversion,Duodenal Switch,Roux en Y Duodenal By pass,3,multi,"Ans. is 'd' i.e., Roux en Y Duodenal Bypass Ba...",Surgery,Surgical Treatment Obesity
3,cdeedb04-fbe9-432c-937c-d53ac24475de,Following endaerectomy on the right common car...,Central aery of the retina,Infraorbital aery,Lacrimal aery,Nasociliary aretry,0,multi,The central aery of the retina is a branch of ...,Ophthalmology,
4,dc6794a3-b108-47c5-8b1b-3b4931577249,Growth hormone has its effect on growth through?,Directly,IG1-1,Thyroxine,Intranuclear receptors,1,single,"Ans. is 'b' i.e., IGI-1GH has two major functi...",Physiology,
...,...,...,...,...,...,...,...,...,...,...,...
193150,51234bc7-9170-45cc-8494-837fe23af441,A study is to be conducted with regards to the...,Case control,Prospective coho,Longitudinal study,Ambispective,0,single,Ans. A. Since there is a comparison between fa...,Social & Preventive Medicine,
193151,b0a05a95-cb2d-4ec4-9e19-c25742e18221,APGAR acronym stands for?,"Activity, pulse pressure, grimace, appearance,...","Appearance, pressure, grimace, MAP, hea rate","Appearance, pressure, grimace, appearance, rat...","Appearance, pulse, grimace, activity, respiration",3,single,0 (Points) 1 2 Appearance Blue or pale all ove...,Pediatrics,AIIMS 2017
193152,f07a7a11-ae84-4843-a8a2-9f77447fb954,Most commonly implicated drug for acute liver ...,Paracetamol,Valproate,Warfarin,Tetracyclines,0,single,Ans. A. Paracetamol Acute liver failure after ...,Pharmacology,
193153,5a5dcbce-041f-45ee-8a1d-2ec6b23d5f82,A 9 year old boy has steroid dependent nephrot...,Longterm frusemide with enalapril,Cyclophosphamide,Intravenous immunoglobulin,Intravenous pulse corticosteroids,1,multi,If a steroid dependent patient develops severe...,Pediatrics,


In [6]:
# Ollama-served llama3 model used as the classifier; temperature is pinned
# to 0 (presumably to keep the true/false decisions as repeatable as possible).
llm = Ollama(model='llama3', temperature=0)

In [7]:
# Single-field schema describing the classifier's verdict.
# `type='boolean'` matters: ResponseSchema defaults to type "string", which
# makes the generated format instructions ask the model for a string. The
# model then tends to answer with "true"/"false" in quotes, and any non-empty
# string (including "false") is truthy when tested with `if`.
is_neurobiology_question_schema = ResponseSchema(
  name='is_neurobiology_question',
  description='Is this question related to neurobiology? Answer true if yes, false if not or unknown.',
  type='boolean',
)

response_schemas = [is_neurobiology_question_schema]

# Parser that both produces the format instructions for the prompt and
# turns the model's JSON reply into a dict keyed by schema name.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [8]:
# Prompt body. `{format_instructions}` is filled in once below from the
# output parser; `{question}` and `{answer}` are supplied on every invocation.
template_string = """
Given the following question and its answer, determine if it's related to neurobiology.
Answer true if yes, false if not or unknown. And nothing more.

{format_instructions}

question: {question}
answer: {answer}
"""

# Text telling the model which JSON shape the parser expects back.
format_instructions = output_parser.get_format_instructions()

prompt = PromptTemplate(
  input_variables=['question', 'answer'],
  partial_variables={'format_instructions': format_instructions},
  template=template_string,
)

In [9]:
def json_fix_parser(response):
  """Return `response` with Markdown code-fence markers removed.

  Every '```json' marker is dropped first, then every remaining '```'
  marker; all other text is left untouched.
  """
  # Order matters: the longer marker has to go before the bare fence.
  for fence_marker in ('```json', '```'):
    response = response.replace(fence_marker, '')
  return response

In [10]:
# Classification pipeline, left to right: fill the prompt, send it to the
# LLM, strip Markdown code fences from the reply, then parse it with the
# structured output parser.
chain = prompt | llm | json_fix_parser | output_parser

In [11]:
# Maps a `cop` (correct option) index to the column holding that option's text.
index_to_column = dict(enumerate(('opa', 'opb', 'opc', 'opd')))

def extract_qa_from_medmcqa(row):
  """Pull the question and the text of its correct option out of a MedMCQA row.

  Args:
    row: mapping-like record with 'question', 'cop' and the option columns
      'opa'..'opd'.

  Returns:
    A `(question, answer)` tuple. `answer` is None when `row['cop']` is not
    one of the indices in `index_to_column`.
  """
  option_column = index_to_column.get(row['cop'])
  answer = None if option_column is None else row[option_column]
  return row['question'], answer

In [12]:
def _is_affirmative(value):
  """Interpret the parsed `is_neurobiology_question` value as a real boolean.

  The parser may hand back a JSON boolean or a string such as "false". A
  non-empty string is always truthy in Python, so strings are compared
  against the accepted affirmative spellings instead of being tested directly.
  """
  if isinstance(value, str):
    return value.strip().lower() in ('true', 'yes')
  return bool(value)


def _with_matches(base_df, records):
  """Return `base_df` extended by `records` (a list of question/answer dicts)."""
  if not records:
    return base_df
  if base_df.empty:
    # Skip concatenating with an empty frame; reuse its column layout instead.
    return pd.DataFrame(records, columns=base_df.columns)
  return pd.concat([base_df, pd.DataFrame(records)], ignore_index=True)


# Matches are collected in a plain list and turned into a DataFrame only when
# needed; concatenating onto `df` for every single match is quadratic.
matched_rows = []
# Row indices whose classification raised, so failures are visible afterwards.
failed_indices = []

for index, row in medmcqa_df.iterrows():
  question, answer = extract_qa_from_medmcqa(row)

  # Rows without a usable correct option cannot form a question/answer pair.
  if answer is None:
    continue

  df_row = {'question': question, 'answer': answer}

  try:
    response = chain.invoke(df_row)
    if _is_affirmative(response['is_neurobiology_question']):
      matched_rows.append(df_row)
  except Exception:
    # Best effort: a failed LLM call or an unparsable reply skips this row.
    # `Exception` (not a bare except) keeps KeyboardInterrupt working so the
    # long-running loop can still be stopped by hand.
    failed_indices.append(index)

  # Periodic checkpoint so an interrupted run still leaves partial results.
  if index % 100 == 0:
    _with_matches(df, matched_rows).to_csv('medmcqa.csv', index=False)

df = _with_matches(df, matched_rows)
# Final save: without it, matches found after the last checkpoint are lost.
df.to_csv('medmcqa.csv', index=False)

if failed_indices:
  print(f'{len(failed_indices)} rows could not be classified and were skipped')

df