In [1]:
from ast import literal_eval
import json

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm

In [2]:
# Register tqdm's pandas integration; the `progress_apply` call used later in
# this notebook relies on it to show a progress bar.
tqdm.pandas()

# Load the data and drop malformed rows

In [3]:
# Load the raw HealthTap question/answer export.
df = pd.read_csv('/kaggle/input/healthtap-qa-pairs/healthtap_medical_qna_dataset_1.6m.csv')

# Rows with a missing question or missing answers cannot be used.
df = df.dropna(subset=['answers', 'question'])

# Keep only rows whose `answers` text contains both "[" and "{" (it is later
# parsed with `literal_eval` and iterated as a list of dicts).
# The brackets are matched as literal substrings (regex=False) rather than via
# the patterns "\[" / "\{": those are invalid escape sequences in a normal
# Python string literal and emit a warning on modern Python versions.
# na=False treats any non-string cell as "no match" so it is dropped as well.
has_list_bracket = df['answers'].str.contains('[', regex=False, na=False)
has_dict_brace = df['answers'].str.contains('{', regex=False, na=False)
df = df[has_list_bracket & has_dict_brace]

print(df.shape)
df.head()

(1600150, 7)


Unnamed: 0.1,Unnamed: 0,answers,main_category,question,question_url,related_topics,sub_category
0,0,"[{'doctor_name': 'Dr. Glynis Ablon', 'doctor_p...",vagina,Like 5 yrs ago to now I've had some reoccurrin...,/user_questions/1578652-like-5-yrs-ago-to-now-...,"['Does herpes cause burning?', 'Itching and bu...",vagina burning sensation
1,1,"[{'doctor_name': 'Dr. Charlene Sojico', 'docto...",vagina,My 6 year old daughter complains about vaginal...,/user_questions/812576-my-6-year-old-daughter-...,"['Vaginal itching', 'Convulsion', 'Female heal...",vagina burning sensation
2,2,[{'doctor_name': 'Dr. Pierrette Mimi Poinsett'...,vagina,My Friend is experiencing irritation and she s...,/user_questions/6976359-my-friend-is-experienc...,"['Vaginal irritation', 'Vaginitis', 'Female he...",vagina burning sensation
3,3,"[{'doctor_name': 'Dr. Hunter Handsfield', 'doc...",vagina,"My VG is itchy and feels irritated, no pain, n...",/user_questions/6731827-my-vg-is-itchy-and-fee...,"['Candida die off itching', 'Burning itching b...",vagina burning sensation
4,4,"[{'doctor_name': 'Dr. David Drewitz', 'doctor_...",vagina,My anus has been itchy and burning. Went away ...,/user_questions/759694-my-anus-has-been-itchy-...,"['Blood after a bowel movement', 'Blood and mu...",vagina burning sensation


### Evaluate the answers to get the actual lists

In [4]:
# `answers` still holds the textual form of each list; turn every cell into the
# actual Python object with the literal evaluator, showing a progress bar.
df['answers'] = df['answers'].progress_apply(literal_eval)

HBox(children=(FloatProgress(value=0.0, max=1600150.0), HTML(value='')))




### Sample negative example within sub_category, within main_category, and anything

In [5]:
# Lookup table: sub_category -> main_category.
# When a sub_category occurs under more than one main_category, the pair that
# appears last in `df` wins.
category_pairs = df[['main_category', 'sub_category']].drop_duplicates()
sub2main = dict(zip(category_pairs['sub_category'], category_pairs['main_category']))
len(sub2main)

5048

In [6]:
# Flatten the data to one record per (question, single answer).
# The first field is the question's 0-based position in `df`, so every answer
# to the same question shares that id.
question_rows = df[['question', 'answers', 'sub_category', 'main_category']].values

qa_pairs = []
for position, row in enumerate(tqdm(question_rows)):
    question, answers, sub, main = row
    # Entries that are not plain dicts are skipped.
    qa_pairs.extend(
        [position, question, entry['answer'], sub, entry['doctor_name'], main]
        for entry in answers
        if type(entry) is dict
    )

HBox(children=(FloatProgress(value=0.0, max=1600150.0), HTML(value='')))




In [7]:
# Column order mirrors the order of the fields in each `qa_pairs` record.
qa_columns = ['qid', 'question', 'answer', 'sub_category', 'doctor_name', 'main_category']
qa_df = pd.DataFrame(data=qa_pairs, columns=qa_columns)
qa_df.head()

Unnamed: 0,qid,question,answer,sub_category,doctor_name,main_category
0,0,Like 5 yrs ago to now I've had some reoccurrin...,test to see if you are a carrier of herpes sim...,vagina burning sensation,Dr. Glynis Ablon,vagina
1,1,My 6 year old daughter complains about vaginal...,Make sure she is not. Wearing tight fitting un...,vagina burning sensation,Dr. Charlene Sojico,vagina
2,1,My 6 year old daughter complains about vaginal...,Take her to a Gyn. Gynecologists are trained t...,vagina burning sensation,Dr. Melissa Moffitt,vagina
3,2,My Friend is experiencing irritation and she s...,Followup with doc. Some women use yoghurt as a...,vagina burning sensation,Dr. Pierrette Mimi Poinsett,vagina
4,2,My Friend is experiencing irritation and she s...,Worrisome. Sounds like a self treatment for va...,vagina burning sensation,Dr. James Ferguson,vagina


## Drop the extremely rare category

In [8]:
# Show the least frequent main categories (value_counts sorts descending,
# so the tail holds the rarest ones).
main_category_counts = qa_df['main_category'].value_counts()
main_category_counts.tail()

nausea             97
weigh              67
tone               65
hypogonadism       63
iniencephaly sp     3
Name: main_category, dtype: int64

In [9]:
# Remove the rows of the single category that has only a handful of QA pairs.
is_rare_category = qa_df['main_category'] == 'iniencephaly sp'
qa_df = qa_df[~is_rare_category]

## Create negative samples

For each QA pair, two negative samples are drawn from the other questions in the same sub-category. If fewer than two other questions are available there, they are drawn from the same main category instead.

In [10]:
# One row per QA pair. Despite its name this frame is NOT de-duplicated: the
# sampling loop iterates over it so that every row of `qa_df` receives its own
# pair of negative samples.
unique_qids = qa_df[['qid', 'sub_category', 'main_category']]

# De-duplicate once and reuse the result for both groupings, instead of
# scanning the full frame twice.
distinct_questions = unique_qids.drop_duplicates()

# category value -> sub-frame of the distinct rows belonging to that category
main_map = dict(list(distinct_questions.groupby("main_category")))
sub_map = dict(list(distinct_questions.groupby("sub_category")))

# category value -> numpy array of the qids belonging to that category
main_qids_map = {category: group.qid.values for category, group in main_map.items()}
sub_qids_map = {category: group.qid.values for category, group in sub_map.items()}

In [11]:
np.random.seed(0)  # fixed seed so the sampled negatives are reproducible
wrong_qids = []

for qid, sub, main in tqdm(unique_qids.values):
    # Candidate negatives: every other question in the same sub-category.
    sub_pool = sub_qids_map[sub]
    others = sub_pool[sub_pool != qid]

    # Fewer than two candidates there: fall back to the whole main category.
    if len(others) < 2:
        main_pool = main_qids_map[main]
        others = main_pool[main_pool != qid]

    # Fail with a descriptive message instead of numpy's generic error when a
    # question is alone in its main category.
    if len(others) == 0:
        raise ValueError(
            "No negative candidates for qid {!r}: it is the only question in "
            "main category {!r}".format(qid, main)
        )

    # Draw two *distinct* negatives whenever at least two candidates exist.
    # Sampling with replacement is kept only for a single-candidate pool, where
    # repeating that candidate is the only way to return two values.
    wrong_qids.append(np.random.choice(others, 2, replace=len(others) < 2))

HBox(children=(FloatProgress(value=0.0, max=2287922.0), HTML(value='')))




In [12]:
# Stack the per-row samples (each holding two qids) into one 2-D array with a
# row per sample and one column per negative.
wrong_qids = np.vstack(wrong_qids)

In [13]:
# Attach the two sampled negative qids to every QA row (matched by position).
# `qa_df` is a filtered slice of the frame it was built from, so assigning
# columns onto it in place triggers pandas' SettingWithCopyWarning; `assign`
# returns a new frame with the extra columns instead.
if len(wrong_qids) != len(qa_df):
    raise ValueError(
        "wrong_qids has {} rows but qa_df has {}".format(len(wrong_qids), len(qa_df))
    )

qa_df = qa_df.assign(
    wrong_qid_1=wrong_qids[:, 0],
    wrong_qid_2=wrong_qids[:, 1],
)

In [14]:
# Persist the QA pairs together with their negative-sample ids; the row index
# is not written to the file.
output_path = 'healthtap_qa_pairs.csv'
qa_df.to_csv(output_path, index=False)