## XML to Vector DB

- This file contains the code for storing the contents of the XML dataset in a vector database and in pickle files
- It uses Chroma DB through LangChain to achieve this

### Note
* **There is no need to run this file yourself: I have already run it and created the vector database directory and the pickle files.**

In [7]:
import os
import xml.etree.ElementTree as ET
import pickle
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

## Parsing XML

- Get all the xml filepaths
- Parse each xml filepath

In [8]:
# Collect the path of every .xml file found anywhere under the MedQuAD
# dataset directory.
# os.path.join picks the separator of the running OS, so the paths are valid
# everywhere (on Windows it still yields backslash-separated paths).
# The loop names are chosen so the builtin `dir` is not shadowed.
all_files = []
for dirpath, _subdirs, filenames in os.walk('MedQuAD'):
    for filename in filenames:
        if filename.endswith('.xml'):
            all_files.append(os.path.join(dirpath, filename))
all_files

['MedQuAD\\10_MPlus_ADAM_QA\\0000001.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000002.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000003.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000004.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000005.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000006.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000007.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000008.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000009.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000010.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000011.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000012.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000013.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000014.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000015.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000016.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000017.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000018.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000019.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000020.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000021.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000022.xml',
 'MedQuAD\\10_MPlus_ADAM_QA\\0000023.xml',
 'MedQuAD\\

In [12]:
# `questions` and `answers` are not populated in this cell; they are kept
# because other cells may reference them.
questions = []
answers = []
# question text -> list of every answer text found for that question
qa_pairs = {}
# question text -> list parallel to qa_pairs[question] (see NOTE below)
tags = {}


for path in all_files:
    # Parse the XML file into an element tree
    tree = ET.parse(path)

    # Walk every question/answer pair in the file
    for pair in tree.findall('QAPairs/QAPair'):
        question_el = pair.find('Question')
        answer_el = pair.find('Answer')

        # Skip malformed pairs instead of failing on `None.text`
        if question_el is None or answer_el is None:
            continue

        q, a = question_el.text, answer_el.text

        # Skip pairs whose question or answer element carries no text
        if q is None or a is None:
            continue

        # The same question can occur more than once, so answers accumulate
        qa_pairs.setdefault(q, []).append(a)
        # NOTE(review): the answer text is stored here as well, so tags.pkl
        # ends up with the same content as qa_pairs.pkl -- confirm which
        # value was meant to be recorded as the tag.
        tags.setdefault(q, []).append(a)


# Make sure the output directory exists before writing into it
os.makedirs('pickle_files', exist_ok=True)

# Store all the question/answer pairs in pickle file
with open('pickle_files/qa_pairs.pkl', 'wb') as file:
    pickle.dump(qa_pairs, file)

with open('pickle_files/tags.pkl', 'wb') as file:
    pickle.dump(tags, file)


## Chroma DB

In [3]:
# Reload the question -> answers mapping from the pickle_files/ location
# that the parsing cell writes it to.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickle files that were produced by this notebook.
with open('pickle_files/qa_pairs.pkl', 'rb') as file:
    qa_pairs = pickle.load(file)

# Number of distinct questions
len(qa_pairs)

14979

In [4]:
# CHROMA DB
# Embedding function handed to the vector store (constructed with defaults)
embedding_model = HuggingFaceEmbeddings()

# The vector database contents are persisted in the vecdb_contents directory
vector_store = Chroma(
    collection_name='new_collection',
    embedding_function=embedding_model,
    persist_directory='vecdb_contents',
)

# One Document per question key of qa_pairs; the id is the question's
# position in the dict's iteration order, rendered as a string
docs = [
    Document(question_text, id=str(position))
    for position, question_text in enumerate(qa_pairs)
]

In [5]:
# Preview the first five documents to check their ids and page content
docs[:5]

[Document(id='0', metadata={}, page_content='What is (are) Adult Acute Lymphoblastic Leukemia ?'),
 Document(id='1', metadata={}, page_content='What are the symptoms of Adult Acute Lymphoblastic Leukemia ?'),
 Document(id='2', metadata={}, page_content='How to diagnose Adult Acute Lymphoblastic Leukemia ?'),
 Document(id='3', metadata={}, page_content='What is the outlook for Adult Acute Lymphoblastic Leukemia ?'),
 Document(id='4', metadata={}, page_content='Who is at risk for Adult Acute Lymphoblastic Leukemia? ?')]

## Add all the questions to vector DB

In [6]:
# Insert the documents in fixed-size batches rather than in a single call
# (presumably to stay under the vector store's per-call batch limit -- confirm).
# Stepping through the whole list keeps every batch at most BATCH_SIZE long
# however many documents there are; with 14979 documents this yields the
# slices [:5000], [5000:10000] and [10000:].
BATCH_SIZE = 5000
added_ids = []
for start in range(0, len(docs), BATCH_SIZE):
    added_ids = vector_store.add_documents(docs[start:start + BATCH_SIZE])

# Display the ids returned for the last batch
added_ids

['10000',
 '10001',
 '10002',
 '10003',
 '10004',
 '10005',
 '10006',
 '10007',
 '10008',
 '10009',
 '10010',
 '10011',
 '10012',
 '10013',
 '10014',
 '10015',
 '10016',
 '10017',
 '10018',
 '10019',
 '10020',
 '10021',
 '10022',
 '10023',
 '10024',
 '10025',
 '10026',
 '10027',
 '10028',
 '10029',
 '10030',
 '10031',
 '10032',
 '10033',
 '10034',
 '10035',
 '10036',
 '10037',
 '10038',
 '10039',
 '10040',
 '10041',
 '10042',
 '10043',
 '10044',
 '10045',
 '10046',
 '10047',
 '10048',
 '10049',
 '10050',
 '10051',
 '10052',
 '10053',
 '10054',
 '10055',
 '10056',
 '10057',
 '10058',
 '10059',
 '10060',
 '10061',
 '10062',
 '10063',
 '10064',
 '10065',
 '10066',
 '10067',
 '10068',
 '10069',
 '10070',
 '10071',
 '10072',
 '10073',
 '10074',
 '10075',
 '10076',
 '10077',
 '10078',
 '10079',
 '10080',
 '10081',
 '10082',
 '10083',
 '10084',
 '10085',
 '10086',
 '10087',
 '10088',
 '10089',
 '10090',
 '10091',
 '10092',
 '10093',
 '10094',
 '10095',
 '10096',
 '10097',
 '10098',
 '10099',
