IMPORTING NECESSARY DEPENDENCIES

In [1]:
import openai
import langchain
import pinecone 
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI

In [2]:
import fitz

In [3]:
import sys

In [4]:
import os
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

PREPROCESSING AND CLEANING THE PDF FILES

In [5]:
def clean_paper(filepath, output_dir):
  doc = fitz.open(filepath)

  
  text = ""
  for page in doc:
    blocks = page.get_text("blocks")  
    for block in blocks:
      
      if block[1] not in ("FIGURE", "TABLE"):
        text += block[4] + "\n"  

  
  import re
  cleaned_text = re.sub(r"\(.*?\)", "", text)  
  stop_words = stopwords.words("english")
  tokens = [word.lower() for word in re.split(r"\W+", cleaned_text) if word not in stop_words]
  lemmatizer = WordNetLemmatizer()
  stemmer = PorterStemmer() 
  stemmed_tokens = [lemmatizer.lemmatize(stemmer.stem(token)) for token in tokens]

  
  filename, _ = os.path.splitext(os.path.basename(filepath))
  output_file = os.path.join(output_dir, filename + ".txt")

  
  with open(output_file, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

  print(f"Paper processed: {filepath}")


In [6]:
paper_dir = "original papers"  
output_dir = "output"
for filename in os.listdir(paper_dir):
  if filename.endswith(".pdf"):
    filepath = os.path.join(paper_dir, filename)
    clean_paper(filepath, output_dir)

Paper processed: original papers\15_Nazneen.pdf
Paper processed: original papers\1_Ramırez-Duque_.pdf
Paper processed: original papers\22_Ouss_ASD.pdf
Paper processed: original papers\Abbas_2018.pdf
Paper processed: original papers\Abbas_2020.pdf
Paper processed: original papers\Asd_Cry_patterns.pdf
Paper processed: original papers\carpenter2020 (1).pdf
Paper processed: original papers\Dawson.pdf
Paper processed: original papers\LEE.pdf
Paper processed: original papers\Patten_Audio.pdf
Paper processed: original papers\Qiu.pdf
Paper processed: original papers\Tariq2018.pdf
Paper processed: original papers\Tariq_2019.pdf
Paper processed: original papers\Young_Behavior.pdf
Paper processed: original papers\zhao2020.pdf


#CONCATENATED ALL TEXT FILES INTO ONE AND CONVERTED TO PDF EXTERNALLY

In [7]:
import glob

In [12]:
def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

In [14]:
!pip install pypdf

Collecting pypdf
  Obtaining dependency information for pypdf from https://files.pythonhosted.org/packages/c9/d1/450b19bbdbb2c802f554312c62ce2a2c0d8744fe14735bc70ad2803578c7/pypdf-4.2.0-py3-none-any.whl.metadata
  Downloading pypdf-4.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
   ---------------------------------------- 0.0/290.4 kB ? eta -:--:--
   - -------------------------------------- 10.2/290.4 kB ? eta -:--:--
   -- ------------------------------------ 20.5/290.4 kB 682.7 kB/s eta 0:00:01
   ------ -------------------------------- 51.2/290.4 kB 440.4 kB/s eta 0:00:01
   ---------- ---------------------------- 81.9/290.4 kB 512.0 kB/s eta 0:00:01
   -------------------- ----------------- 153.6/290.4 kB 766.6 kB/s eta 0:00:01
   -------------------------- ----------- 204.8/290.4 kB 831.5 kB/s eta 0:00:01
   -------------------------------------- 290.4/290.4 kB 997.2 kB/s eta 0:00:00
Installing collected packages: pypdf
Successfully inst

In [59]:
final = "final_output"
doc=read_doc(final)


In [60]:
len(doc)

122

BREAKING THE CORPUS INTO CHUNKS

In [122]:
def data_chunk(docs,chunk_size=1500,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return docs

In [63]:
documents=data_chunk(docs=doc)
len(documents)

122

IMPORTING LLM AND EMBEDDING MODEL

In [7]:
from openai import OpenAI
client = OpenAI(api_key='MY_KEY')

In [8]:
def get_embedding(text, model="text-embedding-3-small"):
    response = client.embeddings.create(input=[text], model=model).data[0].embedding
    return response

In [9]:
test_embedding = get_embedding("how are you?")

In [67]:
test_embedding

[0.025025203824043274,
 -0.02335367351770401,
 -0.01035752147436142,
 0.03197399526834488,
 -0.012954363599419594,
 -0.02094189263880253,
 0.03195011615753174,
 0.01533032488077879,
 -0.05138763040304184,
 -0.0467551052570343,
 -0.022649241611361504,
 -0.016870521008968353,
 -0.027604136615991592,
 0.004886242561042309,
 -0.010524674318730831,
 0.03622445836663246,
 -0.006536879111081362,
 -0.000730548519641161,
 0.01648845709860325,
 0.007521888241171837,
 0.020524010062217712,
 -0.007115945219993591,
 0.016046695411205292,
 0.016966037452220917,
 0.03147253766655922,
 0.00131110695656389,
 0.01679888367652893,
 0.04126293212175369,
 -0.009491906501352787,
 -0.03756168484687805,
 0.02676837146282196,
 -0.03837357088923454,
 0.031830720603466034,
 0.01322897244244814,
 0.011372379027307034,
 0.015509417280554771,
 -0.048331119120121,
 0.022768637165427208,
 0.010423188097774982,
 -0.055447064340114594,
 -0.026840008795261383,
 -0.042886704206466675,
 0.020882194861769676,
 0.0342425033

In [2]:
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key="MY_KEY")

In [3]:
index_name="autism-rag-model"
index = pc.Index("autism-rag-model")

In [72]:
documents[0]

Document(page_content='Ouss et al. Translational Psychiatry   10:54   \nhttps://doi.org/10.1038/s41398-020-0743-8  \nTranslational Psychiatry  \n  \nA R T I C L E  \nO p e n A c c e s s  \n  \nBehavior and interaction imaging at 9 months of  \nage predict autism/intellectual disability in high-risk  \ninfants with West syndrome  \n  \nLisa Ouss1, Giuseppe Palestra  \n2, Catherine Saint-Georges2,3, Marluce Leitgel Gille1, Mohamed Afshar4,  \nHugues Pellerin2, Kevin Bailly2, Mohamed Chetouani2, Laurence Robel1, Bernard Golse1, Rima \nNabbout5,  \nIsabelle Desguerre5, Mariana Guergova-Kuras4 and David Cohen  \n2,3  \n  \nAbstract  \nAutomated behavior analysis are promising tools to overcome current assessment limitations in \npsychiatry. At  \n9 months of age, we recorded 32 infants with West syndrome  and 19 typically developing  controls during\na  \nstandardized mother±infant interaction. We computed infant hand movements , speech turn taking of \nboth  \npartners  and motherese. Then

In [78]:
a= get_embedding(str(documents[0]))

In [86]:
string = (str(documents[0])).replace("\n", "")
print(string)


page_content='Ouss et al. Translational Psychiatry   10:54   \nhttps://doi.org/10.1038/s41398-020-0743-8  \nTranslational Psychiatry  \n  \nA R T I C L E  \nO p e n A c c e s s  \n  \nBehavior and interaction imaging at 9 months of  \nage predict autism/intellectual disability in high-risk  \ninfants with West syndrome  \n  \nLisa Ouss1, Giuseppe Palestra  \n2, Catherine Saint-Georges2,3, Marluce Leitgel Gille1, Mohamed Afshar4,  \nHugues Pellerin2, Kevin Bailly2, Mohamed Chetouani2, Laurence Robel1, Bernard Golse1, Rima \nNabbout5,  \nIsabelle Desguerre5, Mariana Guergova-Kuras4 and David Cohen  \n2,3  \n  \nAbstract  \nAutomated behavior analysis are promising tools to overcome current assessment limitations in \npsychiatry. At  \n9 months of age, we recorded 32 infants with West syndrome  and 19 typically developing  controls during\na  \nstandardized mother±infant interaction. We computed infant hand movements , speech turn taking of \nboth  \npartners  and motherese. Then, we asse

In [79]:
a

[0.005101347807794809,
 0.017086267471313477,
 0.030622517690062523,
 0.04793968051671982,
 -0.034605465829372406,
 -0.007150545250624418,
 0.044851452112197876,
 0.02646639756858349,
 -0.02301739528775215,
 0.01953953318297863,
 0.004747788887470961,
 -0.0764264166355133,
 -0.029410315677523613,
 0.008579211309552193,
 -0.01574418693780899,
 0.02285865508019924,
 0.01105412282049656,
 -0.017721230164170265,
 0.005436867941170931,
 0.04537096619606018,
 0.016913095489144325,
 -0.001682109897956252,
 -0.006129554472863674,
 -0.025817004963755608,
 0.03552904725074768,
 -0.07209712266921997,
 0.0027274531312286854,
 -0.0026210248470306396,
 0.02396984025835991,
 0.05402955040335655,
 0.008947201073169708,
 -0.030016416683793068,
 -0.024806836619973183,
 -0.02639424242079258,
 -0.05475109815597534,
 0.017706800252199173,
 -0.02124238759279251,
 -0.0029583487194031477,
 -0.0337396077811718,
 -0.009899645112454891,
 0.024027563631534576,
 -0.041128262877464294,
 -0.004433915484696627,
 -0.0

CONVERTING THE CHUNKS INTO WORD EMBEDDINGS AND STORING IN PINECONE VECTORDB

In [88]:
id_counter = 0
for doc in documents:
  embedding = get_embedding(str(doc))
  index.upsert(vectors=[{"id": f"vec_{id_counter}", "values": embedding}])  # Specify namespace
  id_counter += 1

print("Document Embeddings Stored in Pinecone!")


  


Document Embeddings Stored in Pinecone!


IMPORTING THE QUESTION SET

In [4]:
df = pd.read_csv("Query Questions - Sheet1.csv")

<IPython.core.display.Javascript object>

In [5]:
questions=[]
for question in df['Questions ']:
    questions.append(question)

In [6]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

In [10]:
def retrieve_query(query1):
    xq=get_embedding(query1)
    matching_results=index.query(vector=[xq],top_k=5,include_metadata=True)
    return matching_results

IMPORTING LANGCHAIN TO INTEGRATE THE DATABASE AND THE LLM

In [11]:
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI

In [12]:
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate

In [13]:
llm=OpenAI(openai_api_key="MY_KEY",model_name="text-davinci-003",temperature=0.6)
chain=load_qa_chain(llm,chain_type="stuff")

In [28]:
def retrieve_answers(query):
    doc_search=retrieve_query(query)
    print(doc_search)


In [29]:
retrieve_answers(questions[1])

{'matches': [{'id': 'vec_67', 'score': 0.458019227, 'values': []},
             {'id': 'vec_92', 'score': 0.450936526, 'values': []},
             {'id': 'vec_51', 'score': 0.445084184, 'values': []},
             {'id': 'vec_98', 'score': 0.436692178, 'values': []},
             {'id': 'vec_29', 'score': 0.431463093, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}


In [15]:
questions

['What are the variety of Multimodal and Multi-modular AI Approaches to Streamline Autism Diagnosis in Young Children',
 'What is Autism Spectrum Disorder, how it is caysed?',
 'What is the cure of Autism Spectrum Disorder',
 'What are Stereotypical and maladaptive behaviors in Autism Spectrum, how are these detected and managed',
 'How relevant is eye contact and how it can be used to detect Autism',
 'How can cross country trials help in development of Machine learning based Multimodal solutions ',
 'How early infants cry can help in the early detection of Autism ',
 'What are various methods to detect  Atypical Pattern of Facial expression in Children ',
 'What kind of facial expressions can be used to detect Autism Disorder in children',
 'What are methods to detect Autism from home videos ',
 'What is Still-Face Paradigm in Early Screening for High-Risk Autism Spectrum Disorder',
 'What is West Syndrome? ',
 'What is the utility of Behavior and interaction imaging at 9 months of a

In [113]:
query = questions[1]

RETRIEVING RESPONSES FROM QUERIES

In [37]:
for i, question in enumerate(questions, start=1):
  print(f"The answer vectors to question {i} is:")
  print(retrieve_answers(question))
  print()  


The answer vectors to question 1 is:
{'matches': [{'id': 'vec_51', 'score': 0.659381509, 'values': []},
             {'id': 'vec_101', 'score': 0.625653684, 'values': []},
             {'id': 'vec_52', 'score': 0.623782277, 'values': []},
             {'id': 'vec_63', 'score': 0.604479432, 'values': []},
             {'id': 'vec_34', 'score': 0.594145536, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}
None

The answer vectors to question 2 is:
{'matches': [{'id': 'vec_67', 'score': 0.458019227, 'values': []},
             {'id': 'vec_92', 'score': 0.450936526, 'values': []},
             {'id': 'vec_51', 'score': 0.445084184, 'values': []},
             {'id': 'vec_98', 'score': 0.436692178, 'values': []},
             {'id': 'vec_29', 'score': 0.431463093, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}
None

The answer vectors to question 3 is:
{'matches': [{'id': 'vec_67', 'score': 0.509632707, 'values': []},
             {'id': 'vec_51', 'score': 0.50