# RAG

## 1. Load DATA

In [None]:
import pandas as pd
import transformers
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import evaluate
import numpy as np
import pandas as pd
import openpyxl
import os
from PyPDF2 import PdfReader
import glob

In [None]:
file_path = './data/quantitative_value_with_context_CC.xlsx'

# Load the two annotation worksheets of the workbook.
missing_value_df = pd.read_excel(file_path, sheet_name='missing value')
quantitative_value_df = pd.read_excel(file_path, sheet_name='quantitative_value_with context')

# Tag every row with the worksheet it comes from.
missing_value_df['source'] = 'missing_value'
quantitative_value_df['source'] = 'quantitative_value_with_context'

# Drop the 3 rows (index labels 53 to 55) that hold one of Claudia's comments instead of data:
#   "Missing values not found by the script. Articles: MB7, MB8, CC6-15-17-23-32"
missing_value_df = missing_value_df.drop(index=range(53, 56))

# Only the quantitative worksheet is used below; missing_value_df is loaded but not merged in.
df = quantitative_value_df

# Keep the rows annotated as relevant. The column is lower-cased before the comparison,
# so only lower-case candidates can ever match.
relevance = df["Relevant context for MOOD data extraction"].str.lower()
usable_covariates = df[relevance.isin(["yes"])]

# Split the usable rows by where the information was extracted from.
# Candidates are written in lower case: the previous "Figure caption" entry could never
# match a lower-cased value, so "Figure caption" rows were missing from figure_annotations.
extraction_source = usable_covariates["Mood extraction from Table/Figure"].str.lower()
# .copy(): later cells add columns to full_text_annotation, which must not be a slice.
full_text_annotation = usable_covariates[extraction_source.isin(["no"])].copy()
table_annotations = usable_covariates[extraction_source.isin(["table", "table and caption"])]
figure_annotations = usable_covariates[extraction_source.isin(["figure", "figure caption"])]

In [None]:
# Preview the first row of the annotation dataframe.
df.head(1)

In [None]:
# Summary statistics of the "context" column for the rows kept as usable covariates.
usable_covariates[["context"]].describe()


In [None]:
# Build, by hand, the document string for the first full-text annotation.
first_row = full_text_annotation.iloc[0]

# Resolve the covariate column by its stripped name: the header was referenced with and
# without trailing whitespace in this notebook, so an exact-key lookup is fragile.
covariate_column = next(
    (
        column
        for column in full_text_annotation.columns
        if str(column).strip() == "non-standardized covariate in the context"
    ),
    None,
)
if covariate_column is None:
    raise KeyError("non-standardized covariate in the context")

# The covariate text is only appended when a covariate was flagged as found in the text.
covariate_text = first_row[covariate_column] if first_row["covariate_found_in_text"] else ""
d = f'{first_row["context"]} \t covariable found: {first_row["covariate_found_in_text"]}: {covariate_text}'
d

In [None]:
# Work on an explicit copy so the new columns are not written to a filtered slice
# of usable_covariates (avoids pandas' SettingWithCopy behaviour).
full_text_annotation = full_text_annotation.copy()

# Resolve the covariate column by its stripped name: the header was referenced with and
# without trailing whitespace in this notebook, so an exact-key lookup is fragile.
covariate_column = next(
    (
        column
        for column in full_text_annotation.columns
        if str(column).strip() == "non-standardized covariate in the context"
    ),
    None,
)
if covariate_column is None:
    raise KeyError("non-standardized covariate in the context")

# NOTE(review): the condition tests the relevance column, which is presumably always
# truthy here because the rows were filtered on it; "covariate_found_in_text" may have
# been intended. Behaviour is kept as written; confirm with the annotation owner.
full_text_annotation["covariate_text"] = full_text_annotation.apply(
    lambda row: row[covariate_column] if row["Relevant context for MOOD data extraction"] else "",
    axis=1,
)

# One document per row: the context followed by the covariate annotation.
full_text_annotation["docs"] = full_text_annotation.apply(
    lambda row: f'{row["context"]} \t covariable found: {row["covariate_found_in_text"]}: {row["covariate_text"]}',
    axis=1,
)

print(full_text_annotation["docs"].head(1))


In [None]:
# Materialise the "docs" column as a plain Python list and report its size.
docs = full_text_annotation["docs"].tolist()
print(f"Nb of docs: {len(docs)}")
docs

In [None]:
# Load the previously saved DatasetDict from disk; its "test" split is read in the next cell.
dataset_dict = DatasetDict.load_from_disk("./data/annotation_generated_from_xlsx/annotation.dataset")
dataset_dict

In [None]:
from difflib import SequenceMatcher
import unicodedata

# Split `docs` into a test list (docs that closely match a sentence of the dataset's
# "test" split) and a train list (everything else, later indexed for retrieval).

# Normalise every doc once instead of once per test sentence.
normalized_docs = [unicodedata.normalize("NFKD", doc.lstrip()).lower() for doc in docs]

# Positions in `docs` already assigned to the test split. Tracking positions (rather than
# calling list.remove) means a doc matched by several test sentences is moved only once
# and cannot raise ValueError on the second match.
test_doc_indices = set()
test_docs_list = []

for i, sentence in enumerate(dataset_dict["test"]["tokens"]):
    s = " ".join(sentence)
    print(f"{i}: {s}")
    s_lower = s.lower()
    for doc_index, (doc, doc_lower) in enumerate(zip(docs, normalized_docs)):
        if doc_index in test_doc_indices:
            continue
        # Cheap first-character check before the costly similarity ratio;
        # slicing ([:1]) is safe on empty strings, unlike indexing ([0]).
        if doc_lower[:1] != s_lower[:1]:
            continue
        if SequenceMatcher(None, doc_lower, s_lower).ratio() > 0.7:
            print(f"\t found !: {doc}")
            test_doc_indices.add(doc_index)
            test_docs_list.append(doc)

train_docs_list = [doc for doc_index, doc in enumerate(docs) if doc_index not in test_doc_indices]


In [None]:
# Number of docs left for the retrieval index once the test docs were removed.
len(train_docs_list)

## 2. Prepare RAG pipeline

In [None]:
#from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
#from langchain.chains import RetrievalQA

In [None]:
# Sentence-embedding model used to vectorise the documents.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Model options: run on the CPU.
model_kwargs = dict(device='cpu')

# Encoding options: 'normalize_embeddings' is switched off.
encode_kwargs = dict(normalize_embeddings=False)

# Embedding wrapper built from the three settings above.
embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [None]:
# Split the training docs with chunk_size=1000 and chunk_overlap=150,
# then wrap the pieces as documents for the vector store.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs_for_rag = text_splitter.create_documents(train_docs_list)

# Build the FAISS index from the split documents, embedded with `embeddings`.
db = FAISS.from_documents(docs_for_rag, embeddings)

In [None]:
# Retriever over the index, configured with k=5 (presumably the number of chunks returned per query).
retriever = db.as_retriever(search_kwargs={"k": 5})

## 3. RAG active

In [None]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser
from langchain import hub
from langchain.chat_models import ChatOpenAI
from getpass import getpass

# Prompt for the OpenAI key interactively so it is never written into the notebook.
openai_api_key = getpass("OpenAI API Key: ")


In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt text, written line by line so the trailing spaces and newlines are explicit.
template = (
    "\n"
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context as example to help to extract covariates or risk factors "
    "from the sentence only (don't extract from the context please). "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:\n"
)
prompt_template = ChatPromptTemplate.from_template(template)

# GPT-4 chat model used at temperature 0.
gpt4_llm = ChatOpenAI(model_name="gpt-4", temperature=0, openai_api_key=openai_api_key)

# Chain: the retriever fills "context", the raw input is passed through as "question",
# then prompt -> GPT-4 -> plain string output.
rag_chain_gpt4 = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt_template
    | gpt4_llm
    | StrOutputParser()
)

In [None]:
# Hand-picked example sentence used as a smoke test for the GPT-4 chain.
# NOTE(review): "ï¬ltered" looks like a mis-decoded "fi" ligature ("filtered"); the text is left as is.
sentence = """g10 (i.e. a 90% reduction in infectivity). These Rt values were based on the averages reported by Keeler et al. (2014) for resistance of 9 LPAIV strains in distilled, ï¬ltered and natural water (67.1, 3.1 and 30.0 days, respectively), and were also in accordance with previously published data (e.g. Stallknecht et al., 1991; Brown et al., 2009; Lebarbenchon et al., 2012 ). 2.7. The numerical model I"""

# Ask the chain whether the sentence contains a covariate; the answer is displayed as the cell output.
rag_chain_gpt4.invoke("Is there any covariate (or risk factor) in this following sentence: \n " + sentence)

In [None]:
# Query the GPT-4 chain once per held-out document and keep the answers in order.
# A plain loop is kept so that answers already collected survive a failing call.
question_prefix = "Is there any covariate (or risk factor) in this following sentence: \n "
results_list = []

for sentence in test_docs_list:
    results_list.append(rag_chain_gpt4.invoke(question_prefix + sentence))

In [None]:
# Display every GPT-4 answer.
results_list

In [None]:
# Display one test document (position 5) for manual inspection.
test_docs_list[5]

In [None]:
# Count the answers that start with "yes" (case-insensitive) and report the share they
# represent among the test documents.
covariate_found = sum(1 for res in results_list if res.lower().startswith("yes"))

# Guard against an empty test set, which would otherwise raise ZeroDivisionError.
nb_test_docs = len(test_docs_list)
percentage_found = 100 * covariate_found / nb_test_docs if nb_test_docs else 0.0
print(f"Nb of covariate found: {covariate_found} | {percentage_found}")

In [None]:
# Print each test document followed by the GPT-4 answer it received.
for s, r in zip(test_docs_list, results_list):
    print(f"{s} \n \t{r}")

In [None]:
# Pull the "rlm/rag-prompt" prompt from the LangChain hub and display it.
t = hub.pull("rlm/rag-prompt")
t

### Same with GPT-3.5

In [None]:
# gpt-3.5-turbo chat model used at temperature 0.
gpt35_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=openai_api_key)

# Chain: the retriever fills "context", the raw input is passed through as "question",
# then prompt -> GPT-3.5 -> plain string output.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt_template
    | gpt35_llm
    | StrOutputParser()
)

In [None]:
# Query the GPT-3.5 chain once per held-out document and keep the answers in order.
# A plain loop is kept so that answers already collected survive a failing call.
question_prefix = "Is there any covariate (or risk factor) in this following sentence: \n "
results_list_gpt3_5 = []

for sentence in test_docs_list:
    results_list_gpt3_5.append(rag_chain.invoke(question_prefix + sentence))

In [None]:
# Print each test document followed by the GPT-3.5 answer it received.
for s, r in zip(test_docs_list, results_list_gpt3_5):
    print(f"{s} \n \t{r}")

In [None]:
# Display every GPT-3.5 answer.
results_list_gpt3_5

## 4. Infer on all diseases

In [None]:
from grobid.tei import Parser
import os
import json

In [None]:
# Root folder of the TEI files; the inference loop appends one sub-folder per disease.
tei_files_path = "./data/grobid/"

# Single TEI file kept as an example path (not referenced by the cells shown here).
tei_file_example = "./data/grobid/CC10 Aerosol Susceptibility of Influenza Virus to UV-C Light.pdf.tei.xml"

# Diseases to process; each name is used as a sub-folder name and in the output CSV name.
# list_of_diseases = ["chikungunya", "leptospirosi", "influenza" ]
list_of_diseases = ["influenza" ]
# Section titles (lower case) kept by chunk_relevant_sections, which handles the abstract separately.
sections_to_keep = ["abstract", "results", "discussion"]
# Chunk length, in characters, used when paragraphs are cut into pieces.
chunk_size = 256

# Models to run; each name becomes a column of the result dataframe.
list_of_base_models = ["GPT-3.5", "GPT-4"]
# list_of_base_models = ["GPT-4"]
# list_of_base_models = ["GPT-3.5"]


In [None]:
def _chunk_paragraphs(paragraphs, section_title, base_metadata, size):
    """Cut each paragraph into consecutive ``size``-character chunks tagged with metadata."""
    chunks = []
    for paragraph_nb, paragraph in enumerate(paragraphs or []):
        text = paragraph.get("text") or ""
        for chunk_nb, start in enumerate(range(0, len(text), size)):
            chunk_with_md = dict(base_metadata)
            chunk_with_md["section"] = section_title
            chunk_with_md["paragraph_nb"] = paragraph_nb
            # Integer rank of the chunk inside its paragraph (was the float i/chunk_size).
            chunk_with_md["chunk_nb"] = chunk_nb
            # Slices are end-exclusive: [start:start + size] yields exactly `size`
            # characters and loses none at the chunk boundary.
            chunk_with_md["text"] = text[start:start + size]
            chunks.append(chunk_with_md)
    return chunks


def chunk_relevant_sections(tei_file):
    """Parse a GROBID TEI file and return its relevant text as fixed-size chunks.

    The abstract and every section whose lower-cased title is listed in the
    module-level ``sections_to_keep`` are cut, paragraph by paragraph, into
    pieces of ``chunk_size`` characters (module-level setting).

    Args:
        tei_file: Path of the TEI XML file to read.

    Returns:
        A list of dicts with the keys ``title`` (file name), ``id`` (file name
        up to its first space), ``section``, ``paragraph_nb``, ``chunk_nb`` and
        ``text``. The list is empty when nothing relevant is found.
    """
    # os.path.basename copes with the platform's path separator, unlike split("/").
    file_name = os.path.basename(tei_file)
    md = {
        "title": file_name,
        "id": file_name.split(" ")[0],
    }

    # Load the TEI document into a plain Python dict.
    with open(tei_file, "rb") as xml_file:
        xml_content = xml_file.read()
    article = json.loads(Parser(xml_content).parse().to_json())

    # Abstract first; an article without abstract simply contributes no chunk.
    abstract = article.get("abstract") or {}
    chunks = _chunk_paragraphs(abstract.get("paragraphs"), "abstract", md, chunk_size)

    # Then the body sections whose title is one of the sections to keep.
    for section in article.get("sections") or []:
        title = section.get("title") or ""
        if title.lower() in sections_to_keep:
            chunks.extend(_chunk_paragraphs(section.get("paragraphs"), title, md, chunk_size))

    return chunks

def extract_covariates(text):
    """Collect the covariates announced in an LLM answer.

    Every line containing ``Covariate found:`` or ``No covariate found:``
    contributes the text that follows that marker, stripped of surrounding
    whitespace and of trailing dots.

    Args:
        text: The raw answer. Anything that is not a string (e.g. a missing
            value) is treated as an empty answer.

    Returns:
        A one-element list holding the extracted values joined by ``", "``
        (``[""]`` when nothing was extracted).
    """
    if not isinstance(text, str):
        return ['']

    markers = ('Covariate found:', 'No covariate found:')
    covariates = []
    for line in text.split('\n'):
        hits = [(line.find(marker), marker) for marker in markers if marker in line]
        if not hits:
            continue
        # Read what follows the first marker itself, not the second ': '-separated field:
        # this copes with a marker that has no value and with prefixes such as "Answer: ".
        position, marker = min(hits)
        value = line[position + len(marker):].strip().rstrip('.')
        if value:
            covariates.append(value)

    # Join the covariates into a single string
    return [', '.join(covariates)]


In [None]:
# Run both chains over every chunk of every TEI file, one CSV per disease.
question_prefix = "Is there any covariate (or risk factor) in this following sentence: \n "
# Extra instruction for GPT-4. It starts on a new line so it is no longer glued to the
# last word of the sentence, and the example list is now properly quoted.
gpt4_suffix = " \nplease provide only a python list like '[temperature, humidity]' without any explanation"

for disease in list_of_diseases:
    tei_files_path_disease = os.path.join(tei_files_path, disease)

    list_of_chunk = []
    # Sorted for a reproducible row order; only XML files are parsed so that stray
    # entries of the folder (hidden files, PDFs, ...) do not break the parser.
    for doc in sorted(os.listdir(tei_files_path_disease)):
        if not doc.lower().endswith(".xml"):
            continue
        list_of_chunk.extend(chunk_relevant_sections(os.path.join(tei_files_path_disease, doc)))
    df = pd.DataFrame(list_of_chunk)

    # Without any chunk the dataframe has no "text" column: skip instead of failing.
    if df.empty:
        print(f"No chunk found for: {disease}")
        continue

    for pretrained_model in list_of_base_models:
        print(f"Work on: {pretrained_model}")
        model_answers = []

        for sentence in df["text"]:
            if pretrained_model == 'GPT-3.5':
                res = rag_chain.invoke(question_prefix + sentence)
            else:
                res = rag_chain_gpt4.invoke(question_prefix + sentence + gpt4_suffix)
            model_answers.append(res)

        # One answer per row, in row order.
        df[pretrained_model] = model_answers
        if pretrained_model == 'GPT-3.5':
            # GPT-3.5 does not follow the formatting instructions well, so its answers are parsed.
            df[pretrained_model] = df[pretrained_model].apply(extract_covariates)

    df.to_csv(f"./data/whole_inference_llm_{disease}.csv")



In [None]:
# df[pretrained_model] = df[pretrained_model].apply(extract_covariates)
# df.to_csv(f"./data/whole_inference_llm_{disease}.csv")