In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

In [None]:
def search_pubmed(query, max_results=1000):
    """Search PubMed Central for open-access articles matching ``query``.

    Args:
        query: Free-text search term. It is sent as a request parameter, so
            spaces and reserved characters are URL-encoded by ``requests``.
        max_results: Maximum number of IDs to request (ESearch ``retmax``).

    Returns:
        List of ID strings taken from every ``<Id>`` element of the response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    ### API ENDPOINT DIRECTLY DERIVED FROM URL: https://www.ncbi.nlm.nih.gov/books/NBK25500/#chapter1.Searching_a_Database
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    search_url = f"{base_url}esearch.fcgi"
    # To search PubMed instead, use db="pubmed" and drop the open-access filter.
    params = {
        "db": "pmc",
        "term": f"{query} AND open access[filter]",
        "retmax": max_results,
        "usehistory": "y",
    }
    ### END OF API ENDPOINT DIRECTLY DERIVED FROM URL

    print("Initiate paper-ids fetch...")
    response = requests.get(search_url, params=params, timeout=60)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    print("Completed id fetch.")
    root = ET.fromstring(response.content)

    id_list = [id_elem.text for id_elem in root.findall(".//Id")]
    print("Total IDs: ", len(id_list))

    return id_list

# (output key, substrings looked for in a section's ``sec-type`` attribute)
_SECTION_KEYWORDS = (
    ("methods_materials", ("methods", "material")),
    ("results", ("results",)),
    ("discussion", ("discussion",)),
    ("conclusion", ("conclusion",)),
)


def _article_id(article, id_type):
    """Return the text of the article-id with the given pub-id-type, or None."""
    elem = article.find(f".//article-meta/article-id[@pub-id-type='{id_type}']")
    if elem is None or not elem.text:
        return None
    return elem.text


def _section_text(sec):
    """Return a section's heading and the leading text of its paragraphs."""
    heading_elem = sec.find(".//title")
    # A missing or empty title gives an empty heading, not the literal "None".
    heading = heading_elem.text if heading_elem is not None and heading_elem.text else ""
    paragraphs = " ".join(p.text for p in sec.findall(".//p") if p.text is not None)
    return heading + "\n" + paragraphs


def fetch_paper_details(id_list, fetch_full_text = False):
    """Fetch article XML from PMC for ``id_list`` and extract one record per article.

    Args:
        id_list: Iterable of PMC IDs (a list or a numpy array of strings).
        fetch_full_text: Currently unused; kept for interface compatibility.

    Returns:
        List of dicts with keys ``pmid``, ``pmcid``, ``title``, ``abstract``,
        ``authors``, ``methods_materials``, ``results``, ``discussion``,
        ``conclusion`` and ``fulltext``. An empty list is returned when
        ``id_list`` is empty or when the request or XML parsing fails, so the
        caller can decide whether to retry.
    """
    # Normalise first: truth-testing a multi-element numpy array raises.
    ids = [str(paper_id) for paper_id in id_list]
    if not ids:
        return []

    ### API ENDPOINT DIRECTLY DERIVED FROM URL: https://www.ncbi.nlm.nih.gov/books/NBK25500/#chapter1.Searching_a_Database
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    fetch_url = f"{base_url}efetch.fcgi"
    payload = {"db": "pmc", "id": ",".join(ids), "retmode": "xml"}
    ### END OF API ENDPOINT DIRECTLY DERIVED FROM URL

    print("Start fetching papers. Total request count: ", len(ids))
    try:
        # POST keeps long ID lists out of the URL.
        response = requests.post(fetch_url, data=payload, timeout=120)
    except requests.RequestException as exc:
        print(f"Request failed: {exc}")
        return []

    if response.status_code != 200:
        print(response.status_code)
        return []
    print("Completed request")

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as exc:
        print(f"Could not parse response: {exc}")
        return []

    articles = root.findall(".//article")
    print("Total article count: ", len(articles))

    papers = []
    for article in tqdm(articles):
        pmid = _article_id(article, "pmid")
        pmcid = _article_id(article, "pmc")

        # itertext() keeps text that follows inline markup (italics, sub/sup),
        # which ``.text`` alone would cut off.
        title_elem = article.find(".//article-title")
        title = None
        if title_elem is not None:
            title = " ".join("".join(title_elem.itertext()).split()) or None

        # Compare with None: an Element without children is falsy.
        abstract_elem = article.find(".//abstract")
        abstract = None
        if abstract_elem is not None:
            abstract = " ".join(abstract_elem.itertext())

        fulltext = "\n".join(
            str(e.text)
            for e in find_tags(article.findall(".//body"), {'title', 'p'})
            if e.text
        )

        sections = {key: None for key, _ in _SECTION_KEYWORDS}
        for sec in article.findall(".//body//sec"):
            sec_type = sec.get("sec-type")
            if sec_type is None:
                continue
            for key, keywords in _SECTION_KEYWORDS:
                if not any(word in sec_type for word in keywords):
                    continue
                # "supplementary-material" contains "material" but is not a
                # methods section.
                if key == "methods_materials" and "supplementary" in sec_type:
                    continue
                # A later matching section overwrites an earlier one.
                sections[key] = _section_text(sec)

        # Reset per article so a paper without a contrib-group does not
        # inherit the previous paper's authors.
        authors = None
        contrib_group = article.find(".//contrib-group")
        if contrib_group is not None:
            authors = ", ".join(
                " ".join(name.itertext())
                for name in contrib_group.findall(".//contrib/name")
            )

        papers.append({
            "pmid": pmid,
            "pmcid": pmcid,
            "title": title,
            "abstract": abstract,
            "authors": authors,
            "methods_materials": sections["methods_materials"],
            "results": sections["results"],
            "discussion": sections["discussion"],
            "conclusion": sections["conclusion"],
            "fulltext": fulltext
        })

    return papers

def find_tags(element, tags):
    """Collect, in document order, every descendant whose tag is in ``tags``.

    ``element`` may be an Element or any iterable of Elements; the items of
    ``element`` itself are inspected as well as everything nested below them.
    """
    matches = []
    # Explicit stack instead of recursion; children are pushed reversed so
    # they are popped first-to-last, giving a pre-order walk.
    pending = list(element)[::-1]
    while pending:
        node = pending.pop()
        if node.tag in tags:
            matches.append(node)
        pending.extend(reversed(list(node)))
    return matches


# Rough Testing

In [None]:
# Scratch check: run a small (20-result) open-access PMC search by hand.
query = "Alzheimer's Disease Biomarkers"
# id_list = search_pubmed(query, max_results=1000)
# chunks = np.array_split(id_list, 3)

base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
# search_url = f"{base_url}esearch.fcgi?db=pubmed&term={query}&retmax={max_results}&usehistory=y"
search_url = f"{base_url}esearch.fcgi?db=pmc&term={query}+AND+open+access[filter]&retmax={20}&usehistory=y"

response = requests.get(search_url)
root = ET.fromstring(response.content)
# Every <Id> element in the response carries one article ID.
id_list = [id_elem.text for id_elem in root.findall(".//Id")]

In [None]:
# Fetch the XML records for the IDs found by the scratch search. `chunks` is
# only assigned in a later cell, so on a fresh kernel it would be undefined
# here; use `id_list` directly.
search_url = f"{base_url}efetch.fcgi?db=pmc&id={','.join(id_list)}&retmode=xml"
response = requests.get(search_url)

In [None]:
# Parse the efetch response body into an element tree for manual inspection.
root = ET.fromstring(response.content)

In [None]:

# articles = root.findall(".//article")
# for article in tqdm(articles):
    # pmid = article.find(".//article-meta/article-id[@pub-id-type='pmid']")
    # pmid = pmid.text if pmid.text else None
    # pmcid = article.find(".//article-meta/article-id[@pub-id-type='pmc']")
    # pmcid = pmcid.text if pmcid else None

    # title = article.find(".//article-title").text


    # abstract = article.find(".//abstract")
    # abstract = " ".join(list(abstract.itertext())) if abstract else None
    # fulltext = "\n".join([str(e.text) for e in find_tags(article.findall(".//body"), {'title', 'p'}) if e.text])


In [None]:
# Search for papers
query = "Alzheimer's Disease Biomarkers"
id_list = search_pubmed(query, max_results=1000)
chunks = np.array_split(id_list, 3)
df = None

# Fetch paper details
for chunk in chunks:
    papers = fetch_paper_details(chunk)
    if len(chunk) != 0 and len(papers) == 0:
        print("Retrying scrape")
        time.sleep(3)
        papers = fetch_paper_details(chunk)
    print(f"Starting merge, papers scraped: {len(papers)}")
    df = pd.concat([df, pd.DataFrame(papers)])
    time.sleep(3)
df.reset_index(drop=True, inplace=True)


Initiate paper-ids fetch...
Completed id fetch.
Total IDs:  1000
Start fetching papers. Total request count:  334
Completed request
Total article count:  334


100%|██████████| 334/334 [00:00<00:00, 1061.81it/s]


Starting merge, papers scraped: 334
Start fetching papers. Total request count:  333
Completed request
Total article count:  333


100%|██████████| 333/333 [00:00<00:00, 1140.44it/s]


Starting merge, papers scraped: 333
Start fetching papers. Total request count:  333
Completed request
Total article count:  333


100%|██████████| 333/333 [00:00<00:00, 444.81it/s]


Starting merge, papers scraped: 333


In [None]:
# Display the scraped papers.
df

Unnamed: 0,pmid,pmcid,title,abstract,authors,methods_materials,results,conclusion,fulltext
0,39377039,11458320,Advancements and Innovative Strategies in Indu...,The effectiveness and safety of mesenchymal st...,"Shi Xiaoyu, Zhang Kun, Yu Fengshi, Qi Qi, Cai ...",,,,1. Introduction\nRegenerative medicine and cel...
1,39374157,11457879,Neuroprotective effects of punicalagin and/or ...,"Abstract Background Manganism, a central nervo...","Abu‐Elfotuh Karema, Abbas Ashwaq N., Najm Mazi...",Supporting information\n\nData S1.\n,"RESULTS\nIn this work, three separate normal c...","CONCLUSIONS\nIn conclusion, as shown in the gr...",INTRODUCTION\nManganese is a vital trace eleme...
2,38869924,11457624,Impact of a keto diet on symptoms of Parkinson...,Aim: Evidence suggests low-carbohydrate diets...,"Tidman Melanie M, White Dawn Reid, White Tim A",Supplementary Material\n,Results\nFasting blood work to assess common b...,Conclusion\nIn this 24-week dietary interventi...,Background\nParkinson's disease (PD) is the nu...
3,39363536,11457610,Role of RNA polymerase III transcription and r...,ABSTRACT Ischaemic stroke is a leading cause o...,"Tsang Chi Kwan, Zheng X.F. Steven",,,,"Introduction\nStroke, defined as prolonged acu..."
4,39308280,11457211,Investigating patient eligibility for anti-amy...,Background Pharmacological treatment options f...,"Defrancesco Michaela, Gizewski Elke R., Manges...","Method\nThis was a retrospective, observationa...",Results\nA total of 587 out-patients attended ...,,"Since the approval of donepezil in 1997, pharm..."
...,...,...,...,...,...,...,...,...,...
995,39328348,11424528,Optimal dose and type of exercise to improve c...,Background Mild cognitive impairment (MCI) rep...,"Yu Yingying, Wang Junjie, Xu Jian",,"Results\nAs a result of the search strategy, 8...",Conclusion\nThis network meta-analysis has sho...,Introduction\nElderly people with MCI\nAs the ...
996,39328407,11424418,Identification of cellular senescence-related ...,Background Intervertebral disc degeneration (I...,"Wang Muyi, Wang Hao, Wang Xin, Shen Yifei, Zho...",Materials and methods\nThe transcriptome profi...,Results\nAfter conducting PCA analysis on the ...,,"Introduction\nLow back pain (LBP), a prevalent..."
997,39323550,11423858,The roles of TAF1 in neuroscience and beyond,The transcriptional machinery is essential for...,"Crombie Elisa M., Cleverley Karen, Timmers H. ...",,,,Introduction\nTATA box binding-protein (TBP)-a...
998,39323903,11423842,Phase-Amplitude Coupling in Theta and Beta Ban...,Background Phase-amplitude coupling (PAC) betw...,"Zhang Chan, Wang Yanhui, Li Mengjie, Niu Pengp...",,,,Introduction\nObstructive sleep apnea (OSA) a ...


In [None]:
# Persist the first batch of papers without the index column.
# NOTE(review): the /content path is hard-coded and looks Colab-specific; confirm.
df.to_csv("/content/alzheimer_biomarker_1000.csv", index=False)

In [None]:
# Search for papers
query = "Alzheimer's Disease Biomarkers"
id_list = search_pubmed(query, max_results=2000)
chunks = np.array_split(id_list[1000:], 3)
df = None

# Fetch paper details
for chunk in chunks:
    papers = fetch_paper_details(chunk)
    if len(chunk) != 0 and len(papers) == 0:
        print("Retrying scrape")
        time.sleep(3)
        papers = fetch_paper_details(chunk)
    print(f"Starting merge, papers scraped: {len(papers)}")
    df = pd.concat([df, pd.DataFrame(papers)])
    time.sleep(3)
df.reset_index(drop=True, inplace=True)

Initiate paper-ids fetch...
Completed id fetch.
Total IDs:  2000
Start fetching papers. Total request count:  334
Completed request
Total article count:  334


100%|██████████| 334/334 [00:00<00:00, 790.40it/s]


Starting merge, papers scraped: 334
Start fetching papers. Total request count:  333
Completed request
Total article count:  333


100%|██████████| 333/333 [00:00<00:00, 520.43it/s]


Starting merge, papers scraped: 333
Start fetching papers. Total request count:  333
Completed request
Total article count:  333


100%|██████████| 333/333 [00:00<00:00, 490.17it/s]


Starting merge, papers scraped: 333


In [None]:
# Display the second batch of scraped papers.
df

Unnamed: 0,pmid,pmcid,title,abstract,authors,methods_materials,results,conclusion,fulltext
0,39324129,11423800,Fast quantitative MRI: Spiral Acquisition Matc...,\n Conventional diagnostic images from...,\n Perera-Gonzalez \n ...,Supplementary Material\n,,,Introduction\nMagnetic Resonance Imaging (MRI)...
1,38973296,11423104,DCAF7 Acts as A Scaffold to Recruit USP10 for ...,Abstract Despite docetaxel combined with cispl...,"Li Qing‐Jie, Fang Xue‐Liang, Li Ying‐Qin, Lin ...",Supporting information\nSupporting Information,,,"Introduction\nNasopharyngeal carcinoma (NPC), ..."
2,38851294,11422792,Opening the doors of precision medicine: novel...,Abstract Mounting evidence underscores the piv...,"Iacucci Marietta, Santacroce Giovanni, Majumde...",supplementary material\n,,,WHAT IS ALREADY KNOWN ON THIS TOPIC\nThe intes...
3,39328457,11422626,"Evaluation of the cognitive, physiological, an...",Background and Aim: Individuals exposed to hea...,"Mukhi Senna, Manjrekar Poornima Ajay, Srikanti...",Materials and Methods\nThis study was approved...,Results\nThe escape latency and time spent in ...,Conclusion\nThe evaluation revealed the comple...,Introduction\nThe possible health hazards asso...
4,39318060,11422451,Clinical signature and associated immune metab...,Abstract Inflammations have been linked to tum...,"Liao Yong, Yang Pinglian, Yang Cui, Zhuang Kai...",Supporting information\n\nData S1.\n,RESULTS\nNLRP1 expression pattern in different...,CONSENT\nNot applicable.,INTRODUCTION\nAccording to the statistics from...
...,...,...,...,...,...,...,...,...,...
995,39157732,11328858,Nanomedicines Targeting Ferroptosis to Treat S...,"Abstract Ferroptosis, a unique form of regulat...","Kang Hao, Meng Fansu, Liu Fengjie, Xie Mengjie...",,,,Introduction\nCell death is crucial for mainta...
996,39134987,11321064,Amniotic fluid proteomic analysis identifies I...,"Background Despite extensive research, the ide...","Li Min, Liu Mengmei, Chen Ping, Duan Sifan, Li...",Electronic supplementary material\nBelow is th...,Results\nA total of 44 pregnant women were inc...,"Conclusions\nIn conclusion, our study indicate...","Background\nPreterm birth (PTB), defined as de..."
997,39135397,11320689,Aesthetic Radiofrequency Associated with,The objective were to evaluate the effects of ...,"Tremêa Greissi Tatieli Franke, Kleibert Karine...","Methods\nThis was an experimental, longitudina...","Results\nThe study had 32 participants, sociod...",,Introduction\nRadiofrequency devices promote h...
998,39133488,11320167,Anti-Inflammatory Diet and Dementia in Older A...,Key Points Question Can an anti-inflammatory d...,"Dove Abigail, Dunk Michelle M., Wang Jiao, Guo...",,,,"Introduction\nCardiometabolic diseases (CMDs),..."


In [None]:
# Persist the second batch of papers without the index column.
# NOTE(review): the /content path is hard-coded and looks Colab-specific; confirm.
df.to_csv("/content/alzheimer_biomarker_2000.csv", index=False)

In [None]:
# Install the vector-store, embedding and dataset dependencies.
# NOTE(review): only sentence-transformers is version-pinned; the other
# packages resolve to whatever is latest at install time.
!pip install -qU \
  pinecone-client \
  sentence-transformers==2.2.2 \
  langchain \
  openai \
  cohere \
  datasets



[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.7/383.7 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.2/249.2 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m29.6 MB/s[0m eta [3

In [None]:
from pinecone import Pinecone, ServerlessSpec
import cohere
import os
from typing import List, Dict
import time

from datasets import load_dataset

In [None]:
# Read the API key from the environment: no cell in this notebook assigns
# COHERE_API_KEY, so a fresh kernel would otherwise fail with a NameError.
COHERE_API_KEY = os.environ["COHERE_API_KEY"]
co = cohere.Client(COHERE_API_KEY)

In [None]:
# Reload the two scraped batches from the CSV files written earlier.
papers1 = pd.read_csv("/content/alzheimer_biomarker_1000.csv")
papers2 = pd.read_csv("/content/alzheimer_biomarker_2000.csv")

In [None]:
# Read the API key from the environment: no cell in this notebook assigns
# PINECONE_API_KEY, so a fresh kernel would otherwise fail with a NameError.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
# Create the serverless index on first use, then open a handle to it.
index_name = "scientific-papers"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)


In [None]:
# Show the index statistics (dimension, namespaces, vector count).
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
# embed = papers1["fulltext"][:3].tolist()

# meta = [{
#     "id": i,
#     "paper_pmid": paper['pmid'],
#     "paper_pmic": paper['pmcid'],
#     "title": paper['title'],
#     "abstract": paper['abstract'],
#     "authors": paper['authors'],
#     "methods_materials": paper["methods_materials"],
#     "results": paper["results"],
#     "conclusion": paper["conclusion"]
#     } for i,paper in papers1.iloc[:3].iterrows()
# ]

# list(zip(meta, embed))

In [None]:
def process_and_store_papers(papers_df, embedding=None):
    """Upsert one vector per paper into the "scientific-papers" Pinecone index.

    Args:
        papers_df: DataFrame with at least the columns ``pmid``, ``pmcid``,
            ``title``, ``conclusion`` and, when ``embedding`` is None,
            ``fulltext``.
        embedding: Optional precomputed embeddings, one per row of
            ``papers_df`` in the same order. When None, the ``fulltext``
            column is embedded with Cohere in a single call.

    Returns:
        ``(upsert_data, failed_idx)`` when at least one batch failed, where
        ``upsert_data`` is the full list of ``(id, vector, metadata)`` tuples
        and ``failed_idx`` lists the ``(start, end)`` slice bounds (end
        exclusive) of the failed batches. ``([], [])`` when everything
        succeeded.
    """
    ### START OF CODE FROM EXTERNAL SOURCE https://docs.pinecone.io/guides/get-started/quickstart
    index_name = "scientific-papers"
    if index_name not in pc.list_indexes().names():
        pc.create_index(index_name, dimension=1024, metric="cosine",
                        spec=ServerlessSpec(
                                cloud="aws",
                                region="us-east-1"))

    index = pc.Index(index_name)
    print(f"Chosen index: {index_name}")

    # New IDs continue from the number of vectors the index reports.
    # NOTE(review): if that count is stale or does not match the highest
    # existing ID, new IDs may collide with stored ones; confirm.
    vc = index.describe_index_stats()["total_vector_count"]

    # ids = ["0", "1", ...]
    ids = [str(i) for i in range(vc, len(papers_df)+vc)]

    # metadata = [{meta: data}, {meta: data}, ...]
    metadata = [{"paper_pmid": paper['pmid'],
                "paper_pmic": paper['pmcid'],
                "title": paper['title'],
                "conclusion": paper["conclusion"]
                    } for _, paper in papers_df.iterrows()]

    if embedding is None:
        # papers = [paperfulltext, paperfulltext, ...]
        papers = papers_df["fulltext"].tolist()

        embedding = co.embed(
                texts=papers,
                model="embed-english-v3.0",
                input_type="search_document"
            ).embeddings

    upsert_data = list(zip(ids,embedding,metadata))
    print("Upsert data created")

    batch_size = 128
    failed_idx = []
    # Iterate over what will actually be sent; no need to build a numpy array
    # of the embeddings just to read its length.
    total = len(upsert_data)
    for i in tqdm(range(0, total, batch_size)):
        i_end = min(i+batch_size, total)
        try:
            index.upsert(vectors=upsert_data[i:i_end])
        except Exception as exc:
            # Record the failed slice for a later retry, but report the cause
            # and let KeyboardInterrupt/SystemExit propagate.
            print(f"index: [{i}: {i_end}] upserting unsuccessfull]: {exc}")
            failed_idx.append((i,i_end))

    print(index.describe_index_stats())

    if failed_idx:
        return upsert_data, failed_idx

    return [], []

def upsert_failed_indexes(upsert_data, failed_idx):
    """Retry failed batches one vector at a time.

    Args:
        upsert_data: Full list of ``(id, vector, metadata)`` tuples, as
            returned by ``process_and_store_papers``.
        failed_idx: List of ``(start, end)`` slice bounds of failed batches;
            ``end`` is exclusive, matching how the batches were sliced.

    Returns:
        List of positions in ``upsert_data`` that failed again.
    """
    index_name = "scientific-papers"
    if index_name not in pc.list_indexes().names():
        pc.create_index(index_name, dimension=1024, metric="cosine",
                        spec=ServerlessSpec(
                                cloud="aws",
                                region="us-east-1"))

    index = pc.Index(index_name)
    print(f"Chosen index: {index_name}")

    refailed_batch = []

    for start, end in failed_idx:
        # `end` is exclusive, so range(start, end) covers exactly the failed
        # slice; end+1 would run past it (and past the data for the last batch).
        for i in tqdm(range(start, end)):
            try:
                # upsert expects a list of vectors, so wrap the single tuple.
                index.upsert(vectors=[upsert_data[i]])
            except Exception as exc:
                print(f"index: {i} upserting unsuccessfull: {exc}")
                refailed_batch.append(i)

    print(index.describe_index_stats())
    ### END OF CODE FROM EXTERNAL SOURCE https://docs.pinecone.io/guides/get-started/quickstart

    return refailed_batch




def create_cohere_embeddings(papers_df, batch_size=100, delay_seconds=60):
    """Embed the ``fulltext`` column with Cohere, batch by batch.

    Args:
        papers_df: DataFrame with a ``fulltext`` column.
        batch_size: Number of texts sent per ``co.embed`` call.
            NOTE(review): Cohere documents a per-call limit on the number of
            texts; confirm the client handles batches of this size.
        delay_seconds: Pause between consecutive batches.

    Returns:
        List of embedding vectors, one per row, in row order.
    """
    ### START OF CODE FROM EXTERNAL SOURCE https://docs.cohere.com/v2/reference/embed
    # papers = [paperfulltext, paperfulltext, ...]
    papers = papers_df["fulltext"].tolist()

    embedding = []

    for i in tqdm(range(0, len(papers), batch_size)):

        i_end = min(i+batch_size, len(papers))
        # DOCUMENTATION CODE https://docs.cohere.com/v2/reference/embed
        embedding.extend(co.embed(
                texts=papers[i:i_end],
                model="embed-english-v3.0",
                input_type="search_document"
            ).embeddings)
        # END OF DOCUMENTATION CODE

        # Pause only between batches; waiting after the last one is wasted time.
        if i_end < len(papers):
            time.sleep(delay_seconds)
    ### END OF CODE FROM EXTERNAL SOURCE https://docs.cohere.com/v2/reference/embed

    return embedding

In [None]:
# Embed the first batch, skipping rows that have no full text.
embedding = create_cohere_embeddings(papers1.dropna(subset=["fulltext"]))

100%|██████████| 10/10 [10:39<00:00, 63.94s/it]


In [None]:
# Save the first batch of embeddings to disk as a numpy array.
np.save(arr=np.array(embedding), file="dataset1_999.npy")

In [None]:
# Embed the second batch, skipping rows that have no full text.
embedding2 = create_cohere_embeddings(papers2.dropna(subset=["fulltext"]))

100%|██████████| 10/10 [10:44<00:00, 64.46s/it]


In [None]:
# Save the second batch of embeddings to disk as a numpy array.
np.save(arr=np.array(embedding2), file="dataset2_998.npy")

In [None]:
# Upsert the first batch. The same dropna filter as the embedding step keeps
# rows and vectors aligned; astype(str) turns every metadata value into a string.
upd, failed_idx = process_and_store_papers(papers1.dropna(subset=["fulltext"]).astype(str), embedding)

Chosen index: scientific-papers
Upsert data created


100%|██████████| 8/8 [00:09<00:00,  1.18s/it]

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 999}},
 'total_vector_count': 999}





In [None]:
# Upsert the second batch. The same dropna filter as the embedding step keeps
# rows and vectors aligned; astype(str) turns every metadata value into a string.
upd, failed_idx = process_and_store_papers(papers2.dropna(subset=["fulltext"]).astype(str), embedding2)

Chosen index: scientific-papers
Upsert data created


100%|██████████| 8/8 [00:07<00:00,  1.05it/s]

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2023}},
 'total_vector_count': 2023}





In [None]:
# _ = upsert_failed_indexes(upd, failed_idx)

In [None]:
# start, end = failed_idx[0]
# print(f"{start} to {end} were not uploaded")

In [None]:
# Example search phrases for querying the index.
example_kw = [
    "Alzheimer's disease biomarkers",
    "Early detection of Alzheimer's",
    "Blood-based biomarkers for Alzheimer's",
    "Amyloid PET imaging in Alzheimer's",
    "Tau protein as Alzheimer's biomarker",
    "Cerebrospinal fluid biomarkers in Alzheimer's",
    "Genetic risk factors for Alzheimer's",
    "Neurofilament light chain in Alzheimer's",
    "Lipid biomarkers for Alzheimer's",
    "Alzheimer's disease progression markers",
    "Non-invasive Alzheimer's diagnostic tools",
    "Molecular imaging in Alzheimer's diagnosis",
    "Cognitive decline markers in Alzheimer's",
    "Predictive biomarkers for Alzheimer's",
    "Alzheimer's disease neuroimaging initiative",
    "Proteomics in Alzheimer's research",
    "Fluid biomarkers for Alzheimer's",
    "Eye and vision changes in Alzheimer's",
    "Sleep disturbances as Alzheimer's biomarkers",
    "Standardization of Alzheimer's biomarkers",
]

In [None]:
def retrieve_similar_articles(keyword: str, k: int = 5):
    """Return the ``k`` indexed papers most similar to ``keyword``.

    Args:
        keyword: Search phrase; embedded with Cohere as a search query.
        k: Number of matches to request from the index.

    Returns:
        List of dicts with keys ``paper_id``, ``pmcid``, ``pmid``, ``title``
        and ``score``, in the order the index returned the matches.
    """
    index_name = "scientific-papers"
    index = pc.Index(index_name)
    print(f"Opened index {index_name}")

    # DOCUMENTATION CODE https://docs.cohere.com/v2/reference/embed
    # embed() returns one vector per input text; a single text was sent, so
    # take the first vector rather than passing the nested list to the query.
    query_embedding = co.embed(
        texts=[keyword],
        model="embed-english-v3.0",
        input_type="search_query"
    ).embeddings[0]
    # END OF DOCUMENTATION CODE https://docs.cohere.com/v2/reference/embed

    ### DOCUMENTATION CODE https://docs.pinecone.io/guides/get-started/quickstart
    results = index.query(vector=query_embedding, top_k=k, include_metadata=True)
    ### END OF DOCUMENTATION CODE

    similar_articles = []
    for match in results['matches']:
        similar_articles.append({
            "paper_id": match['id'],
            # "paper_pmic" (sic) is the key under which the PMC ID was stored.
            "pmcid": match["metadata"]["paper_pmic"],
            "pmid": match["metadata"]["paper_pmid"],
            "title": match['metadata']['title'],
            "score": match['score']
        })

    return similar_articles


In [None]:
# Demo: retrieve and print the five closest papers for one search phrase.
keyword = "Alzheimer's disease biomarkers"
similar_articles = retrieve_similar_articles(keyword, k=5)
for article in similar_articles:
    print(f"Paper ID: {article['paper_id']}")
    print(f"Title: {article['title']}")
    print(f"Similarity Score: {article['score']}")
    print("---")

Opened index scientific-papers
Paper ID: 234
Title: From Organotypic Mouse Brain Slices to Human Alzheimer Plasma Biomarkers: A Focus on Microglia
Similarity Score: 0.738853335
---
Paper ID: 958
Title: Peptide-Bound Glycative, AGE and Oxidative Modifications as Biomarkers for the Diagnosis of Alzheimer’s Disease—A Feasibility Study
Similarity Score: 0.725872576
---
Paper ID: 391
Title: Functional Nanomaterials for the Diagnosis of Alzheimer’s Disease: Recent Progress and Future Perspectives
Similarity Score: 0.716538608
---
Paper ID: 977
Title: Blood-based biomarkers in the oldest old: towards Alzheimer's disease detection in primary care
Similarity Score: 0.713781536
---
Paper ID: 1756
Title: Performance of plasma p-tau217 for the detection of amyloid-β positivity in a memory clinic cohort using an electrochemiluminescence immunoassay
Similarity Score: 0.706662
---


In [None]:
# Demo: the same retrieval for a second search phrase.
keyword = "Genetic risk factors for Alzheimer's"
similar_articles = retrieve_similar_articles(keyword, k=5)
for article in similar_articles:
    print(f"Paper ID: {article['paper_id']}")
    print(f"Title: {article['title']}")
    print(f"Similarity Score: {article['score']}")
    print("---")

Opened index scientific-papers
Paper ID: 56
Title: Exploring first-degree family history in a cohort of Portuguese Alzheimer’s disease patients: population evidence for X-chromosome linked and recessive inheritance of risk factors
Similarity Score: 0.724336207
---
Paper ID: 556
Title: Body Composition and Alzheimer’s Disease: A Holistic Review
Similarity Score: 0.719978154
---
Paper ID: 40
Title: Connecting dementia risk loci to the CSF proteome identifies pathophysiological leads for dementia
Similarity Score: 0.708768308
---
Paper ID: 1562
Title: Impact of Apolipoprotein E ε4 in Alzheimer’s Disease: Insights From a Meta-Analysis
Similarity Score: 0.706729531
---
Paper ID: 127
Title: Coupling of Alzheimer’s Disease Genetic Risk Factors with Viral Susceptibility and Inflammation
Similarity Score: 0.698289931
---
