Retrieve the articles from PubMed

In [None]:
from Bio import Entrez

# Contact address sent along with the E-utilities requests; use your own.
Entrez.email = "email@example.com"

# Specify the search term and the database to search
print("Enter disease name:")
term = input()

# Search PubMed for the term (at most 10 hits). The handle is closed even if
# parsing the response fails.
handle = Entrez.esearch(db="pubmed", term=term, retmax=10)
try:
    search_results = Entrez.read(handle)
finally:
    handle.close()

# Get the IDs of the articles
article_ids = search_results["IdList"]

if not article_ids:
    # Nothing matched: do not send an efetch request with an empty id list.
    print(f"No articles found for {term!r}")
else:
    # Fetch the records in MEDLINE format. MEDLINE is a flat, tagged text
    # layout ("PMID- ...", "TI  - ..."), so it is requested with
    # retmode="text"; the entity-extraction cell relies on that layout when it
    # splits the saved file on "PMID- ".
    handle = Entrez.efetch(
        db="pubmed", id=article_ids, rettype="medline", retmode="text"
    )
    try:
        articles = handle.read()
    finally:
        handle.close()

    # Print the articles
    print(articles)

    # Save the articles to a text file
    with open("articles.txt", "w", encoding="utf-8") as file:
        file.write(articles)
    print("Articles saved to articles.txt")

In [None]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

Named Entity Recognition and Entity Extraction using Custom Patterns with spaCy

In [None]:
import spacy

# Load the English model
nlp = spacy.load('en_core_web_sm')

# Read the file with custom entity names (one name per line).
# The encoding is given explicitly, like the other files opened in this
# notebook, so the result does not depend on the platform default.
# Surrounding whitespace is stripped and blank lines are dropped so they do
# not end up as empty or whitespace-padded patterns.
with open('approved_name.txt', 'r', encoding='utf-8') as f:
    custom_entities = [name for name in (line.strip() for line in f) if name]

# Create an entity ruler and place it before the statistical NER component,
# then register every custom name as a "DISORDER" phrase pattern.
entity_matcher = nlp.add_pipe("entity_ruler", before="ner")
patterns = [{"label": "DISORDER", "pattern": ent} for ent in custom_entities]
entity_matcher.add_patterns(patterns)

# Define the function to extract entities
def get_entities(text, pmid):
    """Return ``(entity_text, pmid)`` tuples for the DISORDER entities in *text*.

    The text is processed with the module-level ``nlp`` pipeline. Every
    entity whose label is "DISORDER" is paired with the given *pmid*; the
    tuples appear in the order ``doc.ents`` yields the entities.
    """
    parsed = nlp(text)
    return [
        (span.text, pmid)
        for span in parsed.ents
        if span.label_ == "DISORDER"
    ]

# Load the text from a file
with open('raw_articles/articles_pervasive developmental disorder.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Split the whole text into records on the "PMID- " delimiter before doing
# anything else. Cutting the text into fixed-size character chunks first would
# slice through whichever record straddles a chunk boundary; the tail of that
# record would then be processed as its own "article" and attributed to a
# bogus PMID (its first line). Each record is passed to the pipeline on its
# own, so no additional chunking is done here.
all_entities = []
for article in text.split("PMID- "):
    # Skip empty or whitespace-only pieces (e.g. a blank line before the
    # first record)
    if not article.strip():
        continue

    # The PMID is the rest of the delimiter line, i.e. everything up to the
    # first newline of the record
    pmid = article.partition("\n")[0].strip()

    # Extract entities from the article and add them to the list
    all_entities.extend(get_entities(article, pmid))

# Write the found entities to a text file, one "term<TAB>pmid" pair per line.
# The output uses the same encoding as the input so non-ASCII entity text can
# be written regardless of the platform default.
with open('get_entities_results_names/articles_pervasive developmental disorder1.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{term}\t{pmid}\n" for term, pmid in all_entities)