In [1]:
# Import libraries
import nltk
import spacy
import string # For punctuation removal later

# Load spaCy's small English pipeline (it must already be installed).
# The loaded pipeline object is bound to `nlp`, the conventional spaCy name.
nlp = spacy.load("en_core_web_sm")

# Sample sentence from our domain; every preprocessing step below works on it
text = "This 2023 study analyzes the outcomes of minimally invasive total hip arthroplasty in elderly patients."

# Show the untouched input: header on one line, the sentence on the next
print("Original Text:", text, sep="\n")


Original Text:
This 2023 study analyzes the outcomes of minimally invasive total hip arthroplasty in elderly patients.


In [3]:
# -----Tokenization Using NLTK-----

# Split the sentence with NLTK's recommended word tokenizer
nltk_tokens = nltk.word_tokenize(text)

# Header and token list on consecutive lines, followed by a separator rule
print("NLTK Tokens:", nltk_tokens, sep="\n")
print("-" * 20)

# -----Tokenization Using spaCy-----

# Run the sentence through the loaded spaCy pipeline ('nlp').
# The resulting 'doc' holds the tokens together with their annotations.
doc = nlp(text)

# Keep only the surface text of every token in the doc
spacy_tokens = [tok.text for tok in doc]

print("spaCy Tokens:", spacy_tokens, sep="\n")

NLTK Tokens:
['This', '2023', 'study', 'analyzes', 'the', 'outcomes', 'of', 'minimally', 'invasive', 'total', 'hip', 'arthroplasty', 'in', 'elderly', 'patients', '.']
--------------------
spaCy Tokens:
['This', '2023', 'study', 'analyzes', 'the', 'outcomes', 'of', 'minimally', 'invasive', 'total', 'hip', 'arthroplasty', 'in', 'elderly', 'patients', '.']


In [5]:
# -----Lowercasing-----

# Lowercase every spaCy token string.
# (Later steps keep building on the spaCy tokens, since the spaCy doc
# exposes additional linguistic features used further down.)
lowercase_tokens = list(map(str.lower, spacy_tokens))

print("Lowercase Tokens:", lowercase_tokens, sep="\n")
print("-" * 20)

# -----Stopword Removal-----

# English stopword list shipped with NLTK, held in a set so that each
# membership test is a hash lookup
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# To peek at a few stopwords, uncomment:
# print("Sample Stopwords:", list(stop_words)[:10])

# Keep only the lowercase tokens that are not stopwords
tokens_without_stopwords = list(
    filter(lambda tok: tok not in stop_words, lowercase_tokens)
)

print("Tokens after Stopword Removal:", tokens_without_stopwords, sep="\n")

Lowercase Tokens:
['this', '2023', 'study', 'analyzes', 'the', 'outcomes', 'of', 'minimally', 'invasive', 'total', 'hip', 'arthroplasty', 'in', 'elderly', 'patients', '.']
--------------------
Tokens after Stopword Removal:
['2023', 'study', 'analyzes', 'outcomes', 'minimally', 'invasive', 'total', 'hip', 'arthroplasty', 'elderly', 'patients', '.']


In [6]:
# --- Lemmatization (using spaCy's pre-processed doc) ---

# Reuses the 'doc' object produced earlier by `doc = nlp(text)`;
# each of its tokens carries a lemma alongside other annotations.

# A token's lemma is kept only when the token:
# - consists of alphabetic characters (is_alpha), which drops numbers like '2023'
# - is neither a stopword (is_stop) nor punctuation (is_punct)
lemmatized_tokens = [
    tok.lemma_
    for tok in doc
    if tok.is_alpha and not (tok.is_stop or tok.is_punct)
]

print("Lemmatized Tokens (Meaningful Words):", lemmatized_tokens, sep="\n")

# For comparison, the list after stopword removal alone was:
# ['2023', 'study', 'analyzes', 'outcomes', 'minimally', 'invasive', 'total', 'hip', 'arthroplasty', 'elderly', 'patients', '.']

Lemmatized Tokens (Meaningful Words):
['study', 'analyze', 'outcome', 'minimally', 'invasive', 'total', 'hip', 'arthroplasty', 'elderly', 'patient']


In [9]:
# Import the Entrez module from Biopython
from Bio import Entrez

# --- Configuration ---
my_email = "Your.Name@example.com" # Replace with your actual email
search_query = "total knee arthroplasty" # Easily change search term here
max_results_search = "10" # Easily change number of results here
database = "pubmed"

# --- Set email for NCBI ---
Entrez.email = my_email

# --- Perform the search (esearch) ---
print(f"Searching {database} for '{search_query}' (max {max_results_search} results)...")
handle = Entrez.esearch(db=database, term=search_query, retmax=max_results_search)
try:
    # Parse the search response into a dict-like result
    search_results = Entrez.read(handle)
finally:
    # Always release the handle, even when parsing raises; this mirrors the
    # try/finally used around the efetch call later in the notebook
    handle.close()

# Extract the list of PubMed IDs (PMIDs)
pmid_list = search_results["IdList"]

print(f"Found {len(pmid_list)} PMIDs:")
print(pmid_list)

Searching pubmed for 'total knee arthroplasty' (max 10 results)...
Found 10 PMIDs:
['40156068', '40155982', '40155353', '40154583', '40153784', '40153477', '40153059', '40151962', '40151740', '40151712']


In [10]:
# --- Fetch details for the retrieved PMIDs (efetch) ---
#
# Inputs taken from earlier cells: `pmid_list`, `database`, `Entrez`.
# Result: `papers_data`, a list with one dict per successfully processed
# article, holding the keys 'pmid', 'title', 'abstract', 'journal', 'year'.
# Missing fields are filled with the string 'N/A'.

import re

# Matches a standalone 4-digit year starting with 19 or 20; used to pull a
# year out of MedlineDate strings (e.g., "2023 Jan-Feb"). Compiled once
# here instead of on every loop iteration.
year_pattern = re.compile(r'\b(19|20)\d{2}\b')

# List to store the data for all papers
papers_data = []

if pmid_list:
    ids_to_fetch = ",".join(pmid_list)
    print(f"\nFetching details for {len(pmid_list)} PMIDs...")

    handle = Entrez.efetch(db=database, id=ids_to_fetch, rettype="abstract", retmode="xml")
    try:
        # Parse the XML data
        papers = Entrez.read(handle)
        # The actual list of articles is usually here:
        articles = papers.get('PubmedArticle', []) # Use .get for safety
    except Exception as e: # Catch broader exceptions during parsing
        print(f"Error parsing XML or reading handle: {e}")
        articles = [] # Ensure articles is an empty list on error
    finally:
        handle.close()

    print(f"Successfully parsed {len(articles)} articles.")

    # --- Loop through each article and extract data robustly ---
    for article_xml in articles:
        paper_info = {} # Dictionary to store info for this specific paper
        # Reset for every article: the except-handler below reads this name,
        # so it must never be unbound (first iteration) or still point at the
        # previous article's citation when the failure happens early.
        medline_citation = {}
        try:
            # Use .get() chains to safely access nested dictionary keys
            medline_citation = article_xml.get('MedlineCitation', {})
            article_details = medline_citation.get('Article', {})
            journal_info = article_details.get('Journal', {})
            journal_issue = journal_info.get('JournalIssue', {})
            pub_date = journal_issue.get('PubDate', {})

            # Extract data using .get() with default values ('N/A')
            paper_info['pmid'] = medline_citation.get('PMID', 'N/A')
            paper_info['title'] = article_details.get('ArticleTitle', 'N/A')

            # Abstract extraction needs care - might be list or dict
            abstract_section = article_details.get('Abstract', {})
            abstract_text_list = abstract_section.get('AbstractText', [])
            if isinstance(abstract_text_list, list) and len(abstract_text_list) > 0:
                # Handle cases where abstract is split into sections
                paper_info['abstract'] = "\n".join(map(str, abstract_text_list))
            elif isinstance(abstract_text_list, str): # Sometimes it's just a string
                paper_info['abstract'] = abstract_text_list
            else:
                paper_info['abstract'] = 'N/A' # Default if no abstract found

            paper_info['journal'] = journal_info.get('Title', 'N/A')

            # Handle year extraction complexity: prefer the explicit 'Year'
            # field, otherwise fall back to a year found inside 'MedlineDate'
            year = pub_date.get('Year')
            if not year:
                medline_date = pub_date.get('MedlineDate', 'N/A') # Fallback
                match = year_pattern.search(medline_date)
                year = match.group(0) if match else 'N/A'
            paper_info['year'] = year

            # Add other fields if needed (e.g., Authors)

            papers_data.append(paper_info) # Add the dictionary to our list

        except Exception as e:
            # Log error for a specific paper but continue the loop
            pmid_for_error = medline_citation.get('PMID', 'UNKNOWN_PMID')
            print(f"Error processing PMID {pmid_for_error}: {e}")
            # Optionally append partial data or skip

    print(f"\nSuccessfully extracted data for {len(papers_data)} papers.")

else:
    print("PMID list is empty, skipping fetch.")


# --- Display Data for the First Few Papers Extracted ---
if not papers_data:
    # Nothing was collected by the extraction step
    print("No data was extracted.")
else:
    import json # json.dumps gives an indented, readable dump of each dict

    print("\n--- Sample Extracted Data (First 2 Papers) ---")
    # Walk over at most the first two extracted records
    sample_papers = papers_data[:2]
    for i, paper in enumerate(sample_papers):
        print(f"\nPaper {i+1}:")
        pretty_paper = json.dumps(paper, indent=2)
        print(pretty_paper)


Fetching details for 10 PMIDs...
Successfully parsed 10 articles.

Successfully extracted data for 10 papers.

--- Sample Extracted Data (First 2 Papers) ---

Paper 1:
{
  "pmid": "40156068",
  "title": "The dual role of titanium particles in osteolysis: implications for gene therapy in prosthesis loosening.",
  "abstract": "Aseptic prosthesis loosening caused by wear particles is a major complication in patients with osteoarthritis following total joint replacement. Despite advancements in treatment, the underlying mechanisms remain poorly understood, and effective therapies are still lacking.\nIn this study, we investigated the effects of titanium particles on osteoclast and osteoblast differentiation through both in vitro and in vivo experiments.\nOur findings revealed that titanium particles not only promote the differentiation of RAW264.7 cells into osteoclasts and enhance the secretion of inflammatory factors but also inhibit the differentiation of BMSCs into osteoblasts and red

In [12]:
# pandas provides the tabular DataFrame structure used from here on
import pandas as pd

# Build a DataFrame straight from 'papers_data' (a list of dictionaries);
# each dictionary becomes one row, its keys become the columns
papers_df = pd.DataFrame(papers_data)

# Quick sanity check: show the first 5 rows
df_preview = papers_df.head(5)
print("\n--- DataFrame Head (First 5 Rows) ---")
print(df_preview)

# Summary of columns, data types and non-null counts.
# DataFrame.info() prints its report itself, so no print() wrapper is needed.
print("\n--- DataFrame Info ---")
papers_df.info()


--- DataFrame Head (First 5 Rows) ---
       pmid                                              title  \
0  40156068  The dual role of titanium particles in osteoly...   
1  40155982  The comparison of three dimensional and two di...   
2  40155353  The Role of Health Psychology in Surgical Preh...   
3  40154583  Comparing Functional Recovery Between Total an...   
4  40153784  Convergent and Known-Groups Validity and Sensi...   

                                            abstract  \
0  Aseptic prosthesis loosening caused by wear pa...   
1  The purpose of this study was to compare three...   
2  Approximately 10%-34% of people experience chr...   
3  Both total knee arthroplasty (TKA) and unicomp...   
4  Subsequent to the COVID-19 pandemic in 2020, a...   

                                       journal  year  
0         European journal of medical research  2025  
1  Journal of orthopaedic surgery and research  2025  
2                         Musculoskeletal care  2025  
3      

In [None]:
# Name of the CSV file that will receive the extracted papers
csv_filename = "pubmed_sample_tka_10.csv"

# Write the DataFrame to disk.
# index=False keeps the DataFrame index out of the file, so the CSV holds
# only the data columns.
papers_df.to_csv(path_or_buf=csv_filename, index=False)

# Confirm how many rows were written and where
print(f"DataFrame with {len(papers_df)} papers saved to: {csv_filename}")