In [13]:
from Bio import Entrez
import pandas as pd
from Bio import Medline
import time
from urllib.error import HTTPError, URLError
import socket
from http.client import IncompleteRead
import requests

patients_df = pd.read_csv("data/PMC-Patients.csv")
unique_pmids = patients_df["PMID"].unique()

Entrez.email = "matteo.lotti3@edu.unifi.it"
pmid_list = unique_pmids.tolist()

batch_size = 500  # Ridotto da 1000 a 500
all_records = []
failed_batches = []

print(f"Fetching data for {len(pmid_list)} PMIDs from PubMed...")

for start in range(0, len(pmid_list), batch_size):
    end = min(start + batch_size, len(pmid_list))
    batch_pmids = pmid_list[start:end]
    print(f"Fetching PMIDs {start + 1} to {end}...")
    
    max_retries = 3
    for attempt in range(max_retries):
        try:
            # Aggiungi timeout
            handle = Entrez.efetch(
                db="pubmed", 
                id=",".join(map(str, batch_pmids)), 
                rettype="medline", 
                retmode="text",
                retmax=batch_size
            )
            
            records = list(Medline.parse(handle))
            all_records.extend(records)
            print(f"✓ Batch {start//batch_size + 1} successful: {len(records)} records")
            
            # Pausa tra le richieste
            time.sleep(1)
            break  # Esci dal loop di retry se ha successo
            
        except (HTTPError, URLError, socket.timeout, IncompleteRead) as e:
            print(f"✗ Attempt {attempt + 1} failed for batch {start//batch_size + 1}: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {2 ** attempt} seconds...")
                time.sleep(2 ** attempt)  # Backoff esponenziale
            else:
                print(f"❌ Batch {start//batch_size + 1} failed after {max_retries} attempts")
                failed_batches.append((start, end))
                continue

print(f"\n=== SUMMARY ===")
print(f"Fetched {len(all_records)} records from PubMed.")
print(f"Failed batches: {len(failed_batches)}")
print(f"Success rate: {len(all_records)/len(pmid_list)*100:.1f}%")

# Crea DataFrame SOLO se abbiamo record
if all_records:
    df = pd.DataFrame([
        {
            "PMID": rec.get("PMID", ""),
            "Title": rec.get("TI", ""),
            "Authors": "; ".join(rec.get("AU", [])),
            "Journal": rec.get("JT", ""),
            "Date": rec.get("DEP", ""),  # Data di pubblicazione elettronica
            "Abstract": rec.get("AB", ""),
            "DOI": rec.get("LID", "")
        }
        for rec in all_records
    ])
    
    # Gestione date più robusta
    def parse_date(date_str):
        if not date_str or date_str == "":
            return None
        try:
            # Prova diversi formati di data
            if len(date_str) == 8 and date_str.isdigit():
                return date_str
            else:
                return None
        except:
            return None
    
    df["Date"] = df["Date"].apply(parse_date)
    
    # Separa in colonne solo per date valide
    valid_dates = df["Date"].notna()
    df.loc[valid_dates, "Year"] = df.loc[valid_dates, "Date"].str[:4]
    df.loc[valid_dates, "Month"] = df.loc[valid_dates, "Date"].str[4:6]
    df.loc[valid_dates, "Day"] = df.loc[valid_dates, "Date"].str[6:8]
    
    print(f"\nRecords with valid dates: {valid_dates.sum()}")
    print(f"Records without dates: {(~valid_dates).sum()}")
    
    # Salva
    df.to_csv("data/pubmed_data.csv", index=False)
    print("Data saved to data/pubmed_data.csv")

    # Mostra anteprima
    print("\nFirst 10 rows:")
    print(df.head(10))
else:
    print("❌ No records were fetched!")



Fetching data for 140897 PMIDs from PubMed...
Fetching PMIDs 1 to 500...
✓ Batch 1 successful: 500 records
Fetching PMIDs 501 to 1000...
✓ Batch 2 successful: 500 records
Fetching PMIDs 1001 to 1500...
✓ Batch 3 successful: 500 records
Fetching PMIDs 1501 to 2000...
✓ Batch 4 successful: 499 records
Fetching PMIDs 2001 to 2500...
✓ Batch 5 successful: 500 records
Fetching PMIDs 2501 to 3000...
✓ Batch 6 successful: 499 records
Fetching PMIDs 3001 to 3500...
✓ Batch 7 successful: 500 records
Fetching PMIDs 3501 to 4000...
✓ Batch 8 successful: 500 records
Fetching PMIDs 4001 to 4500...
✓ Batch 9 successful: 500 records
Fetching PMIDs 4501 to 5000...
✓ Batch 10 successful: 500 records
Fetching PMIDs 5001 to 5500...
✓ Batch 11 successful: 500 records
Fetching PMIDs 5501 to 6000...
✓ Batch 12 successful: 500 records
Fetching PMIDs 6001 to 6500...
✓ Batch 13 successful: 499 records
Fetching PMIDs 6501 to 7000...
✓ Batch 14 successful: 500 records
Fetching PMIDs 7001 to 7500...
✓ Batch 15 su

In [14]:
# make sure PMIDs in df are numeric like patients_df
df['PMID'] = pd.to_numeric(df['PMID'], errors='coerce').dropna().astype(int)

print("patients_df PMID dtype:", patients_df['PMID'].dtype)
print("df PMID dtype:", df['PMID'].dtype)
print("records fetched into df:", len(df))

# count missing
missing_pmids = patients_df[~patients_df['PMID'].isin(df['PMID'])]
print(f"Number of missing PMIDs: {len(missing_pmids)}")
print(missing_pmids.head())

patients_df PMID dtype: int64
df PMID dtype: int32
records fetched into df: 140811
Number of missing PMIDs: 90
      patient_id patient_uid      PMID                         file_path  \
2207        2207   6110280-1  30159206  comm/PMC006xxxxxx/PMC6110280.xml   
2208        2208   6110280-2  30159206  comm/PMC006xxxxxx/PMC6110280.xml   
2209        2209   6110280-3  30159206  comm/PMC006xxxxxx/PMC6110280.xml   
3219        3219   6173321-1  30294454  comm/PMC006xxxxxx/PMC6173321.xml   
7145        7145   6424340-1  30899779  comm/PMC006xxxxxx/PMC6424340.xml   

                                                  title  \
2207  Tacrolimus-Induced Remission in Drug Resistant...   
2208  Tacrolimus-Induced Remission in Drug Resistant...   
2209  Tacrolimus-Induced Remission in Drug Resistant...   
3219  Malignant Pleural Mesothelioma presenting with...   
7145  A Rare Case of HIV-Induced Inflammatory Demyel...   

                                                patient               age  \
