In [19]:
# this only kinda works. it's a mess. i'm not sure if it's worth fixing.

import os
import requests
import time
import csv
from Bio import Entrez
import re
from docx import Document
from fuzzywuzzy import fuzz
import PyPDF2
import shutil
import json

def sanitize_filename(filename):
    # Remove invalid characters and truncate to a reasonable length
    invalid_chars = r'[<>:"/\\|?*]'
    sanitized = re.sub(invalid_chars, '', filename)
    return sanitized[:200]

def extract_references_from_docx(file_path):
    doc = Document(file_path)
    full_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
    references = re.findall(r'\d+\.\s+(.*?)(?=\d+\.\s+|\Z)', full_text, re.DOTALL)
    return [ref.strip() for ref in references]

def extract_info(reference):
    # Improved regex patterns
    title_pattern = r'(?:^|\.\s+)(.+?)\.\s+(?:[A-Za-z\s]+\.)?(?:\s*\(.*?\))?\s*\d'
    authors_pattern = r'^((?:(?:[A-Z][a-z]+(?:,?\s+(?:[A-Z]\.|\([A-Z-]+\))){0,2}(?:,\s+|\s+&\s+))*(?:[A-Z][a-z]+(?:,?\s+(?:[A-Z]\.|\([A-Z-]+\))){0,2})|(?:[A-Z][a-z]+ et al\.)|(?:NCD Risk Factor Collaboration \([^)]+\))))'
    year_pattern = r'\((\d{4})\)'
    doi_pattern = r'doi:?\s*(10\.\d{4,}\/\S+)'
    
    title_match = re.search(title_pattern, reference)
    authors_match = re.match(authors_pattern, reference)
    year_match = re.search(year_pattern, reference)
    doi_match = re.search(doi_pattern, reference)
    
    title = title_match.group(1).strip() if title_match else ''
    authors = authors_match.group(1) if authors_match else ''
    year = year_match.group(1) if year_match else ''
    doi = doi_match.group(1) if doi_match else ''
    
    return {'title': title, 'authors': authors, 'year': year, 'doi': doi}

def clean_string(s):
    return re.sub(r'[^\w\s]', '', s)

def search_pubmed(query, max_results=20):
    Entrez.email = "trentleslie@gmail.com"
    try:
        handle = Entrez.esearch(db='pubmed', 
                                sort='relevance', 
                                retmax=max_results,
                                retmode='xml', 
                                term=query)
        results = Entrez.read(handle)
        handle.close()
        return results
    except Exception as e:
        print(f"Error in search_pubmed: {e}")
        return None

def fetch_article_details(pubmed_id):
    try:
        handle = Entrez.efetch(db='pubmed', id=pubmed_id, rettype='xml', retmode='xml')
        records = Entrez.read(handle)
        handle.close()
        if records['PubmedArticle']:
            return records['PubmedArticle'][0]
        else:
            print(f"No article details found for PubMed ID: {pubmed_id}")
            return None
    except Exception as e:
        print(f"Error in fetch_article_details for PubMed ID {pubmed_id}: {e}")
        return None

def get_pmc_id(article):
    try:
        for id_obj in article['PubmedData']['ArticleIdList']:
            if id_obj.attributes['IdType'] == 'pmc':
                return id_obj
    except KeyError:
        pass
    return None

def download_pdf(pmc_id, output_folder):
    url = f'https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/pdf'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            pdf_filename = os.path.join(output_folder, f'{pmc_id}.pdf')
            with open(pdf_filename, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded: {pdf_filename}")
            return pdf_filename
        else:
            print(f"Failed to download PDF for PMC ID: {pmc_id}, Status Code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error in download_pdf: {e}")
        return None

def verify_pdf(pdf_path, reference):
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages[:5]:  # Check first 5 pages
                text += page.extract_text()
            
            ref_info = extract_info(reference)
            title_match = fuzz.partial_ratio(ref_info['title'].lower(), text.lower()) > 70
            authors_match = any(author.lower() in text.lower() for author in ref_info['authors'].split(','))
            year_match = ref_info['year'] in text
            
            return title_match and (authors_match or year_match)
    except Exception as e:
        print(f"Error verifying PDF {pdf_path}: {e}")
        return False

def search_and_download(ref_info, reference, output_folder, rejected_folder, downloaded_pmcs):
    search_strategies = [
        lambda: f'{ref_info["doi"]}' if ref_info["doi"] else None,
        lambda: f'"{ref_info["title"]}"[Title]' if ref_info["title"] else None,
        lambda: f'{ref_info["authors"]}[Author] AND {ref_info["year"]}[Date - Publication] AND "{ref_info["title"]}"[Title]' if ref_info["authors"] and ref_info["year"] and ref_info["title"] else None,
        lambda: f'{ref_info["authors"]}[Author] AND {ref_info["year"]}[Date - Publication]' if ref_info["authors"] and ref_info["year"] else None,
        lambda: f'{" ".join(ref_info["title"].split()[:5])}[Title] AND {ref_info["year"]}[Date - Publication]' if ref_info["title"] and ref_info["year"] else None,
        lambda: f'{ref_info["authors"].split(",")[0]}[Author] AND {ref_info["year"]}[Date - Publication]' if ref_info["authors"] and ref_info["year"] else None,
        lambda: f'{ref_info["title"]}' if ref_info["title"] else None,
        lambda: f'{ref_info["authors"]}' if ref_info["authors"] else None,
    ]

    print(f"\nProcessing reference: {reference}")
    print(f"Extracted info: {json.dumps(ref_info, indent=2)}")

    for i, strategy in enumerate(search_strategies):
        query = strategy()
        if not query:
            print(f"Strategy {i+1}: Skipped (insufficient information)")
            continue
        
        print(f"\nStrategy {i+1}: {query}")
        search_results = search_pubmed(query)
        
        if search_results and int(search_results['Count']) > 0:
            print(f"Search results: {search_results['Count']} articles found")
            for j, pubmed_id in enumerate(search_results['IdList']):
                print(f"\n  Result {j+1}: PubMed ID {pubmed_id}")
                article = fetch_article_details(pubmed_id)
                if article:
                    article_title = article['MedlineCitation']['Article']['ArticleTitle']
                    print(f"  Article title: {article_title}")
                    pmc_id = get_pmc_id(article)
                    if pmc_id:
                        print(f"  PMC ID: {pmc_id}")
                        if pmc_id in downloaded_pmcs:
                            print(f"  PDF already downloaded for PMC ID: {pmc_id}")
                            continue
                        pdf_filename = download_pdf(pmc_id, output_folder)
                        if pdf_filename:
                            print(f"  Downloaded PDF: {pdf_filename}")
                            downloaded_pmcs.add(pmc_id)
                            if verify_pdf(pdf_filename, reference):
                                new_filename = os.path.join(output_folder, f"{sanitize_filename(reference)}.pdf")
                                os.rename(pdf_filename, new_filename)
                                print(f"  Verified and renamed: {pdf_filename} -> {new_filename}")
                                return new_filename
                            else:
                                rejected_filename = os.path.join(rejected_folder, f"{sanitize_filename(reference)}.pdf")
                                shutil.move(pdf_filename, rejected_filename)
                                print(f"  Verification failed. Moved to: {rejected_filename}")
                    else:
                        print("  No PMC ID available")
                else:
                    print("  Failed to fetch article details")
        else:
            print("No results found")
        
        time.sleep(0.5)  # Sleep to respect API rate limits
    
    print("All strategies exhausted. No matching PDF found.")
    return None


def main():
    docx_file_path = '/home/trent/github/QuickTools/57ISB15US - REFERENCES (1).docx'  # Replace with the path to your DOCX file
    output_folder = './pdf_downloads'  # Folder where verified PDFs will be saved
    rejected_folder = './rejected_pdfs'  # Folder for rejected PDFs
    csv_output_file = 'download_results.csv'  # CSV file to store results
    
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(rejected_folder, exist_ok=True)
    
    references = extract_references_from_docx(docx_file_path)
    
    results = []
    downloaded_pmcs = set()  # Set to keep track of downloaded PMC IDs
    
    for ref in references:
        print(f"Processing: {ref}")
        ref_info = extract_info(ref)
        
        pdf_filename = search_and_download(ref_info, ref, output_folder, rejected_folder, downloaded_pmcs)
        
        if pdf_filename:
            results.append({'Reference': ref, 'Downloaded': 'Yes', 'Reason': 'PDF downloaded, verified, and renamed', 'Filename': os.path.basename(pdf_filename)})
        else:
            results.append({'Reference': ref, 'Downloaded': 'No', 'Reason': 'Failed to find, download, or verify PDF', 'Filename': ''})
    
    # Write results to CSV
    with open(csv_output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Reference', 'Downloaded', 'Reason', 'Filename']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in results:
            writer.writerow(row)
    
    print(f"Results saved to {csv_output_file}")

if __name__ == '__main__':
    main()

Processing: NCD Risk Factor Collaboration (NCD-RisC). Trends in adult body-mass index in 200 countries from 1975 to 2014: a pooled analysis of 1698 population-based measurement studies with 19·2 million participants. Lancet (London, England) 387, 1377–1396 (2016).

Processing reference: NCD Risk Factor Collaboration (NCD-RisC). Trends in adult body-mass index in 200 countries from 1975 to 2014: a pooled analysis of 1698 population-based measurement studies with 19·2 million participants. Lancet (London, England) 387, 1377–1396 (2016).
Extracted info: {
  "title": "",
  "authors": "NCD Risk Factor Collaboration (NCD-RisC)",
  "year": "2016",
  "doi": ""
}
Strategy 1: Skipped (insufficient information)
Strategy 2: Skipped (insufficient information)
Strategy 3: Skipped (insufficient information)

Strategy 4: NCD Risk Factor Collaboration (NCD-RisC)[Author] AND 2016[Date - Publication]
No results found
Strategy 5: Skipped (insufficient information)

Strategy 6: NCD Risk Factor Collaboratio

Multiple definitions in dictionary at byte 0x2c880 for key /MediaBox
Multiple definitions in dictionary at byte 0x2ccf9 for key /MediaBox
Multiple definitions in dictionary at byte 0x2d072 for key /MediaBox
Multiple definitions in dictionary at byte 0x2d3db for key /MediaBox
Multiple definitions in dictionary at byte 0x2d834 for key /MediaBox
Multiple definitions in dictionary at byte 0x2dc4d for key /MediaBox
Multiple definitions in dictionary at byte 0x2de66 for key /MediaBox
Multiple definitions in dictionary at byte 0x2e0bf for key /MediaBox
Multiple definitions in dictionary at byte 0x2e348 for key /MediaBox
Multiple definitions in dictionary at byte 0x2e5c1 for key /MediaBox
Multiple definitions in dictionary at byte 0x2e75a for key /MediaBox


Downloaded: ./pdf_downloads/PMC5209353.pdf
  Downloaded PDF: ./pdf_downloads/PMC5209353.pdf
  Verification failed. Moved to: ./rejected_pdfs/Ding, C., Chan, Z. & Magkos, F. Lean, but not healthy the ‘metabolically obese, normal-weight’ phenotype. Curr. Opin. Clin. Nutr. Metab. Care 19, 408–417 (2016)..pdf

  Result 2: PubMed ID 27965103
  Article title: Biological Databases for Hematology Research.
  PMC ID: PMC5200935
Downloaded: ./pdf_downloads/PMC5200935.pdf
  Downloaded PDF: ./pdf_downloads/PMC5200935.pdf
  Verification failed. Moved to: ./rejected_pdfs/Ding, C., Chan, Z. & Magkos, F. Lean, but not healthy the ‘metabolically obese, normal-weight’ phenotype. Curr. Opin. Clin. Nutr. Metab. Care 19, 408–417 (2016)..pdf

  Result 3: PubMed ID 32104320
  Article title: Liquisolid technique and its applications in pharmaceutics.
  PMC ID: PMC7032177
Downloaded: ./pdf_downloads/PMC7032177.pdf
  Downloaded PDF: ./pdf_downloads/PMC7032177.pdf
  Verification failed. Moved to: ./rejected_pdfs



  Article title: Placental HTR2A methylation is associated with infant neurobehavioral outcomes.
  PMC ID: PMC3883782
Downloaded: ./pdf_downloads/PMC3883782.pdf
  Downloaded PDF: ./pdf_downloads/PMC3883782.pdf
  Verification failed. Moved to: ./rejected_pdfs/Appleton, S. L. et al. Diabetes and cardiovascular disease outcomes in the metabolically healthy obese phenotype a cohort study. Diabetes Care 36, 2388–94 (2013)..pdf

  Result 15: PubMed ID 22989403
  Article title: Evaluating health visitor assessments of mother-infant interactions: a mixed methods study.
  No PMC ID available

  Result 16: PubMed ID 23498777
  Article title: Transitional services for adolescents with epilepsy in the U.K.: a survey.
  No PMC ID available

  Result 17: PubMed ID 23413423
  Article title: Melatonin: helping to MEND impaired sleep.
  No PMC ID available

  Result 18: PubMed ID 23491523
  Article title: Diabetes and cardiovascular disease outcomes in the metabolically healthy obese phenotype: a cohor

Multiple definitions in dictionary at byte 0x21db0 for key /MediaBox
Multiple definitions in dictionary at byte 0x220a9 for key /MediaBox
Multiple definitions in dictionary at byte 0x22367 for key /MediaBox
Multiple definitions in dictionary at byte 0x22688 for key /MediaBox
Multiple definitions in dictionary at byte 0x228ce for key /MediaBox
Multiple definitions in dictionary at byte 0x22b1f for key /MediaBox
Multiple definitions in dictionary at byte 0x22da8 for key /MediaBox
Multiple definitions in dictionary at byte 0x23021 for key /MediaBox
Multiple definitions in dictionary at byte 0x232aa for key /MediaBox
Multiple definitions in dictionary at byte 0x23453 for key /MediaBox


Downloaded: ./pdf_downloads/PMC6256246.pdf
  Downloaded PDF: ./pdf_downloads/PMC6256246.pdf
  Verification failed. Moved to: ./rejected_pdfs/Xu, X. et al. Habitual sleep duration and sleep duration variation are independently associated with body mass index. Int. J. Obes. (Lond). 42, 794–800 (2018)..pdf

  Result 9: PubMed ID 30592961
  Article title: [Research status of olprinone in cardiovascular diseases].
  No PMC ID available

  Result 10: PubMed ID 30554781
  Article title: Tac1-Expressing Neurons in the Periaqueductal Gray Facilitate the Itch-Scratching Cycle via Descending Regulation.
  No PMC ID available

  Result 11: PubMed ID 30482248
  Article title: Soluble immune checkpoints in cancer: production, function and biological significance.
  PMC ID: PMC6260693
Downloaded: ./pdf_downloads/PMC6260693.pdf
  Downloaded PDF: ./pdf_downloads/PMC6260693.pdf
  Verification failed. Moved to: ./rejected_pdfs/Xu, X. et al. Habitual sleep duration and sleep duration variation are indepen

Multiple definitions in dictionary at byte 0x1d403 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d701 for key /MediaBox
Multiple definitions in dictionary at byte 0x1da0f for key /MediaBox
Multiple definitions in dictionary at byte 0x1dbed for key /MediaBox
Multiple definitions in dictionary at byte 0x1de23 for key /MediaBox
Multiple definitions in dictionary at byte 0x1e039 for key /MediaBox
Multiple definitions in dictionary at byte 0x1e287 for key /MediaBox
Multiple definitions in dictionary at byte 0x1e61d for key /MediaBox
Multiple definitions in dictionary at byte 0x1e856 for key /MediaBox
Multiple definitions in dictionary at byte 0x1ead7 for key /MediaBox
Multiple definitions in dictionary at byte 0x1ed28 for key /MediaBox


Downloaded: ./pdf_downloads/PMC6295621.pdf
  Downloaded PDF: ./pdf_downloads/PMC6295621.pdf
  Verification failed. Moved to: ./rejected_pdfs/Xu, X. et al. Habitual sleep duration and sleep duration variation are independently associated with body mass index. Int. J. Obes. (Lond). 42, 794–800 (2018)..pdf

  Result 13: PubMed ID 30522056
  Article title: Discovery and development of small molecule modulators targeting glutamine metabolism.
  No PMC ID available

  Result 14: PubMed ID 30675194
  Article title: Circulating tumor DNA detection: A potential tool for colorectal cancer management.
  PMC ID: PMC6341840
Downloaded: ./pdf_downloads/PMC6341840.pdf
  Downloaded PDF: ./pdf_downloads/PMC6341840.pdf
  Verification failed. Moved to: ./rejected_pdfs/Xu, X. et al. Habitual sleep duration and sleep duration variation are independently associated with body mass index. Int. J. Obes. (Lond). 42, 794–800 (2018)..pdf

  Result 15: PubMed ID 30552084
  Article title: Predicting Prediabetes Thr

KeyboardInterrupt: 