# Problem statement
- **Your task is to write a Python program to fetch research papers based on a user-specified query. The
  program must identify papers with at least one author affiliated with a pharmaceutical or biotech
  company and return the results as a CSV file.**

In [1]:
import argparse
import csv
import requests
import re
import xml.etree.ElementTree as ET
from typing import List, Dict
import sys

In [2]:
# NCBI E-utilities endpoints; both live under the same base path.
_EUTILS_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
PUBMED_API_URL = f"{_EUTILS_BASE_URL}/esearch.fcgi"  # search: query -> ID list
PUBMED_FETCH_URL = f"{_EUTILS_BASE_URL}/efetch.fcgi"  # fetch: IDs -> records


In [3]:
def fetch_paper_ids(query: str, retmax: int = 300, timeout: float = 30.0) -> List[str]:
    """
    Fetch paper IDs from PubMed based on the query.

    Args:
        query: Search term sent to the search endpoint as ``term``.
        retmax: Maximum number of IDs to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of IDs found under ``esearchresult.idlist`` in the JSON
        response, or an empty list when that key is absent or the query is
        blank.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
    """
    # A blank term cannot match anything, so skip the network round-trip.
    if not query or not query.strip():
        return []

    params = {
        'db': 'pubmed',
        'term': query,
        'retmax': retmax,  # Number of results to fetch
        'retmode': 'json'
    }
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection.
    response = requests.get(PUBMED_API_URL, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    return data.get('esearchresult', {}).get('idlist', [])

In [4]:
def fetch_paper_details(paper_ids: List[str], timeout: float = 30.0) -> str:
    """
    Fetch paper details using IDs.

    Args:
        paper_ids: PubMed IDs to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The XML response body as text. When ``paper_ids`` is empty, no
        request is made and an empty, well-formed ``PubmedArticleSet``
        document is returned so the result can still be parsed.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
    """
    if not paper_ids:
        return "<PubmedArticleSet></PubmedArticleSet>"

    params = {
        'db': 'pubmed',
        'id': ','.join(paper_ids),
        'retmode': 'xml'
    }
    # The ID list can hold hundreds of entries; NCBI recommends HTTP POST for
    # long UID lists so they are not squeezed into the URL.
    response = requests.post(PUBMED_FETCH_URL, data=params, timeout=timeout)
    response.raise_for_status()
    return response.text  # XML response to be parsed

In [5]:
# Affiliations containing any of these words are treated as academic.
_ACADEMIC_KEYWORDS = ('university', 'college')
_EMAIL_PATTERN = re.compile(r'[\w\.-]+@[\w\.-]+')


def _element_text(element):
    """Return all text inside ``element``, including nested markup, with whitespace collapsed."""
    if element is None:
        return None
    return ' '.join(''.join(element.itertext()).split())


def _format_pub_date(pub_date):
    """Build a ``Year-Month-Day`` string from a PubDate element, falling back to MedlineDate."""
    if pub_date is None:
        return None
    parts = [pub_date.findtext(tag) for tag in ('Year', 'Month', 'Day')]
    joined = '-'.join(part for part in parts if part)
    if joined:
        return joined
    # Some records carry only a free-form date such as "2024 Jan-Feb".
    return (pub_date.findtext('MedlineDate') or '').strip()


def _is_non_academic(affiliation):
    """Return True when the affiliation text contains no academic keyword."""
    lowered = affiliation.lower()
    return not any(keyword in lowered for keyword in _ACADEMIC_KEYWORDS)


def extract_relevant_info(xml_response: str) -> List[Dict]:
    """
    Parse XML response to extract required fields for each article.

    Every ``PubmedArticle`` element produces one dict, even when no
    non-academic author is found (the author/affiliation fields are then
    empty strings and the email is ``None``).

    Args:
        xml_response: XML text containing ``PubmedArticle`` elements. An
            empty or whitespace-only string yields an empty list.

    Returns:
        A list of dicts with the keys ``PubmedID``, ``Title``,
        ``Publication Date``, ``Non-academic Author(s)``,
        ``Company Affiliation(s)`` and ``Corresponding Author Email``.

    Raises:
        xml.etree.ElementTree.ParseError: If non-empty input is not
            well-formed XML.
    """
    if not xml_response or not xml_response.strip():
        return []

    root = ET.fromstring(xml_response)
    articles = []

    for article in root.findall('.//PubmedArticle'):
        non_academic_authors = []
        company_affiliations = []
        corresponding_author_email = None

        for author in article.findall('.//Author'):
            # Look at every affiliation of the author, not just the first:
            # an author may list an academic and a company address.
            affiliation_texts = (
                _element_text(node)
                for node in author.findall('.//AffiliationInfo/Affiliation')
            )
            non_academic = [
                text for text in affiliation_texts
                if text and _is_non_academic(text)
            ]
            if not non_academic:
                continue

            first_name = author.findtext('ForeName')
            last_name = author.findtext('LastName')
            full_name = f"{first_name or ''} {last_name or ''}".strip()
            if full_name:
                non_academic_authors.append(full_name)

            for affiliation in non_academic:
                # Co-authors often share one address; list it only once.
                if affiliation not in company_affiliations:
                    company_affiliations.append(affiliation)
                email_match = _EMAIL_PATTERN.search(affiliation)
                if email_match:
                    # The pattern allows dots, so it also swallows the period
                    # that ends the affiliation sentence; drop it.
                    corresponding_author_email = email_match.group(0).rstrip('.')

        articles.append({
            'PubmedID': article.findtext('.//PMID'),
            # Titles may contain inline markup (<i>, <sub>, ...); findtext
            # would stop at the first nested tag.
            'Title': _element_text(article.find('.//ArticleTitle')),
            'Publication Date': _format_pub_date(article.find('.//PubDate')),
            'Non-academic Author(s)': ', '.join(non_academic_authors),
            'Company Affiliation(s)': ', '.join(company_affiliations),
            'Corresponding Author Email': corresponding_author_email
        })

    return articles


In [6]:
def save_to_csv(data: List[Dict], filename: str):
    """
    Write the extracted records to ``filename`` as UTF-8 CSV.

    The file is created or overwritten. It starts with a header row, followed
    by one row per dict in ``data``.
    """
    columns = [
        'PubmedID',
        'Title',
        'Publication Date',
        'Non-academic Author(s)',
        'Company Affiliation(s)',
        'Corresponding Author Email',
    ]
    with open(filename, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=columns)
        csv_writer.writeheader()
        for record in data:
            csv_writer.writerow(record)


In [7]:

# Example arguments used when running inside a Jupyter kernel, where sys.argv
# holds the kernel's own launch flags instead of user-supplied arguments.
_NOTEBOOK_EXAMPLE_ARGS = ['cancer therapy', '-f', 'output.csv', '-d']


def main(argv=None):
    """
    Command-line entry point: search PubMed, extract article info, output it.

    Args:
        argv: Argument list to parse (without the program name). When
            omitted, the real command line is used; inside a Jupyter kernel
            the built-in example arguments are used instead.

    The results are written to the CSV file given with ``-f/--file``, or
    printed when no file is given. ``-d/--debug`` prints the query and the
    fetched IDs. Any error raised while fetching, parsing or saving is
    reported as ``Error: ...`` on stderr rather than propagated.
    """
    if argv is None:
        # Do not overwrite sys.argv: that would silently discard whatever the
        # user typed on the command line.
        in_notebook = 'ipykernel' in sys.modules
        argv = _NOTEBOOK_EXAMPLE_ARGS if in_notebook else sys.argv[1:]

    parser = argparse.ArgumentParser(description="Fetch PubMed Papers")
    parser.add_argument('query', type=str, help='Search query for PubMed')
    parser.add_argument('-f', '--file', type=str, help='Output CSV file')
    parser.add_argument('-d', '--debug', action='store_true', help='Enable debug mode')
    args = parser.parse_args(argv)

    if args.debug:
        print(f"Query: {args.query}")

    try:
        paper_ids = fetch_paper_ids(args.query)
        if args.debug:
            print(f"Fetched IDs: {paper_ids}")

        if paper_ids:
            xml_response = fetch_paper_details(paper_ids)
            data = extract_relevant_info(xml_response)
        else:
            # Nothing matched: there is nothing to fetch, and requesting
            # details for an empty ID list would only produce an error.
            data = []

        if args.file:
            save_to_csv(data, args.file)
            print(f"Results saved to {args.file}")
        else:
            print(data)
    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"Error: {e}", file=sys.stderr)


if __name__ == '__main__':
    main()


Query: cancer therapy
Fetched IDs: ['39789673', '39789641', '39789640', '39789615', '39789613', '39789606', '39789599', '39789555', '39789554', '39789474', '39789471', '39789445', '39789407', '39789372', '39789356', '39789336', '39789298', '39789280', '39789258', '39789257', '39789247', '39789208', '39789190', '39789183', '39789150', '39789071', '39789066', '39789065', '39789010', '39788975', '39788967', '39788945', '39788939', '39788918', '39788912', '39788889', '39788886', '39788877', '39788823', '39788816', '39788744', '39788650', '39788584', '39788569', '39788567', '39788558', '39788520', '39788504', '39788501', '39788500', '39788496', '39788485', '39788431', '39788429', '39788400', '39788396', '39788388', '39788373', '39788354', '39788274', '39788152', '39788148', '39788133', '39788061', '39787980', '39787941', '39787915', '39787807', '39787793', '39787708', '39787688', '39787636', '39787595', '39787561', '39787462', '39787453', '39787447', '39787436', '39787419', '39787392', '397