In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import json
from pathlib import Path
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [5]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class CochraneEffectSizeExtractor:
    """Extract effect sizes from Cochrane Library systematic reviews."""
    
    def __init__(self, output_dir="cochrane_data"):
        self.base_url = "https://www.cochranelibrary.com"
        self.search_url = f"{self.base_url}/advanced-search"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        
        # Set up Selenium
        self.setup_selenium()
        
    def setup_selenium(self):
        """Set up Selenium webdriver for handling dynamic content."""
        options = Options()
        options.add_argument("--headless")  # Run in headless mode
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=options)
        
    def search_reviews(self, query, page=1, per_page=25):
        """
        Search Cochrane Library for reviews matching query.
        
        Args:
            query: Search query string
            page: Page number
            per_page: Results per page
            
        Returns:
            List of review URLs
        """
        params = {
            "searchBy": "1",
            "searchText": query,
            "searchType": "basic",
            "facetMode": "reset",
            "facetTab": "reviews",
            "facetToken": "",
            "page": page,
            "pageSize": per_page,
            "sortBy": "relevancy"
        }
        
        logger.info(f"Searching Cochrane Library for: {query}")
        response = requests.get(self.search_url, params=params, headers=self.headers)
        
        if response.status_code != 200:
            logger.error(f"Search failed with status code: {response.status_code}")
            return []
            
        soup = BeautifulSoup(response.text, 'html.parser')
        review_links = []
        
        # Extract links to reviews
        for result in soup.select(".search-results-item-body"):
            title_element = result.select_one(".result-title")
            if title_element and title_element.has_attr('href'):
                review_links.append(f"{self.base_url}{title_element['href']}")
                
        logger.info(f"Found {len(review_links)} reviews")
        return review_links
        
    def extract_effect_sizes_from_page(self, url):
        """
        Extract effect sizes from a Cochrane review page.
        
        Args:
            url: URL of the Cochrane review
            
        Returns:
            Dictionary containing effect size data
        """
        logger.info(f"Extracting effect sizes from: {url}")
        
        # Use Selenium to load the page (handles JavaScript)
        self.driver.get(url)
        
        # Wait for content to load
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".publication-content"))
        )
        
        # Get page source after JavaScript execution
        html = self.driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        
        # Extract review metadata
        title = soup.select_one("h1.publication-title")
        title_text = title.text.strip() if title else "Unknown Title"
        
        # Extract DOI
        doi_element = soup.select_one(".publication-doi")
        doi = doi_element.text.strip() if doi_element else None
        
        # Find forest plots (usually contain effect sizes)
        figures = soup.select("figure.figure")
        
        results = {
            "title": title_text,
            "url": url,
            "doi": doi,
            "effect_sizes": []
        }
        
        # Process tables (often contain effect sizes)
        tables = soup.select("table.table")
        for i, table in enumerate(tables):
            table_data = self._extract_data_from_table(table)
            if table_data:
                results["effect_sizes"].extend(table_data)
        
        # Try to find effect sizes in text
        effect_sizes_in_text = self._extract_effect_sizes_from_text(soup)
        if effect_sizes_in_text:
            results["effect_sizes"].extend(effect_sizes_in_text)
            
        logger.info(f"Extracted {len(results['effect_sizes'])} effect size entries")
        return results
        
    def _extract_data_from_table(self, table):
        """Extract effect size data from a table element."""
        effect_sizes = []
        
        # Check if this table likely contains effect sizes
        headers = [th.text.strip().lower() for th in table.select("th")]
        
        effect_size_indicators = [
            "effect", "risk ratio", "odds ratio", "hazard ratio", 
            "mean difference", "std. mean difference", "rate ratio",
            "rr", "or", "hr", "md", "smd", "ci", "confidence"
        ]
        
        if not any(indicator in " ".join(headers).lower() for indicator in effect_size_indicators):
            return []
            
        # Process table rows
        rows = table.select("tr")
        if len(rows) <= 1:  # Skip tables with just headers
            return []
            
        # Extract column indices for relevant data
        col_indices = {}
        for i, header in enumerate(headers):
            for term, pattern in [
                ("intervention", r"interven|treatment|therapy|drug|comparison"),
                ("outcome", r"outcome|endpoint|measure"),
                ("effect_size", r"effect|estimate|ratio|difference|rr|or|hr|md|smd"),
                ("ci_lower", r"lower|95%.*lower|ci.*lower"),
                ("ci_upper", r"upper|95%.*upper|ci.*upper"),
                ("p_value", r"p.*value|p\b"),
                ("measure_type", r"measure|type|statistic")
            ]:
                if re.search(pattern, header, re.IGNORECASE):
                    col_indices[term] = i
                    
        # Process data rows
        for row in rows[1:]:  # Skip header row
            cells = row.select("td")
            if len(cells) < len(headers):
                continue
                
            # Extract data from cells based on identified columns
            effect_size_data = {
                "intervention": self._get_cell_content(cells, col_indices.get("intervention")),
                "outcome": self._get_cell_content(cells, col_indices.get("outcome")),
                "effect_size": self._get_cell_content(cells, col_indices.get("effect_size")),
                "ci_lower": self._get_cell_content(cells, col_indices.get("ci_lower")),
                "ci_upper": self._get_cell_content(cells, col_indices.get("ci_upper")),
                "p_value": self._get_cell_content(cells, col_indices.get("p_value")),
                "measure_type": self._get_cell_content(cells, col_indices.get("measure_type")),
                "source": "table"
            }
            
            # Clean and normalize numeric values
            for field in ["effect_size", "ci_lower", "ci_upper", "p_value"]:
                if effect_size_data[field]:
                    effect_size_data[field] = self._normalize_numeric(effect_size_data[field])
                    
            # Infer measure type if missing
            if not effect_size_data["measure_type"] and effect_size_data["effect_size"]:
                measure_type = None
                for header in headers:
                    if "odds ratio" in header.lower() or "or" == header.lower():
                        measure_type = "Odds Ratio"
                    elif "risk ratio" in header.lower() or "rr" == header.lower():
                        measure_type = "Risk Ratio"
                    elif "hazard ratio" in header.lower() or "hr" == header.lower():
                        measure_type = "Hazard Ratio"
                    elif "mean difference" in header.lower() or "md" == header.lower():
                        measure_type = "Mean Difference"
                    elif "std" in header.lower() and "mean difference" in header.lower() or "smd" == header.lower():
                        measure_type = "Standardized Mean Difference"
                        
                effect_size_data["measure_type"] = measure_type
                
            # Add to collection if we have actual effect size data
            if effect_size_data["effect_size"]:
                effect_sizes.append(effect_size_data)
                
        return effect_sizes
    
    def _get_cell_content(self, cells, idx):
        """Extract text content from a table cell safely."""
        if idx is not None and idx < len(cells):
            return cells[idx].text.strip()
        return None
        
    def _normalize_numeric(self, value):
        """Clean and normalize numeric values from text."""
        if not value:
            return value
            
        # Remove common non-numeric characters
        value = re.sub(r'[^\d\.\-\+]', '', value)
        
        try:
            return float(value)
        except ValueError:
            return value
            
    def _extract_effect_sizes_from_text(self, soup):
        """Extract effect sizes mentioned in the text content."""
        effect_sizes = []
        content_sections = soup.select(".publication-content p")
        
        # Patterns to identify effect sizes in text
        patterns = [
            # Risk Ratio/Odds Ratio pattern: RR 1.23 (95% CI 1.11 to 1.35)
            r'(Risk Ratio|Odds Ratio|Hazard Ratio|Rate Ratio|RR|OR|HR)\s*(=|:)?\s*(\d+\.\d+)\s*\(95%\s*CI\s*(\d+\.\d+)\s*to\s*(\d+\.\d+)\)',
            
            # Mean Difference pattern: MD -2.30 (95% CI -4.11 to -0.49)
            r'(Mean Difference|Std\. Mean Difference|Standardized Mean Difference|MD|SMD)\s*(=|:)?\s*([\-\+]?\d+\.\d+)\s*\(95%\s*CI\s*([\-\+]?\d+\.\d+)\s*to\s*([\-\+]?\d+\.\d+)\)',
            
            # P-value pattern: P = 0.002
            r'P\s*(=|:)\s*(\d+\.\d+)'
        ]
        
        for section in content_sections:
            text = section.text
            
            # Extract outcome context
            outcome_match = re.search(r'for\s+([^\.]+)', text)
            outcome = outcome_match.group(1).strip() if outcome_match else None
            
            # Extract intervention context
            intervention_match = re.search(r'(compared|versus|vs\.?)\s+([^\.]+)', text, re.IGNORECASE)
            intervention = intervention_match.group(2).strip() if intervention_match else None
            
            # Look for effect size patterns
            for pattern in patterns:
                matches = re.finditer(pattern, text)
                for match in matches:
                    groups = match.groups()
                    
                    if len(groups) >= 3:  # Full effect size with CI
                        effect_size_data = {
                            "intervention": intervention,
                            "outcome": outcome,
                            "measure_type": groups[0],
                            "effect_size": float(groups[2]),
                            "source": "text"
                        }
                        
                        if len(groups) >= 5:  # Has confidence interval
                            effect_size_data["ci_lower"] = float(groups[3])
                            effect_size_data["ci_upper"] = float(groups[4])
                            
                        effect_sizes.append(effect_size_data)
                    
        return effect_sizes
    
    def extract_effect_sizes_for_topic(self, topic, max_reviews=10):
        """
        Extract effect sizes for a specific medical topic.
        
        Args:
            topic: Medical topic or condition
            max_reviews: Maximum number of reviews to process
            
        Returns:
            DataFrame with extracted effect sizes
        """
        # Search for reviews containing the topic and likely to have effect sizes
        query = f'"{topic}" AND ("meta-analysis" OR "effect size" OR "forest plot")'
        review_links = self.search_reviews(query)
        
        all_effect_sizes = []
        
        # Process each review up to the maximum
        for i, link in enumerate(review_links[:max_reviews]):
            logger.info(f"Processing review {i+1}/{min(len(review_links), max_reviews)}")
            
            try:
                review_data = self.extract_effect_sizes_from_page(link)
                
                # Add metadata to each effect size entry
                for effect in review_data["effect_sizes"]:
                    effect["review_title"] = review_data["title"]
                    effect["review_url"] = review_data["url"]
                    effect["review_doi"] = review_data["doi"]
                    effect["topic"] = topic
                    
                all_effect_sizes.extend(review_data["effect_sizes"])
                
                # Save intermediate results
                self._save_intermediate_results(all_effect_sizes, topic, i+1)
                
            except Exception as e:
                logger.error(f"Error processing {link}: {str(e)}")
                
            # Be nice to the server
            time.sleep(2)
            
        # Create DataFrame
        df = pd.DataFrame(all_effect_sizes)
        
        # Save final results
        output_path = self.output_dir / f"{topic}_effect_sizes.csv"
        df.to_csv(output_path, index=False)
        logger.info(f"Saved {len(df)} effect sizes to {output_path}")
        
        return

In [10]:
extractor = CochraneEffectSizeExtractor(output_dir="/Users/yiquntchen/Desktop/chen-lab/MEDAL/")
df = extractor.extract_effect_sizes_for_topic("", max_reviews=10)

2025-03-25 13:58:20,895 - INFO - Searching Cochrane Library for: "" AND ("meta-analysis" OR "effect size" OR "forest plot")
2025-03-25 13:58:29,829 - INFO - Found 0 reviews
2025-03-25 13:58:29,832 - INFO - Saved 0 effect sizes to /Users/yiquntchen/Desktop/chen-lab/MEDAL/_effect_sizes.csv


In [11]:
df

In [12]:
 import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

BASE_URL = "https://www.cochranelibrary.com"
SEARCH_URL = "https://www.cochranelibrary.com/cdsr/reviews/topics"
HEADERS = {"User-Agent": "Mozilla/5.0"}

def get_recent_reviews(years_back=10, max_pages=5):
    review_links = []
    for page in range(1, max_pages + 1):
        url = f"https://www.cochranelibrary.com/cdsr/reviews?page={page}"
        response = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(response.text, "html.parser")

        links = soup.find_all("a", href=True)
        for link in links:
            href = link['href']
            if href.startswith("/cdsr/doi/") and "full" not in href:
                full_link = BASE_URL + href + "/full"
                review_links.append(full_link)

        time.sleep(1)  # Be kind to the server

    return list(set(review_links))

def extract_sof_table(cochrane_url):
    response = requests.get(cochrane_url, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch {cochrane_url}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    sof_tables = soup.find_all("table")
    sof_data = []

    for table in sof_tables:
        if 'Summary of findings' in table.get_text():
            rows = table.find_all("tr")
            for row in rows:
                cols = row.find_all(["td", "th"])
                cols = [c.get_text(strip=True) for c in cols]
                if len(cols) > 1:
                    sof_data.append(cols)

    if sof_data:
        return pd.DataFrame(sof_data)
    return None

def extract_themes(soup):
    theme_tags = soup.find_all("a", class_="taxonomy-link")
    themes = [tag.get_text(strip=True) for tag in theme_tags]
    return themes


In [13]:
links = get_recent_reviews()

In [14]:
url = f"https://www.cochranelibrary.com/cdsr/reviews?page={page}"
response = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(response.text, "html.parser")

[]

In [29]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import time

# Replace this with your actual Wiley TDM API token
TDM_API_TOKEN = "***WILEY_TDM_API_TOKEN_REDACTED***"

WILEY_HEADERS = {
    "apikey": TDM_API_TOKEN,
    "Accept": "application/xml"
}

CROSSREF_API = "https://api.crossref.org/works"


def search_cochrane_reviews_crossref(rows=10):
    query = "cochrane systematic review"
    params = {
        "query": query,
        "filter": "type:journal-article,container-title:Cochrane Database of Systematic Reviews",
        "rows": rows
    }
    response = requests.get(CROSSREF_API, params=params)

    if response.status_code != 200:
        print("Crossref search failed:", response.status_code, response.text)
        return []

    items = response.json().get("message", {}).get("items", [])
    dois = [item["DOI"] for item in items if "DOI" in item]
    return dois


def get_fulltext_xml_by_doi(doi):
    encoded_doi = urllib.parse.quote(doi)
    meta_url = f"https://api.wiley.com/onlinelibrary/tdm/v1/articles/{encoded_doi}/metadata"
    meta_response = requests.get(meta_url, headers=WILEY_HEADERS)

    if meta_response.status_code != 200:
        print(f"Metadata not available for DOI: {doi}")
        return None

    full_url = f"https://api.wiley.com/onlinelibrary/tdm/v1/articles/{encoded_doi}/full"
    full_response = requests.get(full_url, headers=WILEY_HEADERS)

    if full_response.status_code == 200:
        return full_response.text
    else:
        print(f"Failed to fetch fulltext for DOI: {doi} | Status: {full_response.status_code}")
        return None


def parse_jats_for_key_info(xml_text):
    root = ET.fromstring(xml_text)
    ns = {'j': 'http://jats.nlm.nih.gov'}

    title = root.findtext(".//j:article-title", default="", namespaces=ns)
    abstract = root.findtext(".//j:abstract//j:p", default="", namespaces=ns)

    sections = root.findall(".//j:sec", namespaces=ns)
    section_texts = []
    for sec in sections:
        heading = sec.findtext("j:title", default="", namespaces=ns)
        paras = [p.text for p in sec.findall("j:p", namespaces=ns) if p.text]
        section_texts.append({"heading": heading, "content": " ".join(paras)})

    return {
        "title": title,
        "abstract": abstract,
        "sections": section_texts
    }



In [30]:
articles = search_cochrane_reviews(rows=10)

Search failed: 400 


In [31]:
articles

[]

In [None]:
all_data = []

for article in articles:
    article_id = article.findtext("article-id")
    print(f"Processing article ID: {article_id}")

    xml_text = get_fulltext_xml(article)
    if xml_text:
        parsed = parse_jats_for_key_info(xml_text)
        parsed["article_id"] = article_id
        all_data.append(parsed)
    else:
        print("No XML fulltext available")

    time.sleep(1)  # Respectful delay

pd.DataFrame(all_data).to_json("wiley_cochrane_reviews.json", indent=2)
print("Saved parsed data to wiley_cochrane_reviews.json")


In [32]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

BASE_URL = "https://www.cochranelibrary.com"
SEARCH_URL = "https://www.cochranelibrary.com/cdsr/reviews"

def get_review_links(pages=1):
    links = []
    for page in range(1, pages + 1):
        url = f"{SEARCH_URL}?page={page}"
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to fetch page {page}")
            continue
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.select("a.search-result-title-link"):
            href = link.get("href")
            if href and "/cdsr/" in href:
                links.append(BASE_URL + href)
        time.sleep(1)
    return links

def extract_sof_table(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch: {url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    sof_tables = soup.select("table.sof-table")
    all_rows = []

    for table in sof_tables:
        headers = [th.get_text(strip=True) for th in table.select("thead th")]
        for row in table.select("tbody tr"):
            cells = [td.get_text(strip=True) for td in row.select("td")]
            if len(cells) == len(headers):
                all_rows.append(dict(zip(headers, cells)))
    
    return all_rows


In [33]:

review_links = get_review_links(pages=2)  # You can increase this
print(f"Found {len(review_links)} reviews")

Failed to fetch page 1
Failed to fetch page 2
Found 0 reviews


In [None]:


all_data = []
for url in review_links:
    print(f"Scraping: {url}")
    sof_data = extract_sof_table(url)
    all_data.append({
        "url": url,
        "sof_table": sof_data
    })
    time.sleep(1)

pd.DataFrame(all_data).to_json("cochrane_sof_tables.json", indent=2)
print("Saved extracted SoF tables to cochrane_sof_tables.json")



In [35]:
def search_cochrane_reviews(start_year, end_year, rows=20):
    query = "Cochrane Database of Systematic Reviews"
    filters = {
        "from-pub-date": f"{start_year}-01-01",
        "until-pub-date": f"{end_year}-12-31",
        "type": "journal-article"
    }
    results = cr.works(query=query, filter=filters, limit=rows)
    return results['message']['items']


In [36]:
def extract_information(articles):
    extracted_data = []
    for article in articles:
        doi = article.get('DOI', 'N/A')
        title = article.get('title', ['N/A'])[0]
        pub_date = article.get('published-print', {}).get('date-parts', [['N/A']])[0][0]
        link = f"https://doi.org/{doi}"
        extracted_data.append({
            'DOI': doi,
            'Title': title,
            'Publication Year': pub_date,
            'Link': link
        })
    return extracted_data


In [39]:
start_year = 2015
end_year = 2025
num_articles = 50

articles = search_cochrane_reviews(start_year, end_year, rows=num_articles)
extracted_data = extract_information(articles)

2025-03-29 23:43:57,515 - INFO - HTTP Request: GET https://api.crossref.org/works?query=Cochrane+Database+of+Systematic+Reviews&filter=from-pub-date%3A2015-01-01%2Cuntil-pub-date%3A2025-12-31%2Ctype%3Ajournal-article&rows=50 "HTTP/1.1 200 OK"


In [41]:
len(articles)

50

In [43]:
articles[0].keys()

dict_keys(['indexed', 'reference-count', 'publisher', 'content-domain', 'DOI', 'type', 'created', 'update-policy', 'source', 'is-referenced-by-count', 'title', 'prefix', 'author', 'member', 'published-online', 'reference', 'updated-by', 'container-title', 'language', 'link', 'deposited', 'score', 'resource', 'editor', 'issued', 'references-count', 'URL', 'ISSN', 'issn-type', 'published'])

In [45]:
articles[0]['DOI']

'10.1002/14651858.cd013268'