In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_article_links(journal_url):
    """
    Collect links to all issues listed on the journal's RePEc page.
    """
    response = requests.get(journal_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    issue_links = [
        a['href'] for a in soup.select('ul li a') 
        if a.has_attr('href') and '/bla/jfinan/' in a['href']
    ]
    return ["https://ideas.repec.org" + link for link in issue_links]

def parse_issue(issue_url, journal_name):
    """
    Parse a single issue page and extract all papers.
    """
    papers = []
    response = requests.get(issue_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Try to extract year from the header
    h1_text = soup.find("h1").get_text()
    year = next((word for word in h1_text.split() if word.isdigit() and len(word) == 4), None)

    for paper in soup.select('.paper'):
        title = paper.select_one('h3').get_text(strip=True)
        abstract_tag = paper.select_one('p.abstract')
        abstract = abstract_tag.get_text(strip=True) if abstract_tag else None

        authors_tag = paper.select_one('p.authors')
        authors = authors_tag.get_text(strip=True).replace("Authors:", "") if authors_tag else None

        papers.append({
            'journal': journal_name,
            'year': year,
            'title': title,
            'abstract': abstract,
            'authors': authors
        })

    return papers

def scrape_journal(journal_url, journal_name, max_issues=None):
    """
    Scrape multiple issues from a journal's RePEc page.
    """
    issue_links = get_article_links(journal_url)
    if max_issues:
        issue_links = issue_links[:max_issues]

    all_papers = []
    for i, issue_link in enumerate(issue_links):
        print(f"Scraping issue {i+1}/{len(issue_links)}: {issue_link}")
        try:
            papers = parse_issue(issue_link, journal_name)
            all_papers.extend(papers)
            time.sleep(1)  # Be polite to the server
        except Exception as e:
            print(f"Failed on {issue_link}: {e}")

    return pd.DataFrame(all_papers)


In [4]:
df = scrape_journal("https://ideas.repec.org/s/bla/jfinan.html", "Journal of Finance", max_issues=10)


Scraping issue 1/10: https://ideas.repec.org/a/bla/jfinan/v52y1997i1p57-82.html
Scraping issue 2/10: https://ideas.repec.org/a/bla/jfinan/v25y1970i2p383-417.html
Scraping issue 3/10: https://ideas.repec.org/a/bla/jfinan/v80y2025i2p657-698.html
Scraping issue 4/10: https://ideas.repec.org/a/bla/jfinan/v80y2025i2p699-754.html
Scraping issue 5/10: https://ideas.repec.org/a/bla/jfinan/v80y2025i2p755-781.html
Scraping issue 6/10: https://ideas.repec.org/a/bla/jfinan/v80y2025i2p783-832.html
Scraping issue 7/10: https://ideas.repec.org/a/bla/jfinan/v80y2025i2p833-873.html
Scraping issue 8/10: https://ideas.repec.org/a/bla/jfinan/v80y2025i2p875-910.html
Scraping issue 9/10: https://ideas.repec.org/a/bla/jfinan/v80y2025i2p911-936.html
Scraping issue 10/10: https://ideas.repec.org/a/bla/jfinan/v80y2025i2p937-979.html


In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# ---- STEP 1: Get article links from all pages ----
def get_article_links(journal_base_url, max_page=10):
    all_links = []

    for i in range(1, max_page + 1):
        suffix = "" if i == 1 else str(i)
        url = f"{journal_base_url}{suffix}.html"
        print(f"🌐 Fetching: {url}")
        try:
            resp = requests.get(url)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')

            for a in soup.select('ul li a'):
                href = a.get('href')
                if href and href.startswith('/a/bla/jfinan/'):
                    full_url = 'https://ideas.repec.org' + href
                    all_links.append(full_url)
        except Exception as e:
            print(f"❌ Skipping {url}: {e}")
        time.sleep(1)

    print(f"✅ Total article links collected: {len(all_links)}")
    return list(set(all_links))  # remove duplicates just in case

# ---- STEP 2: Parse each article ----
def parse_article(article_url):
    try:
        resp = requests.get(article_url)
        soup = BeautifulSoup(resp.text, 'html.parser')

        # Abstract
        abstract = None
        meta = soup.find('meta', attrs={'name': 'description'})
        if meta and meta.get('content'):
            raw_abstract = meta['content'].strip()
            abstract = re.sub(r'^Downloadable(\s*\(with restrictions\))?!\s*', '', raw_abstract)

        authors = year = title = journal = None

        h2 = soup.find('h2', string=re.compile(r'Suggested Citation'))
        if h2:
            citation_div = h2.find_next_sibling('div', id='biblio-body')
            if citation_div:
                li = citation_div.find('li', class_=lambda c: c and 'list-group-item' in c)
                if li:
                    text_nodes = [n for n in li.contents if isinstance(n, str)]
                    if text_nodes:
                        raw = text_nodes[0].strip()
                        if ',' in raw:
                            author_part, year_part = raw.rsplit(',', 1)
                            authors = author_part.strip()
                            year_match = re.search(r'\b(19|20)\d{2}\b', year_part)
                            year = year_match.group(0) if year_match else None

                    b = li.find('b')
                    if b:
                        a = b.find('a')
                        title = a.get_text(strip=True) if a else b.get_text(strip=True)

                    a_tags = li.find_all('a')
                    if len(a_tags) >= 2:
                        journal = a_tags[1].get_text(strip=True)

        return {
            'url': article_url,
            'journal': journal,
            'year': year,
            'title': title,
            'abstract': abstract,
            'authors': authors
        }

    except Exception as e:
        print(f"❌ Failed to parse {article_url}: {e}")
        return None

# ---- STEP 3: Scrape journal across pages ----
def scrape_journal(journal_base_url="https://ideas.repec.org/s/bla/jfinan", max_page=10, sleep_sec=1):
    links = get_article_links(journal_base_url, max_page)
    records = []

    for i, url in enumerate(links):
        print(f"🔄 Parsing ({i+1}/{len(links)}): {url}")
        record = parse_article(url)
        if record:
            records.append(record)
        time.sleep(sleep_sec)

    df = pd.DataFrame(records)
    return df

# ---- Run ----
if __name__ == "__main__":
    df = scrape_journal(max_page=10)
    print(f"\n✅ Done. Extracted {len(df)} articles.")
    df.to_csv("jfinance_articles_up_to_page10.csv", index=False)


🌐 Fetching: https://ideas.repec.org/s/bla/jfinan.html
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan2.html
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan3.html
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan4.html
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan5.html
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan6.html
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan7.html
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan8.html
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan9.html
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan10.html
✅ Total article links collected: 2000
🔄 Parsing (1/1969): https://ideas.repec.org/a/bla/jfinan/v73y2018i4p1937-1951.html
🔄 Parsing (2/1969): https://ideas.repec.org/a/bla/jfinan/v64y2009i2p785-821.html
🔄 Parsing (3/1969): https://ideas.repec.org/a/bla/jfinan/v70y2015i5p1997-2028.html
🔄 Parsing (4/1969): https://ideas.repec.org/a/bla/jfinan/v62y2007i5p2061-2096.html
🔄 Parsing (5/1969): https://ideas.repec.org/a/bla/jfinan/v71y2016i6p2781-2808.html

In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def parse_article_flexible(article_url, journal_name="Journal of Finance"):
    response = requests.get(article_url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Title
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Abstract
    abstract = None
    abstract_tag = soup.find("p", class_="abstract")
    if abstract_tag:
        abstract = abstract_tag.get_text(strip=True)
    else:
        # Fallback: look for <meta name="description">
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            abstract = meta_desc["content"].strip()

    # Authors
    # Authors
    authors = None
    authors_tag = soup.find("p", class_="authors")
    if authors_tag:
        authors = authors_tag.get_text(strip=True).replace("Authors:", "")
    else:
        # Try meta fallback
        author_tags = soup.find_all("meta", attrs={"name": "citation_author"})
        if author_tags:
            authors = "; ".join(tag["content"] for tag in author_tags if "content" in tag.attrs)
        else:
            # Look for 'p.title' with 'by XYZ'
            title_para = soup.find("p", class_="title")
            if title_para and "by " in title_para.text:
                authors = title_para.text.strip().replace("by ", "").strip()
            else:
                # Last resort: parse from "Suggested Citation"
                citation_block = soup.find("div", class_="note")
                if citation_block:
                    citation_text = citation_block.get_text(" ", strip=True)
                    parts = citation_text.split(".", 2)
                    if len(parts) >= 2:
                        authors = parts[0].strip()

    # Year
    year = None
    bibref_tag = soup.find("p", class_="bibref")
    if bibref_tag:
        for word in bibref_tag.text.split():
            if word.isdigit() and len(word) == 4:
                year = word
                break

    # Fallback: parse from URL
    if not year:
        import re
        match = re.search(r'y(\d{4})', article_url)
        if match:
            year = match.group(1)

    return {
        "journal": journal_name,
        "year": year,
        "title": title,
        "abstract": abstract,
        "authors": authors
    }


In [19]:
url = "https://ideas.repec.org/a/bla/jfinan/v52y1997i1p57-82.html"
article_data = parse_article_flexible(url)
df = pd.DataFrame([article_data])
print(df)


              journal  year                                      title  \
0  Journal of Finance  1997  On Persistence in Mutual Fund Performance   

                                            abstract authors  
0  Downloadable (with restrictions)!  Using a sam...    None  


In [27]:
print(df)


                                                   url             journal  \
0    https://ideas.repec.org/a/bla/jfinan/v52y1997i...  Journal of Finance   
1    https://ideas.repec.org/a/bla/jfinan/v25y1970i...  Journal of Finance   
2    https://ideas.repec.org/a/bla/jfinan/v80y2025i...  Journal of Finance   
3    https://ideas.repec.org/a/bla/jfinan/v80y2025i...  Journal of Finance   
4    https://ideas.repec.org/a/bla/jfinan/v80y2025i...  Journal of Finance   
..                                                 ...                 ...   
196  https://ideas.repec.org/a/bla/jfinan/v77y2022i...  Journal of Finance   
197  https://ideas.repec.org/a/bla/jfinan/v77y2022i...  Journal of Finance   
198  https://ideas.repec.org/a/bla/jfinan/v77y2022i...  Journal of Finance   
199  https://ideas.repec.org/a/bla/jfinan/v77y2022i...  Journal of Finance   
200  https://ideas.repec.org/a/bla/jfinan/v77y2022i...  Journal of Finance   

        year                                              title

In [14]:
df.to_csv('test.csv')