In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

def scrape_repec_journal(base_url: str, max_page: int = 10, delay: float = 1.0) -> pd.DataFrame:
    def get_article_links_paginated(base_url, max_page):
        all_links = []
        for i in range(1, max_page + 1):
            if i == 1:
                url = base_url
            else:
                url = base_url.replace(".html", f"{i}.html")

            print(f"🌐 Fetching: {url}")
            resp = requests.get(url)
            if resp.status_code != 200:
                print(f"❌ Page {i} failed with status {resp.status_code}")
                break

            soup = BeautifulSoup(resp.text, 'html.parser')
            page_links = [
                'https://ideas.repec.org' + a['href']
                for a in soup.select('ul li a')
                if a.has_attr('href') and a['href'].startswith('/a/')
            ]
            print(f"🔗 Found {len(page_links)} articles")
            all_links.extend(page_links)
            time.sleep(delay)

        return list(set(all_links))

    def parse_article(article_url):
        try:
            resp = requests.get(article_url)
            soup = BeautifulSoup(resp.text, 'html.parser')

            # Abstract
            abstract = None
            meta = soup.find('meta', attrs={'name': 'description'})
            if meta and meta.get('content'):
                raw = meta['content'].strip()
                abstract = re.sub(r'^Downloadable(\s*\(with restrictions\))?!\s*', '', raw)

            authors = year = title = journal = None

            h2 = soup.find('h2', string=re.compile(r'Suggested Citation'))
            if h2:
                citation_div = h2.find_next_sibling('div', id='biblio-body')
                if citation_div:
                    li = citation_div.find('li', class_=lambda c: c and 'list-group-item' in c)
                    if li:
                        text_nodes = [n for n in li.contents if isinstance(n, str)]
                        if text_nodes:
                            raw = text_nodes[0].strip()
                            if ',' in raw:
                                author_part, year_part = raw.rsplit(',', 1)
                                authors = author_part.strip()
                                match = re.search(r'\b(19|20)\d{2}\b', year_part)
                                year = match.group(0) if match else None

                        b = li.find('b')
                        if b:
                            a = b.find('a')
                            title = a.get_text(strip=True) if a else b.get_text(strip=True)

                        a_tags = li.find_all('a')
                        if len(a_tags) >= 2:
                            journal = a_tags[1].get_text(strip=True)

            return {
                'url': article_url,
                'journal': journal,
                'year': year,
                'title': title,
                'abstract': abstract,
                'authors': authors
            }
        except Exception as e:
            print(f"❌ Failed to parse {article_url}: {e}")
            return None

    # ---- Run scraping ----
    links = get_article_links_paginated(base_url, max_page)
    print(f"📦 Total articles to parse: {len(links)}")

    records = []
    for i, url in enumerate(links):
        print(f"🔄 ({i+1}/{len(links)}): {url}")
        data = parse_article(url)
        if data:
            records.append(data)
        time.sleep(delay)

    df = pd.DataFrame(records)
    return df


In [None]:
# Example: JF/JFE/RFS
jf = scrape_repec_journal("https://ideas.repec.org/s/bla/jfinan.html", max_page=5)
jfe = scrape_repec_journal("https://ideas.repec.org/s/eee/jfinec.html", max_page=5)
rfs = scrape_repec_journal("https://ideas.repec.org/s/oup/rfinst.html", max_page=5)
fin_top3 = pd.concat([jf, jfe, rfs])
fin_top3.to_csv("fin_top3_scraped_up_to_page5.csv", index=False)


🌐 Fetching: https://ideas.repec.org/s/bla/jfinan.html
🔗 Found 201 articles
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan2.html
🔗 Found 200 articles
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan3.html
🔗 Found 199 articles
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan4.html
🔗 Found 200 articles
🌐 Fetching: https://ideas.repec.org/s/bla/jfinan5.html
🔗 Found 200 articles
📦 Total articles to parse: 979
🔄 (1/979): https://ideas.repec.org/a/bla/jfinan/v68y2013i2p715-737.html
🔄 (2/979): https://ideas.repec.org/a/bla/jfinan/v79y2024i2p843-902.html
🔄 (3/979): https://ideas.repec.org/a/bla/jfinan/v80y2025i1p5-56.html
🔄 (4/979): https://ideas.repec.org/a/bla/jfinan/v66y2011i2p519-561.html
🔄 (5/979): https://ideas.repec.org/a/bla/jfinan/v68y2013i1p173-200.html
🔄 (6/979): https://ideas.repec.org/a/bla/jfinan/v69y2014i1p1-49.html
🔄 (7/979): https://ideas.repec.org/a/bla/jfinan/v70y2015i5p2309-2346.html
🔄 (8/979): https://ideas.repec.org/a/bla/jfinan/v77y2022i4p2287-2329.html
🔄 (9/979): https

In [6]:
fin_top3

Unnamed: 0,url,journal,year,title,abstract,authors
0,https://ideas.repec.org/a/bla/jfinan/v68y2013i...,Journal of Finance,2013,How Effective Were the Federal Reserve Emergen...,Following the failure of Lehman Brothers in Se...,Burcu Duygan-Bump & Patrick Parkinson & Eric R...
1,https://ideas.repec.org/a/bla/jfinan/v79y2024i...,Journal of Finance,2024,Measuring “Dark Matter” in Asset Pricing Models,We formalize the concept of “dark matter” in a...,Hui Chen & Winston Wei Dou & Leonid Kogan
2,https://ideas.repec.org/a/bla/jfinan/v80y2025i...,Journal of Finance,2025,"Bank Funding Risk, Reference Rates, and Credit...",Corporate credit lines are drawn more heavily ...,Harry Cooperman & Darrell Duffie & Stephan Luc...
3,https://ideas.repec.org/a/bla/jfinan/v66y2011i...,Journal of Finance,2011,The Joy of Giving or Assisted Living? Using St...,No abstract is available for this item.,John Ameriks & Andrew Caplin & Steven Laufer &...
4,https://ideas.repec.org/a/bla/jfinan/v68y2013i...,Journal of Finance,2013,Conflicting Family Values in Mutual Fund Families,No abstract is available for this item.,Utpal Bhattacharya & Jung H. Lee & Veronika K....
...,...,...,...,...,...,...
988,https://ideas.repec.org/a/oup/rfinst/v33y2020i...,The Review of Financial Studies,2020,Derivatives Supply and Corporate Hedging: Evid...,This article analyzes the importance of supply...,Erasmo Giambona & Ye Wang & Philip Strahan
989,https://ideas.repec.org/a/oup/rfinst/v35y2022i...,The Review of Financial Studies,2022,Consuming Dividends,This paper studies why investors buy dividend-...,Konstantin Bräuer & Andreas Hackethal & Tobin ...
990,https://ideas.repec.org/a/oup/rfinst/v31y2018i...,The Review of Financial Studies,2018,Does a Larger Menu Increase Appetite? Collater...,We examine a change in the European Central Ba...,Sjoerd Van Bekkum & Marc Gabarro & Rustom M. I...
991,https://ideas.repec.org/a/oup/rfinst/v31y2018i...,The Review of Financial Studies,2018,Open-End Organizational Structures and Limits ...,We provide evidence that open-end organization...,Mariassunta Giannetti & Bige Kahraman


In [3]:
# Example: Journal of Finance
df2 = scrape_repec_journal("https://ideas.repec.org/s/eee/jfinec.html", max_page=3)
df = pd.concat([df, df2])


🌐 Fetching: https://ideas.repec.org/s/eee/jfinec.html
🔗 Found 201 articles
🌐 Fetching: https://ideas.repec.org/s/eee/jfinec2.html
🔗 Found 200 articles
🌐 Fetching: https://ideas.repec.org/s/eee/jfinec3.html
🔗 Found 200 articles
📦 Total articles to parse: 596
🔄 (1/596): https://ideas.repec.org/a/eee/jfinec/v148y2023i1p47-68.html
🔄 (2/596): https://ideas.repec.org/a/eee/jfinec/v141y2021i3p881-895.html
🔄 (3/596): https://ideas.repec.org/a/eee/jfinec/v141y2021i2p620-643.html
🔄 (4/596): https://ideas.repec.org/a/eee/jfinec/v165y2025ics0304405x24002149.html
🔄 (5/596): https://ideas.repec.org/a/eee/jfinec/v141y2021i1p234-254.html
🔄 (6/596): https://ideas.repec.org/a/eee/jfinec/v140y2021i3p815-837.html
🔄 (7/596): https://ideas.repec.org/a/eee/jfinec/v145y2022i1p45-68.html
🔄 (8/596): https://ideas.repec.org/a/eee/jfinec/v140y2021i3p894-915.html
🔄 (9/596): https://ideas.repec.org/a/eee/jfinec/v150y2023i2s0304405x23001605.html
🔄 (10/596): https://ideas.repec.org/a/eee/jfinec/v145y2022i3p802-826.ht

In [4]:
df

Unnamed: 0,url,journal,year,title,abstract,authors
0,https://ideas.repec.org/a/bla/jfinan/v79y2024i...,Journal of Finance,2024,Measuring “Dark Matter” in Asset Pricing Models,We formalize the concept of “dark matter” in a...,Hui Chen & Winston Wei Dou & Leonid Kogan
1,https://ideas.repec.org/a/bla/jfinan/v80y2025i...,Journal of Finance,2025,"Bank Funding Risk, Reference Rates, and Credit...",Corporate credit lines are drawn more heavily ...,Harry Cooperman & Darrell Duffie & Stephan Luc...
2,https://ideas.repec.org/a/bla/jfinan/v77y2022i...,Journal of Finance,2022,Debt Refinancing and Equity Returns,This paper presents empirical evidence that th...,Nils Friewald & Florian Nagler & Christian Wagner
3,https://ideas.repec.org/a/bla/jfinan/v71y2016i...,Journal of Finance,2016,Ties That Bind: How Business Connections Affec...,We investigate whether business ties with port...,Dragana Cvijanović & Amil Dasgupta & Konstanti...
4,https://ideas.repec.org/a/bla/jfinan/v73y2018i...,Journal of Finance,2018,Unscheduled News and Market Dynamics,"When unscheduled news arrives, investors react...",Jérôme Dugast
...,...,...,...,...,...,...
591,https://ideas.repec.org/a/eee/jfinec/v167y2025...,Journal of Financial Economics,2025,Expected idiosyncratic volatility,We use close to 80 million daily returns for m...,"Bekaert, Geert & Bergbrant, Mikael & Kassa, Ha..."
592,https://ideas.repec.org/a/eee/jfinec/v3y1976i4...,Journal of Financial Economics,1976,"Theory of the firm: Managerial behavior, agenc...",No abstract is available for this item.,"Jensen, Michael C. & Meckling, William H."
593,https://ideas.repec.org/a/eee/jfinec/v143y2022...,Journal of Financial Economics,2022,On the fast track: Information acquisition cos...,Using the introduction of high-speed rail (HSR...,"Chen, Deqiu & Ma, Yujing & Martin, Xiumin & Mi..."
594,https://ideas.repec.org/a/eee/jfinec/v141y2021...,Journal of Financial Economics,2021,Does common ownership really increase firm coo...,A growing number of studies suggest that commo...,"Lewellen, Katharina & Lowry, Michelle"


In [None]:
    df = scrape_journal(max_page=2)
    print(f"\n✅ Done. Extracted {len(df)} articles.")
    df.to_csv("jfe_articles_up_to_page10.csv", index=False)


In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def parse_article_flexible(article_url, journal_name="Journal of Finance"):
    response = requests.get(article_url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Title
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Abstract
    abstract = None
    abstract_tag = soup.find("p", class_="abstract")
    if abstract_tag:
        abstract = abstract_tag.get_text(strip=True)
    else:
        # Fallback: look for <meta name="description">
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            abstract = meta_desc["content"].strip()

    # Authors
    # Authors
    authors = None
    authors_tag = soup.find("p", class_="authors")
    if authors_tag:
        authors = authors_tag.get_text(strip=True).replace("Authors:", "")
    else:
        # Try meta fallback
        author_tags = soup.find_all("meta", attrs={"name": "citation_author"})
        if author_tags:
            authors = "; ".join(tag["content"] for tag in author_tags if "content" in tag.attrs)
        else:
            # Look for 'p.title' with 'by XYZ'
            title_para = soup.find("p", class_="title")
            if title_para and "by " in title_para.text:
                authors = title_para.text.strip().replace("by ", "").strip()
            else:
                # Last resort: parse from "Suggested Citation"
                citation_block = soup.find("div", class_="note")
                if citation_block:
                    citation_text = citation_block.get_text(" ", strip=True)
                    parts = citation_text.split(".", 2)
                    if len(parts) >= 2:
                        authors = parts[0].strip()

    # Year
    year = None
    bibref_tag = soup.find("p", class_="bibref")
    if bibref_tag:
        for word in bibref_tag.text.split():
            if word.isdigit() and len(word) == 4:
                year = word
                break

    # Fallback: parse from URL
    if not year:
        import re
        match = re.search(r'y(\d{4})', article_url)
        if match:
            year = match.group(1)

    return {
        "journal": journal_name,
        "year": year,
        "title": title,
        "abstract": abstract,
        "authors": authors
    }


In [19]:
url = "https://ideas.repec.org/a/bla/jfinan/v52y1997i1p57-82.html"
article_data = parse_article_flexible(url)
df = pd.DataFrame([article_data])
print(df)


              journal  year                                      title  \
0  Journal of Finance  1997  On Persistence in Mutual Fund Performance   

                                            abstract authors  
0  Downloadable (with restrictions)!  Using a sam...    None  


In [27]:
print(df)


                                                   url             journal  \
0    https://ideas.repec.org/a/bla/jfinan/v52y1997i...  Journal of Finance   
1    https://ideas.repec.org/a/bla/jfinan/v25y1970i...  Journal of Finance   
2    https://ideas.repec.org/a/bla/jfinan/v80y2025i...  Journal of Finance   
3    https://ideas.repec.org/a/bla/jfinan/v80y2025i...  Journal of Finance   
4    https://ideas.repec.org/a/bla/jfinan/v80y2025i...  Journal of Finance   
..                                                 ...                 ...   
196  https://ideas.repec.org/a/bla/jfinan/v77y2022i...  Journal of Finance   
197  https://ideas.repec.org/a/bla/jfinan/v77y2022i...  Journal of Finance   
198  https://ideas.repec.org/a/bla/jfinan/v77y2022i...  Journal of Finance   
199  https://ideas.repec.org/a/bla/jfinan/v77y2022i...  Journal of Finance   
200  https://ideas.repec.org/a/bla/jfinan/v77y2022i...  Journal of Finance   

        year                                              title

In [14]:
df.to_csv('test.csv')