Extract working-paper data from RePEc IDEAS

In [1]:
# Install required packages
!pip install requests beautifulsoup4 pandas scrapy langchain openai tqdm

Collecting scrapy
  Downloading scrapy-2.13.2-py3-none-any.whl.metadata (4.4 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting itemadapter>=0.1.0 (from scrapy)
  Downloading itemadapter-0.11.0-py3-none-any.whl.metadata (18 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting protego>=0.1.15 (from scrapy)
  Downloading protego-0.5.0-py3-none-any.whl.metadata (6.4 kB)
Collecting pydispatcher>=2.0.5 (from scrapy)
  Downloading PyDispatcher-2.0.7-py3-none-any.whl.metadata (2.4 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.8.0-py3-none-any.whl.metadata (6.1 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting tldextract (from scrapy)
  Do

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin
import logging
from tqdm import tqdm
import random
import torch
import numpy as np

In [60]:
class RepecPaperScraper:
    """Scrape working-paper metadata for a RePEc series from an IDEAS-style site.

    The scraper walks the paginated listing of one series, follows each listed
    paper link, and collects title, authors, abstract, keywords, download URL,
    handle and suggested citation into ``self.papers_data`` (a list of dicts).

    Note that ``papers_data`` and ``processed_urls`` are instance state and are
    NOT reset by ``scrape_series``; calling it repeatedly on the same instance
    accumulates papers across calls.
    """

    # Mojibake sequences (UTF-8 text mis-decoded as a single-byte encoding)
    # mapped to their intended characters.
    _MOJIBAKE_REPLACEMENTS = {
        'â€“': '-', 'â€˜': "'", 'â€™': "'",
        'â€œ': '"', 'â€\x9d': '"', 'â€': '"', 'Ã©': 'é',
        'â€¢': '-', 'â€¦': '...'
    }
    # Longest keys first: the two-character 'â€' is a prefix of the longer
    # sequences and would otherwise consume them before they can match.
    _ORDERED_REPLACEMENTS = tuple(
        sorted(_MOJIBAKE_REPLACEMENTS.items(), key=lambda kv: len(kv[0]), reverse=True)
    )

    def __init__(self, base_url="https://ideas.repec.org", request_delay=(1, 3), timeout=10):
        """Create a scraper.

        Args:
            base_url: Root URL of the site; listing and paper URLs are built from it.
            request_delay: ``(min, max)`` seconds; a random pause in this range
                precedes every request.
            timeout: Per-request timeout in seconds passed to ``requests``.
        """
        self.base_url = base_url
        self.request_delay = request_delay
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'AcademicResearchBot/1.0 (+https://example.edu/research)',
            'Accept-Language': 'en-US,en;q=0.5',
        })
        self.papers_data = []          # dicts for every successfully parsed paper
        self.processed_urls = set()    # paper URLs already parsed (de-duplication)
        self.current_page = 1          # 1-based listing page currently being processed
        self.max_pages = None
        self.series_code = None

    def clean_text(self, text):
        """Repair common mojibake and collapse whitespace.

        Returns an empty string for falsy input (``None``, ``""``).
        """
        if not text:
            return ""
        for broken, fixed in self._ORDERED_REPLACEMENTS:
            text = text.replace(broken, fixed)
        text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
        return text.strip()

    def fetch_page(self, url):
        """Fetch ``url`` politely and return its HTML, or ``None`` on failure.

        Sleeps for a random interval in ``self.request_delay`` before the
        request. Returns ``None`` when the request fails, the status is an
        HTTP error, or the response is not ``text/html``.
        """
        try:
            time.sleep(random.uniform(*self.request_delay))  # Random delay
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            content_type = response.headers.get('Content-Type', '')
            if 'text/html' not in content_type:
                logging.warning(f"Skipping non-HTML response from {url}: {content_type!r}")
                return None
            return response.text
        except requests.RequestException as e:
            logging.error(f"Error fetching {url}: {e}")
            return None

    def get_next_page_url(self, current_url):
        """Return the URL of the listing page after ``self.current_page``.

        ``current_url`` must be the URL of page ``self.current_page``. Page 1
        has no numeric suffix (``.../wpaper.html``); later pages append the
        page number (``.../wpaper2.html``). Only the current page number is
        removed from the end of the URL, so series codes that themselves end
        in digits are left intact.

        Returns ``None`` when ``current_url`` is empty or does not end in
        ``.html``.
        """
        if not current_url or not current_url.endswith('.html'):
            return None

        stem = current_url.rsplit('.', 1)[0]
        page_suffix = str(self.current_page)
        if self.current_page > 1 and stem.endswith(page_suffix):
            stem = stem[:-len(page_suffix)]
        return f"{stem}{self.current_page + 1}.html"

    @staticmethod
    def _split_id_and_title(full_text):
        """Split ``"id:12345 Title"`` into ``("12345", "Title")``.

        Text without a leading ``id:<token>`` is returned as ``("", full_text)``.
        Colons inside the title are preserved.
        """
        match = re.match(r'id:\s*(\S+)\s*(.*)', full_text)
        if not match:
            return '', full_text
        return match.group(1), match.group(2).strip()

    def parse_paper_listing(self, html):
        """Parse one listing page into a list of paper stubs.

        Each stub is a dict with ``id``, ``title``, ``url``, ``authors`` and
        ``page_number``. Only ``li.list-group-item.downfree`` items containing
        a link whose href starts with ``/p/`` are considered. Items that fail
        to parse are logged and skipped.
        """
        soup = BeautifulSoup(html, 'html.parser')
        papers = []

        for item in soup.select('li.list-group-item.downfree'):
            if not item.text.strip():
                continue

            try:
                title_tag = item.find('a', href=re.compile(r'^/p/'))
                if not title_tag:
                    continue

                full_text = self.clean_text(title_tag.get_text())
                # Handle both formats: "id:12345 Title" and "Title"
                paper_id, paper_title = self._split_id_and_title(full_text)

                # Authors follow an <i>by</i> marker as "A & B & C".
                authors = []
                by_tag = item.find('i', string=re.compile(r'^\s*by\b', re.I))
                if by_tag and by_tag.next_sibling:
                    sibling = by_tag.next_sibling
                    # The sibling is normally a text node; fall back to the
                    # tag's text if the markup wraps the author list.
                    raw_authors = sibling if isinstance(sibling, str) else sibling.get_text()
                    authors_text = self.clean_text(raw_authors)
                    authors = [a.strip() for a in authors_text.split('&') if a.strip()]

                paper = {
                    'id': paper_id,
                    'title': paper_title,
                    'url': urljoin(self.base_url, title_tag['href']),
                    'authors': authors,
                    'page_number': self.current_page
                }

                if paper['title'] and paper['url']:
                    papers.append(paper)

            except Exception as e:
                # Best effort: one malformed item must not abort the page.
                logging.error(f"Error parsing item: {e}")

        return papers

    def parse_individual_paper(self, paper):
        """Fetch a paper's own page and enrich ``paper`` in place.

        Adds ``abstract``, ``download_url``, ``handle``, ``keywords`` and
        ``suggested_citation``; the title is replaced by the page's ``<h1>``
        when present.

        Returns the same (mutated) dict, or ``None`` when the stub has no URL,
        the URL was already processed, the page cannot be fetched, or parsing
        raises.
        """
        if not paper.get('url') or paper['url'] in self.processed_urls:
            return None

        html = self.fetch_page(paper['url'])
        if not html:
            return None

        try:
            soup = BeautifulSoup(html, 'html.parser')

            # The <h1> on the paper page is treated as the definitive title.
            title_tag = soup.find('h1')
            if title_tag:
                paper['title'] = self.clean_text(title_tag.get_text())

            abstract = ""
            abstract_div = soup.find('div', id='abstract-body')
            if abstract_div:
                abstract = self.clean_text(abstract_div.get_text())

            # The pre-selected (checked) radio input carries the download link.
            download_url = ""
            download_input = soup.find('input', {'type': 'radio', 'checked': True})
            if download_input:
                download_url = download_input.get('value', '')

            handle = ""
            handle_tag = soup.find('i', style='word-break:break-all')
            if handle_tag:
                handle = self.clean_text(handle_tag.get_text())

            # Keywords are the search links inside the "more" section.
            keywords = []
            more_div = soup.find('div', id='more')
            if more_div:
                keywords = [self.clean_text(kw.get_text()) for kw in
                            more_div.find_all('a', href=re.compile(r'htsearch2'))]

            citation = ""
            biblio_div = soup.find('div', id='biblio-body')
            if biblio_div:
                citation_tag = biblio_div.find('li', class_='list-group-item')
                if citation_tag:
                    citation = self.clean_text(citation_tag.get_text())

            paper.update({
                'abstract': abstract,
                'download_url': download_url,
                'handle': handle,
                'keywords': keywords,
                'suggested_citation': citation
            })

            # Mark as processed only after a successful parse.
            self.processed_urls.add(paper['url'])
            return paper

        except Exception as e:
            logging.error(f"Error parsing {paper['url']}: {e}")
            return None

    def scrape_series(self, series_code, max_papers=500, max_pages=None):
        """Scrape a series listing page by page until a limit is reached.

        Args:
            series_code: Series path such as ``"ess/wpaper"``.
            max_papers: Stop once ``self.papers_data`` holds this many papers
                (counts papers already collected by earlier calls).
            max_pages: Optional cap on the number of listing pages visited.

        Stops early when a listing page cannot be fetched or when two
        consecutive pages yield no papers. A checkpoint CSV is written every
        20 papers and once more at the end.

        Returns:
            The collected paper dicts that have a non-empty title.
        """
        self.series_code = series_code
        self.max_pages = max_pages
        self.current_page = 1
        current_url = f"{self.base_url}/s/{series_code}.html"
        consecutive_empty = 0

        with tqdm(total=max_papers, desc=f"Scraping {series_code}") as pbar:
            while current_url and len(self.papers_data) < max_papers:
                if self.max_pages and self.current_page > self.max_pages:
                    break

                html = self.fetch_page(current_url)
                if not html:
                    break

                # A page counts as empty if it says so, is blank, or parses to nothing.
                page_is_empty = "No items" in html or not html.strip()
                papers = [] if page_is_empty else self.parse_paper_listing(html)

                if papers:
                    consecutive_empty = 0
                    for paper in papers:
                        if len(self.papers_data) >= max_papers:
                            break

                        detailed = self.parse_individual_paper(paper)
                        if detailed:
                            self.papers_data.append(detailed)
                            pbar.update(1)

                            # Save checkpoint every 20 papers
                            if len(self.papers_data) % 20 == 0:
                                self._save_checkpoint()
                else:
                    consecutive_empty += 1
                    if consecutive_empty >= 2:
                        break

                # Compute the next URL before bumping the page counter:
                # get_next_page_url expects current_page to match current_url.
                current_url = self.get_next_page_url(current_url)
                self.current_page += 1

        self._save_checkpoint()
        return [p for p in self.papers_data if p.get('title')]

    def _save_checkpoint(self):
        """Write everything collected so far to a per-series checkpoint CSV.

        Failures are logged, never raised, so a bad write cannot abort a scrape.
        """
        try:
            df = pd.DataFrame(self.papers_data)
            # Ensure all expected columns exist
            for col in ['id', 'title', 'authors', 'abstract', 'keywords',
                        'url', 'download_url', 'handle', 'suggested_citation', 'page_number']:
                if col not in df.columns:
                    df[col] = ""

            # series_code is None until scrape_series has been called.
            series_label = (self.series_code or "unknown").replace("/", "_")
            df.to_csv(f'repec_checkpoint_{series_label}.csv',
                      index=False, encoding='utf-8-sig')
            logging.info(f"Checkpoint saved with {len(df)} papers")
        except Exception as e:
            logging.error(f"Checkpoint save failed: {e}")

    def save_to_csv(self, filename):
        """Write the cleaned, flattened papers to ``filename``.

        Papers lacking a title or URL are dropped; list fields (authors,
        keywords) are joined with ``", "``. Nothing is written when no valid
        paper remains.
        """
        clean_data = []
        for paper in self.papers_data:
            # Skip papers without title or URL
            if not paper.get('title') or not paper.get('url'):
                continue

            clean_data.append({
                'id': self.clean_text(paper.get('id', '')),
                'title': self.clean_text(paper.get('title', '')),
                'authors': ', '.join([self.clean_text(a) for a in paper.get('authors', [])]),
                'abstract': self.clean_text(paper.get('abstract', '')),
                'keywords': ', '.join([self.clean_text(kw) for kw in paper.get('keywords', [])]),
                'url': paper.get('url', ''),
                'download_url': paper.get('download_url', ''),
                'handle': self.clean_text(paper.get('handle', '')),
                'suggested_citation': self.clean_text(paper.get('suggested_citation', '')),
                'page_number': paper.get('page_number', 0)
            })

        if not clean_data:
            logging.warning("No valid papers to save")
            return

        df = pd.DataFrame(clean_data)

        # Ensure consistent column order
        cols = ['id', 'title', 'authors', 'abstract', 'keywords',
                'url', 'download_url', 'handle', 'suggested_citation', 'page_number']
        df = df[cols]

        # utf-8-sig writes a BOM so spreadsheet tools detect the encoding.
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        logging.info(f"Saved {len(df)} papers to {filename}")

if __name__ == "__main__":
    # Scrape one series end-to-end, then write the cleaned result to CSV.
    scraper = RepecPaperScraper()
    series_code = "ess/wpaper"
    # Alternative series: series_code = "iim/iimawp"

    # `papers` holds the list returned by scrape_series; the CSV below is
    # written from the scraper's own papers_data, not from this variable.
    papers = scraper.scrape_series(series_code, max_papers=600)
    # Alternative run (no paper cap, first 4 listing pages only):
    # papers = scraper.scrape_series(series_code, max_papers=float('inf'), max_pages=4)
    scraper.save_to_csv("repec_papers_scrap1.csv")

Scraping ess/wpaper: 100%|██████████| 600/600 [20:52<00:00,  2.09s/it]


In [4]:
# Load the scraper's final CSV output (the file written by save_to_csv,
# not the periodic repec_checkpoint_*.csv files) and preview the first rows.
try:
    df_checkpoint = pd.read_csv('repec_papers_scrap1.csv')

    # Temporarily lift pandas' display limits so the preview is not truncated or wrapped.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.expand_frame_repr', False):
        print(df_checkpoint.head())

except FileNotFoundError:
    # NOTE: df_checkpoint is left undefined on this path, so later cells that use it will fail.
    print("Checkpoint file 'repec_papers_scrap1.csv' not found. Run the scraper first.")
except Exception as e:
    print(f"An error occurred while loading or displaying the data: {e}")

   id                                              title                           authors                                           abstract                                           keywords                                                url                                       download_url                     handle                                 suggested_citation  page_number
0 NaN  3rd Urban Economy Forum 2021:The Brampton Reso...  Urban Economy Forum 2021 UEF2021  The 3rd Urban Economy Forum is one of the worl...  urban economy, UEF 2021, urban development, ur...  https://ideas.repec.org/p/ess/wpaper/id13172.html  http://www.esocialsciences.org/Download/repecD...  RePEc:ess:wpaper:id:13172  Urban Economy Forum 2021 UEF2021, 2022. "3rd U...            1
1 NaN  The Humanities of Crisis: Climate Change and t...                   Pramod K. Nayar  From scientific upheavals in the Early Modern ...  humanities, crisis, climate change, COP26, All...  https://ideas.repec.org/p/ess/wpap

In [5]:
# `df` is an alias of df_checkpoint (same object, not a copy).
# head(600) requests up to the first 600 rows for display.
df = df_checkpoint
df.head(600)

Unnamed: 0,id,title,authors,abstract,keywords,url,download_url,handle,suggested_citation,page_number
0,,3rd Urban Economy Forum 2021:The Brampton Reso...,Urban Economy Forum 2021 UEF2021,The 3rd Urban Economy Forum is one of the worl...,"urban economy, UEF 2021, urban development, ur...",https://ideas.repec.org/p/ess/wpaper/id13172.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13172,"Urban Economy Forum 2021 UEF2021, 2022. ""3rd U...",1
1,,The Humanities of Crisis: Climate Change and t...,Pramod K. Nayar,From scientific upheavals in the Early Modern ...,"humanities, crisis, climate change, COP26, All...",https://ideas.repec.org/p/ess/wpaper/id13166.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13166,"Pramod K. Nayar, 2021. ""The Humanities of Cris...",1
2,,Climate finance for Cities and Urban Governments,S Ananthakrishnan,Ambitious actions taken to reduce urban emissi...,"COP26, climate change, finance for climate cha...",https://ideas.repec.org/p/ess/wpaper/id13165.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13165,"S Ananthakrishnan, 2021. ""Climate finance for ...",1
3,,Know your Publishing Space: Preprints: Boon or...,Shubhada Nagarkar,"This essay addresses preprints, their advantag...","preprints, publishing space, India, All these ...",https://ideas.repec.org/p/ess/wpaper/id13163.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13163,"Shubhada Nagarkar, 2021. ""Know your Publishing...",1
4,,eSSay:The Managerial University and Liberal Ar...,Pramod K. Nayar,The managerialism that marks the HEIs has alte...,"managerial university, New Educational Policy,...",https://ideas.repec.org/p/ess/wpaper/id13161.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13161,"Pramod K. Nayar, 2021. ""eSSay:The Managerial U...",1
...,...,...,...,...,...,...,...,...,...,...
95,,UDAY Power Debt in Retrospect and Prospects: A...,"Amandeep Kaur, Lekha Chakraborty",The Government of India launched the Ujwal DIS...,"eSS, Power infrastructure, Power Debt, Bonds, ...",https://ideas.repec.org/p/ess/wpaper/id12968.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12968,"Amandeep Kaur & Lekha Chakraborty, 2019. ""UDAY...",1
96,,Analyzing the Dynamic Relationship between Phy...,"Ranjan Kumar Mohanty, N. R. Bhanumurthy",The paper investigates dynamic relationship be...,"eSS, financial sector, Infrastructure Index, F...",https://ideas.repec.org/p/ess/wpaper/id12967.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12967,"Ranjan Kumar Mohanty & N. R. Bhanumurthy, 2019...",1
97,,"Disclosures in Privacy Policies: Does ""Notice ...","Rishab Bailey, Smriti Parsheera, Faiza Rahman,...",This paper evaluates the quality of privacy po...,"eSS, disclosures, privacy policy, notice, cons...",https://ideas.repec.org/p/ess/wpaper/id12966.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12966,Rishab Bailey & Smriti Parsheera & Faiza Rahma...,1
98,,Value Destruction and Wealth Transfer Under th...,Pratik Datta,This article applies theoretical concepts from...,"eSS, value destruction, insolvency, bankruptcy...",https://ideas.repec.org/p/ess/wpaper/id12965.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12965,"Pratik Datta, 2019. ""Value Destruction and Wea...",1


LLM for filtering economics-relevant papers

zero-shot classification model

In [16]:
import os
from transformers import pipeline

# -- Configuration --
# Both paths are relative to the current working directory.
# The input name matches the file the scraper cell writes via save_to_csv.
ALL_PAPERS_INPUT_FILE = "repec_papers_scrap1.csv"  # CSV input file
FILTERED_OUTPUT_FILE = "repec_papers_economics_filtered.csv"  # CSV output file

def _paper_text_for_classification(row):
    """Build the "<title>. <abstract>" text for one paper, treating missing values as empty."""
    fields = []
    for name in ('title', 'abstract'):
        value = row.get(name, '')
        # Empty CSV cells are read back as NaN; str(NaN) would inject the word "nan".
        fields.append('' if pd.isna(value) else str(value))
    return f"{fields[0]}. {fields[1]}"


def filter_papers_for_economics_colab(input_file=None, output_file=None, confidence_threshold=0.80):
    """
    Reads a CSV file and uses a zero-shot classification model
    to filter for papers related to economics.

    Args:
        input_file: CSV of scraped papers; defaults to ALL_PAPERS_INPUT_FILE.
        output_file: Destination CSV for kept rows; defaults to FILTERED_OUTPUT_FILE.
        confidence_threshold: Minimum score the "economics" label must reach
            for a paper to be kept.

    Returns:
        None. Progress is printed; kept rows are written to ``output_file``
        only when at least one paper passes the filter.
    """
    # Resolve defaults at call time so later edits to the module constants are honoured.
    if input_file is None:
        input_file = ALL_PAPERS_INPUT_FILE
    if output_file is None:
        output_file = FILTERED_OUTPUT_FILE

    print("\n--- FILTERING PAPERS FOR ECONOMIC RELEVANCE ---")

    if not os.path.exists(input_file):
        print(f"Error: Input CSV file not found at '{input_file}'.")
        return

    print(f"Loading data from {input_file}...")
    df = pd.read_csv(input_file)

    if df.empty:
        print("No papers found in the input file to filter.")
        return

    print(f"Loaded {len(df)} papers to filter.")
    print("Initializing zero-shot classification model...")

    try:
        classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        print("Model initialized successfully.")
    except Exception as e:
        print("Failed to initialize the model. Please ensure you have an internet connection.")
        print(f"Error: {e}")
        return

    candidate_labels = ["economics", "not economics"]
    print(f"Filtering papers with a confidence threshold of {confidence_threshold*100}%...")

    filtered_rows = []
    total = len(df)

    # Count by position, not index label, so the progress text is right for any index.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        text_to_classify = _paper_text_for_classification(row)

        # Too little text to classify meaningfully.
        if len(text_to_classify.strip()) < 20:
            continue

        try:
            result = classifier(text_to_classify, candidate_labels, multi_label=False)
            # The pipeline returns labels sorted by descending score.
            top_label = result['labels'][0]
            top_score = result['scores'][0]

            raw_title = row.get('title')
            shown_title = 'No Title' if raw_title is None or pd.isna(raw_title) else str(raw_title)
            print(f"  ({position}/{total}) Classifying '{shown_title[:60]}...' -> {top_label} ({top_score:.2f})")

            if top_label == "economics" and top_score >= confidence_threshold:
                filtered_rows.append(row)

        except Exception as e:
            # Best effort: one failing paper must not abort the whole run.
            print(f"Could not classify paper: {e}")
            continue

    print(f"\nFiltering complete. Found {len(filtered_rows)} papers relevant to economics.")

    if filtered_rows:
        pd.DataFrame(filtered_rows).to_csv(output_file, index=False)
        print(f"\n--- Filtered data saved to '{output_file}' ---")
    else:
        print("No papers met the filtering criteria to be saved.")

# -- Main execution block --
# Runs the filter when this code is executed directly (not when imported as a module).
if __name__ == "__main__":
    filter_papers_for_economics_colab()


--- FILTERING PAPERS FOR ECONOMIC RELEVANCE ---
Loading data from repec_papers_scrap1.csv...
Loaded 100 papers to filter.
Initializing zero-shot classification model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


Model initialized successfully.
Filtering papers with a confidence threshold of 80.0%...
  (1/100) Classifying '3rd Urban Economy Forum 2021:The Brampton Resolution...' -> economics (0.79)
  (2/100) Classifying 'The Humanities of Crisis: Climate Change and the Discipline...' -> not economics (0.63)
  (3/100) Classifying 'Climate finance for Cities and Urban Governments...' -> economics (0.69)
  (4/100) Classifying 'Know your Publishing Space: Preprints: Boon or Bane?...' -> not economics (0.55)
  (5/100) Classifying 'eSSay:The Managerial University and Liberal Arts' Balancing ...' -> not economics (0.67)
  (6/100) Classifying 'Technology, Globalisation and Multinationals The Asian Exper...' -> economics (0.57)
  (7/100) Classifying 'Know Your Publishing Space: Predatory Journals: Publish and ...' -> not economics (0.68)
  (8/100) Classifying 'PAISA for Municipalities: A Study of Fund Flows and Public E...' -> economics (0.80)
  (9/100) Classifying 'Creating Udyog Sahayak Enterprises Ne

In [18]:
# Read the filter output back from the same relative path it was written to,
# rather than a hard-coded absolute '/content/...' path that only exists on Colab.
df3 = pd.read_csv('repec_papers_economics_filtered.csv')
df3

Unnamed: 0,id,title,authors,abstract,keywords,url,download_url,handle,suggested_citation,page_number
0,,Creating Udyog Sahayak Enterprises Network (US...,"T Muralidharan, G.D Bino Paul, Amit Basole",There are multiple forces - thirteen of them i...,"employment, youth, MSME, micro and small enter...",https://ideas.repec.org/p/ess/wpaper/id13136.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13136,"T Muralidharan & G.D Bino Paul & Amit Basole, ...",1
1,,IMF and Big Business for Universal Vaccination...,Prabir Purkayastha,The me-first policy on vaccine sharing will br...,"IMF, vaccine, pharmaceutical industry, COVID, ...",https://ideas.repec.org/p/ess/wpaper/id13135.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13135,"Prabir Purkayastha, 2021. ""IMF and Big Busines...",1
2,,Saving Lives and Livelihoods 01 Amidst a Once-...,Ministry of Finance,"During the unlock phase, demand-side measures ...","pandemic, Covid-19, Kerala, Maharashtra, GDP, ...",https://ideas.repec.org/p/ess/wpaper/id13133.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13133,"Ministry of Finance, 2021. ""Saving Lives and L...",1
3,,Budget Speech of Kerala Finance Minister 2021-22,T.M. Thomas Issac,Kerala Budget presented by the Finance Minister.,"Budget, Kerala, COVID, All these keywords",https://ideas.repec.org/p/ess/wpaper/id13129.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13129,"T.M. Thomas Issac, 2021. ""Budget Speech of Ker...",1
4,,Is India Creating Adequate Jobs Post 2000: Tre...,"G.D Bino Paul, Muralidharan T",This paper looks into diverse databases to gau...,"employment, employment elasticity, jobs, manpo...",https://ideas.repec.org/p/ess/wpaper/id13093.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13093,"G.D Bino Paul & Muralidharan T, 2020. ""Is Indi...",1
...,...,...,...,...,...,...,...,...,...,...
57,,Exporting and Firm Performance: Evidence from ...,"Apoorva Gupta, Ila Patnaik, Ajay Shah",In this paper the positive correlation between...,"eSS, export, firms, firm productivity, export ...",https://ideas.repec.org/p/ess/wpaper/id12969.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12969,"Apoorva Gupta & Ila Patnaik & Ajay Shah, 2019....",1
58,,UDAY Power Debt in Retrospect and Prospects: A...,"Amandeep Kaur, Lekha Chakraborty",The Government of India launched the Ujwal DIS...,"eSS, Power infrastructure, Power Debt, Bonds, ...",https://ideas.repec.org/p/ess/wpaper/id12968.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12968,"Amandeep Kaur & Lekha Chakraborty, 2019. ""UDAY...",1
59,,Analyzing the Dynamic Relationship between Phy...,"Ranjan Kumar Mohanty, N. R. Bhanumurthy",The paper investigates dynamic relationship be...,"eSS, financial sector, Infrastructure Index, F...",https://ideas.repec.org/p/ess/wpaper/id12967.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12967,"Ranjan Kumar Mohanty & N. R. Bhanumurthy, 2019...",1
60,,Value Destruction and Wealth Transfer Under th...,Pratik Datta,This article applies theoretical concepts from...,"eSS, value destruction, insolvency, bankruptcy...",https://ideas.repec.org/p/ess/wpaper/id12965.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12965,"Pratik Datta, 2019. ""Value Destruction and Wea...",1


sentence embedding + cosine similarity approach

In [6]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [37]:
# Load embedding model (fast, CPU-friendly)
# Bound to a dedicated name: the annexure cell later rebinds the global `model`
# to a DistilBERT classifier, which has no .encode() and would break
# is_economics_related if it kept reading `model`.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
model = embedding_model  # backward-compatible alias for code that still reads `model`

# Define economics reference vector
econ_keywords = [
    "economics", "econometric", "microeconomics", "macroeconomics",
    "trade", "GDP", "GNP", "inflation", "deflation", "unemployment",
    "monetary policy", "fiscal policy", "central bank", "interest rate",
    "exchange rate", "trade balance", "productivity", "labor market",
    "supply and demand", "elasticity", "market structure", "game theory",
    "welfare economics", "public economics", "development economics",
    "international trade", "financial economics", "behavioral economics",
    "industrial organization", "economy", "economic growth",
    "recession", "depression", "stimulus", "austerity", "quantitative easing",
    "consumer price index", "purchasing power", "human capital"]
# Encode all keywords in one batched call, then keep a list of per-keyword
# vectors (same type as before) and average them into one reference vector.
econ_vectors = list(embedding_model.encode(econ_keywords, convert_to_tensor=True))
econ_vector = torch.mean(torch.stack(econ_vectors), dim=0)

def is_economics_related(title, abstract, threshold=0.30):
    """Return True when "<title>. <abstract>" is close enough to the economics reference vector.

    Args:
        title: Paper title text.
        abstract: Paper abstract text.
        threshold: Minimum cosine similarity to count as economics-related.
    """
    text = f"{title}. {abstract}"
    text_vector = embedding_model.encode(text, convert_to_tensor=True)
    similarity = util.cos_sim(text_vector, econ_vector).item()
    return similarity >= threshold

In [38]:
df = pd.read_csv("repec_papers_scrap1.csv")

# Empty CSV cells are read back as NaN; blank them so the literal text "nan"
# is never fed to the embedding model.
titles = df['title'].fillna('').astype(str)
abstracts = df['abstract'].fillna('').astype(str)

# Build the flag column in one assignment instead of row-by-row .loc writes.
df['Is_Economics'] = pd.Series(
    [
        is_economics_related(title, abstract)
        for title, abstract in tqdm(zip(titles, abstracts), total=len(df))
    ],
    index=df.index,
    dtype=bool,
)

# Full table with the flag, plus the subset flagged as economics.
df.to_csv("econ_flag_fast.csv", index=False)
df[df['Is_Economics']].to_csv("econ_filtered_fast.csv", index=False)

100%|██████████| 100/100 [00:10<00:00,  9.76it/s]


In [49]:
# Read the flagged table back from the same relative path it was written to,
# rather than a hard-coded absolute '/content/...' path that only exists on Colab.
# To inspect only the economics subset, read 'econ_filtered_fast.csv' instead.
df2 = pd.read_csv('econ_flag_fast.csv')

df2

Unnamed: 0,id,title,authors,abstract,keywords,url,download_url,handle,suggested_citation,page_number,Is_Economics
0,,3rd Urban Economy Forum 2021:The Brampton Reso...,Urban Economy Forum 2021 UEF2021,The 3rd Urban Economy Forum is one of the worl...,"urban economy, UEF 2021, urban development, ur...",https://ideas.repec.org/p/ess/wpaper/id13172.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13172,"Urban Economy Forum 2021 UEF2021, 2022. ""3rd U...",1,False
1,,The Humanities of Crisis: Climate Change and t...,Pramod K. Nayar,From scientific upheavals in the Early Modern ...,"humanities, crisis, climate change, COP26, All...",https://ideas.repec.org/p/ess/wpaper/id13166.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13166,"Pramod K. Nayar, 2021. ""The Humanities of Cris...",1,False
2,,Climate finance for Cities and Urban Governments,S Ananthakrishnan,Ambitious actions taken to reduce urban emissi...,"COP26, climate change, finance for climate cha...",https://ideas.repec.org/p/ess/wpaper/id13165.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13165,"S Ananthakrishnan, 2021. ""Climate finance for ...",1,False
3,,Know your Publishing Space: Preprints: Boon or...,Shubhada Nagarkar,"This essay addresses preprints, their advantag...","preprints, publishing space, India, All these ...",https://ideas.repec.org/p/ess/wpaper/id13163.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13163,"Shubhada Nagarkar, 2021. ""Know your Publishing...",1,False
4,,eSSay:The Managerial University and Liberal Ar...,Pramod K. Nayar,The managerialism that marks the HEIs has alte...,"managerial university, New Educational Policy,...",https://ideas.repec.org/p/ess/wpaper/id13161.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:13161,"Pramod K. Nayar, 2021. ""eSSay:The Managerial U...",1,False
...,...,...,...,...,...,...,...,...,...,...,...
95,,UDAY Power Debt in Retrospect and Prospects: A...,"Amandeep Kaur, Lekha Chakraborty",The Government of India launched the Ujwal DIS...,"eSS, Power infrastructure, Power Debt, Bonds, ...",https://ideas.repec.org/p/ess/wpaper/id12968.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12968,"Amandeep Kaur & Lekha Chakraborty, 2019. ""UDAY...",1,False
96,,Analyzing the Dynamic Relationship between Phy...,"Ranjan Kumar Mohanty, N. R. Bhanumurthy",The paper investigates dynamic relationship be...,"eSS, financial sector, Infrastructure Index, F...",https://ideas.repec.org/p/ess/wpaper/id12967.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12967,"Ranjan Kumar Mohanty & N. R. Bhanumurthy, 2019...",1,True
97,,"Disclosures in Privacy Policies: Does ""Notice ...","Rishab Bailey, Smriti Parsheera, Faiza Rahman,...",This paper evaluates the quality of privacy po...,"eSS, disclosures, privacy policy, notice, cons...",https://ideas.repec.org/p/ess/wpaper/id12966.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12966,Rishab Bailey & Smriti Parsheera & Faiza Rahma...,1,False
98,,Value Destruction and Wealth Transfer Under th...,Pratik Datta,This article applies theoretical concepts from...,"eSS, value destruction, insolvency, bankruptcy...",https://ideas.repec.org/p/ess/wpaper/id12965.html,http://www.esocialsciences.org/Download/repecD...,RePEc:ess:wpaper:id:12965,"Pratik Datta, 2019. ""Value Destruction and Wea...",1,True


Annexure: Trying fine-tuning with labeled data

In [52]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.nn.functional import softmax

# Keyword list consumed by keyword_score() for the keyword half of the hybrid classifier.
ECON_KEYWORDS = [
    'economics', 'macroeconomics', 'microeconomics', 'fiscal policy', 'monetary policy',
    'GDP', 'inflation', 'unemployment', 'trade', 'investment', 'finance', 'poverty',
    'development', 'wage', 'budget', 'demand', 'supply', 'market', 'tax', 'inequality'
]

# Load Pretrained Model
# NOTE(review): this loads the base "distilbert-base-uncased" checkpoint with a
# 2-label classification head. The notebook's recorded output warns that the
# classifier / pre_classifier weights are newly initialized, so the model's
# confidences come from an untrained head unless it is fine-tuned first.
# NOTE(review): this rebinds the global name `model`, which the sentence-embedding
# cell earlier in the notebook also assigns; code that reads `model` expecting the
# SentenceTransformer will see this DistilBERT model after this cell runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
).to(device)
model.eval()  # switch to evaluation (inference) mode

# Helper Functions

def keyword_score(text, keywords=None):
    """Count how many keyword entries occur in ``text``, ignoring case.

    Args:
        text: Text to scan. ``None`` or an empty string yields a score of 0.
        keywords: Optional iterable of keywords/phrases to look for. When
            omitted, the module-level ``ECON_KEYWORDS`` list is used.

    Returns:
        int: Number of keyword entries found as substrings of the text.
    """
    if keywords is None:
        keywords = ECON_KEYWORDS
    haystack = str(text or "").lower()
    # The text is lowercased, so each keyword must be lowercased as well;
    # otherwise a mixed-case entry such as 'GDP' can never match.
    return sum(kw.lower() in haystack for kw in keywords)

def predict_label(text, model, tokenizer, threshold=0.5):
    """Score a single text with a two-class sequence classifier.

    Args:
        text: The text to classify.
        model: Sequence-classification model whose output exposes ``.logits``
            with class index 1 meaning "economic" and 0 "non-economic".
        tokenizer: Callable tokenizer returning a mapping of tensors.
        threshold: Confidence above which the text is labelled economic.

    Returns:
        tuple: ``(econ_confidence, is_economic)`` - the class-1 probability as
        a float, followed by the boolean ``econ_confidence > threshold``.
    """
    # Send inputs to wherever the supplied model actually lives instead of
    # relying on a module-level `device` that may not match this model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    with torch.no_grad():  # inference only; no gradients needed
        logits = model(**encoded).logits
        probs = softmax(logits, dim=1)

    econ_confidence = probs[0][1].item()  # 1 = economic, 0 = non-economic
    return econ_confidence, econ_confidence > threshold

def classify_paper(row, keyword_threshold=2, model_threshold=0.5):
    """Classify one paper from its title, abstract and keywords.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that may provide
            'title', 'abstract' and 'keywords'.
        keyword_threshold: Minimum keyword matches for the keyword rule to pass.
        model_threshold: Minimum model confidence for the model rule to pass.

    Returns:
        pd.Series: ``keyword_score``, ``model_confidence`` (rounded to 4
        decimals) and ``is_economic`` (True if either rule passes).
    """
    def _field(name):
        """Return the field as text, mapping missing/NaN values to ''."""
        value = row.get(name, '')
        if isinstance(value, str):
            return value
        # Empty CSV cells are read as NaN; formatting them directly would
        # inject the literal word "nan" into the text given to the classifier.
        if value is None or pd.isna(value):
            return ''
        return str(value)

    text = f"{_field('title')}. {_field('abstract')}. {_field('keywords')}"
    kw_score = keyword_score(text)
    model_conf, model_pred = predict_label(text, model, tokenizer, threshold=model_threshold)

    # Hybrid logic: accept if keyword or model passes threshold
    final_label = (kw_score >= keyword_threshold) or model_pred
    return pd.Series({
        "keyword_score": kw_score,
        "model_confidence": round(model_conf, 4),
        "is_economic": final_label
    })

# Read the scraped papers; the classifier reads title, abstract and keywords.
df = pd.read_csv("repec_papers_scrap1.csv")

# Score every row, then attach the three prediction columns to the frame.
classified = df.apply(classify_paper, axis=1)
df = pd.concat([df, classified], axis=1)

# Persist the full table and the subset flagged as economic.
economic_mask = df["is_economic"]
df.to_csv("all_papers_with_predictions.csv", index=False)
df[economic_mask].to_csv("economic_papers_only.csv", index=False)

# Short run summary.
print(f"""
 Classification Complete:
- Total papers processed: {len(df)}
- Economic papers found: {df['is_economic'].sum()}
- Saved to: all_papers_with_predictions.csv
- Filtered to: economic_papers_only.csv
""")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Classification Complete:
- Total papers processed: 100
- Economic papers found: 45
- Saved to: all_papers_with_predictions.csv
- Filtered to: economic_papers_only.csv



In [None]:
from transformers import (
    DistilBertTokenizer, DistilBertForSequenceClassification,
    Trainer, TrainingArguments
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.nn.functional import softmax

# Economics vocabulary (terms and phrases).
ECON_KEYWORDS = [
    'economics', 'macroeconomics', 'microeconomics', 'fiscal policy',
    'monetary policy', 'GDP', 'inflation', 'unemployment',
    'trade', 'investment', 'finance', 'poverty',
    'development', 'wage', 'budget', 'demand',
    'supply', 'market', 'tax', 'inequality',
]

# Use the GPU when one is visible, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Start from the base DistilBERT weights with a fresh two-class head.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)
model = model.to(device)

# Load labeled data
df = pd.read_csv("labeled_data.csv")  # expects "text" and "label" columns (label: 0 or 1)

# Fail early with a clear message instead of an opaque tokenizer/Trainer error.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"labeled_data.csv is missing required column(s): {sorted(missing_columns)}"
    )

# Rows without text or label cannot be trained on. A NaN in "label" also makes
# pandas read the column as float, and non-integer labels are not valid class
# indices for a two-class head - so drop incomplete rows and cast to int.
df = df.dropna(subset=["text", "label"])
df["text"] = df["text"].astype(str)
df["label"] = df["label"].astype(int)

# Split for training and eval (80/20, fixed seed for a repeatable split)
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenize
def tokenize(batch):
    """Tokenize ``batch['text']`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    texts = batch['text']
    encoding_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(texts, **encoding_options)

import os

# Keep Weights & Biases logging offline. Set this before the Trainer is
# created so it is in place whenever the logging integration is initialised.
os.environ["WANDB_MODE"] = "offline"


def _prepare_split(frame):
    """Tokenize one DataFrame split and expose only the tensors the Trainer needs."""
    dataset = (
        Dataset.from_pandas(frame)
        .map(tokenize, batched=True)
        .rename_column("label", "labels")
    )
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset


train_dataset = _prepare_split(train_df)
eval_dataset = _prepare_split(eval_df)

# Fine-tuning config
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_steps=5
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train
trainer.train()

# Score the held-out split: it was built and passed to the Trainer, but
# without an explicit evaluate() call it is never used.
eval_metrics = trainer.evaluate()
print(f"Held-out evaluation metrics: {eval_metrics}")

# save_model() takes an output directory, so despite the ".pt" suffix this
# creates a folder holding the model config and weights.
trainer.save_model("fine_tuned_model.pt")

print("\n Fine-tuning complete. Saved as 'fine_tuned_model.pt'.")