## Crawling

In [1]:
import requests
import threading
import time
import json
import os

# Scraper configuration
# Search queries, one per research topic.
RESEARCH_TOPICS = [
    "Foundation Models",
    "Generative Models",
    "LLM",
    "VLM",
    "Diffusion Models",
]
# Directory that receives one JSON file per (topic, year range).
OUTPUT_DIR = "scraped_data"
# Inclusive (start year, end year) pairs; the older range comes first.
YEAR_RANGES = [
    (2017, 2021),
    (2022, 2024),
]
# Target number of papers per topic, summed over both year ranges.
TOTAL_PAPERS_PER_TOPIC = 2000

# Create the output directory up front; a pre-existing directory is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Helper: reduce one raw API record to the fields that get stored
def _summarize_paper(paper):
    """Return the stored summary of one raw search-result record.

    ``citations`` and ``references`` are reduced to counts and ``authors``
    to a list of names. ``value or []`` is used instead of a ``.get``
    default because a key that is present with a null value bypasses the
    default and would make ``len()`` / iteration raise ``TypeError``.
    """
    return {
        "title": paper.get("title", "No Title"),
        "abstract": paper.get("abstract", "No Abstract"),
        "authors": [author.get("name", "Unknown") for author in paper.get("authors") or []],
        "citations": len(paper.get("citations") or []),
        "references": len(paper.get("references") or []),
        "year": paper.get("year", "Unknown"),
    }


# Function to fetch papers from Semantic Scholar API
def fetch_papers(topic, start_year, end_year, max_papers):
    """Page through the Semantic Scholar paper search for one topic.

    Args:
        topic: Search query string.
        start_year: First publication year of the requested range.
        end_year: Last publication year of the requested range.
        max_papers: Upper bound on the number of papers returned; a value
            of zero or less returns an empty list without any request.

    Returns:
        A list of at most ``max_papers`` dicts with the keys ``title``,
        ``abstract``, ``authors``, ``citations``, ``references`` and
        ``year``. The list is shorter when the API returns an empty page
        or an unexpected HTTP status first.

    HTTP 429 and 504 responses and network errors are retried on the same
    page after a pause, with no retry limit.
    """
    api_url = "https://api.semanticscholar.org/graph/v1/paper/search"
    page_size = 100
    # Without a timeout a stalled connection would block this thread (and the
    # caller's join()) indefinitely; a timeout surfaces as RequestException.
    request_timeout = 30  # seconds
    papers = []
    offset = 0

    while len(papers) < max_papers:
        if offset % 200 == 0:
            print(f"Fetched {offset}/{max_papers} papers for '{topic}' ({start_year}-{end_year})")

        params = {
            "query": topic,
            "fields": "title,abstract,authors,references,citations,year",
            "offset": offset,
            "limit": page_size,
            "year": f"{start_year}-{end_year}",
            "sort": "relevance"
        }

        try:
            response = requests.get(api_url, params=params, timeout=request_timeout)

            if response.status_code == 429:
                print("Rate limit reached. Pausing for 10 seconds...")
                time.sleep(10)
                continue

            if response.status_code == 504:
                print("504 Gateway Timeout. Retrying...")
                time.sleep(5)
                continue

            if response.status_code != 200:
                # Report other failures explicitly instead of letting an error
                # body without a "data" key look like the end of the results.
                print(f"Unexpected HTTP status {response.status_code} for '{topic}' ({start_year}-{end_year}). Stopping.")
                break

            # `or []` also covers a "data" key that is present but null.
            data = response.json().get("data") or []

        except (requests.RequestException, ValueError) as e:
            # ValueError: depending on the installed requests version, a body
            # that is not valid JSON may surface as a plain ValueError.
            print(f"Network error: {e}. Retrying in 10 seconds...")
            time.sleep(10)
            continue

        if not data:
            print(f"No more papers found for '{topic}' ({start_year}-{end_year}).")
            break

        # Take only as many records as are still needed to reach max_papers.
        remaining = max_papers - len(papers)
        papers.extend(_summarize_paper(paper) for paper in data[:remaining])

        offset += page_size
        time.sleep(5)  # pause between pages to stay under the rate limit

    return papers

# Function to save data as JSON
def save_to_json(topic, year_range, papers):
    """Dump `papers` to a JSON file inside OUTPUT_DIR.

    The file is named after the topic (spaces turned into underscores)
    followed by `_` and the year range. Nothing is written when `papers`
    is empty; a notice is printed instead.
    """
    if papers:
        topic_slug = "_".join(topic.split(" "))
        target_path = os.path.join(OUTPUT_DIR, f"{topic_slug}_{year_range}.json")

        with open(target_path, "w", encoding="utf-8") as handle:
            json.dump(papers, handle, ensure_ascii=False, indent=4)

        print(f"Saved {len(papers)} papers to {target_path}")
    else:
        print(f"No data to save for '{topic}' ({year_range})")

# Function to fetch and save papers for a given topic while adjusting year range compensation
def fetch_and_save_with_compensation(topic):
    """Fetch and save both year ranges for `topic`, topping up from the second.

    The first range in YEAR_RANGES is asked for up to 1000 papers (never
    more than TOTAL_PAPERS_PER_TOPIC). The second range is then asked for
    whatever is still missing to reach TOTAL_PAPERS_PER_TOPIC, so a
    shortfall in the older range is compensated by recent papers. Each
    range is written to its own JSON file via save_to_json.

    Args:
        topic: Search query string, also used in the output file names.
    """
    total_fetched = 0

    # Fetch first range (2017-2021)
    start_year, end_year = YEAR_RANGES[0]
    first_range_quota = min(1000, TOTAL_PAPERS_PER_TOPIC)
    older_papers = fetch_papers(topic, start_year, end_year, first_range_quota)
    total_fetched += len(older_papers)
    save_to_json(topic, f"{start_year}-{end_year}", older_papers)

    # Fetch second range (2022-2024), adjusting the limit if needed
    start_year, end_year = YEAR_RANGES[1]
    remaining_papers_needed = TOTAL_PAPERS_PER_TOPIC - total_fetched
    if remaining_papers_needed > 0:
        recent_papers = fetch_papers(topic, start_year, end_year, remaining_papers_needed)
        # Count the second range too, so the summary below reports the real total.
        total_fetched += len(recent_papers)
        save_to_json(topic, f"{start_year}-{end_year}", recent_papers)

    print(f"✅ {total_fetched}/{TOTAL_PAPERS_PER_TOPIC} papers fetched for '{topic}'")

# Multithreading function to scrape data for all topics
def fetch_and_save_papers_multithreaded():
    """Crawl every topic in RESEARCH_TOPICS concurrently, one thread each.

    Returns only after every worker thread has finished.
    """
    workers = [
        threading.Thread(target=fetch_and_save_with_compensation, args=(subject,))
        for subject in RESEARCH_TOPICS
    ]

    for worker in workers:
        worker.start()

    # Block until each topic's crawl is done before reporting completion.
    for worker in workers:
        worker.join()

    print("✅ Completed scraping all topics!")

# Run the scraper
# Entry point: starts one crawl thread per topic and blocks until all of them
# have finished (see fetch_and_save_papers_multithreaded).
if __name__ == "__main__":
    fetch_and_save_papers_multithreaded()


Fetched 0/1000 papers for 'Foundation Models' (2017-2021)Fetched 0/1000 papers for 'Generative Models' (2017-2021)

Fetched 0/1000 papers for 'LLM' (2017-2021)
Fetched 0/1000 papers for 'VLM' (2017-2021)
Fetched 0/1000 papers for 'Diffusion Models' (2017-2021)
Fetched 200/1000 papers for 'LLM' (2017-2021)
Fetched 200/1000 papers for 'VLM' (2017-2021)
Fetched 400/1000 papers for 'LLM' (2017-2021)
Fetched 400/1000 papers for 'VLM' (2017-2021)
No more papers found for 'VLM' (2017-2021).
Saved 391 papers to scraped_data\VLM_2017-2021.json
Fetched 0/1609 papers for 'VLM' (2022-2024)
Fetched 600/1000 papers for 'LLM' (2017-2021)
Fetched 200/1000 papers for 'Diffusion Models' (2017-2021)
Fetched 200/1000 papers for 'Foundation Models' (2017-2021)
Fetched 800/1000 papers for 'LLM' (2017-2021)
Fetched 200/1000 papers for 'Generative Models' (2017-2021)
Fetched 200/1609 papers for 'VLM' (2022-2024)
Fetched 1000/1000 papers for 'LLM' (2017-2021)
No more papers found for 'LLM' (2017-2021).
Saved 9

- **Objective**
    The goal of this stage is to **efficiently retrieve research papers** from the **Semantic Scholar API** by relevance, covering five research topics:  

    - **Foundation Models**  
    - **Generative Models**  
    - **LLM (Large Language Models)**  
    - **VLM (Vision-Language Models)**  
    - **Diffusion Models**  

    The script aims for **balanced data collection** across two year ranges, 2017–2021 and 2022–2024, while **giving special attention to recent years** to reflect the latest advancements.  

- **Approach and Implementation**

  - **1. Data Collection Strategy**
    The API allows querying papers based on a topic and sorting them by relevance. To ensure we gather enough data:  

    - **Years 2017-2021**: We attempt to collect **1000 papers per topic**.  
    - **Years 2022-2024**: We aim for **1000 additional papers per topic**, **prioritizing recent research trends**.  
    - If **fewer than 1000 papers are found** in the first range, the quota for the second range is **raised to compensate**, so that the total still targets **2000 papers per topic** whenever enough papers are available.  

    $$
    P_{\text{total}} = P_{\text{2017–2021}} + P_{\text{2022–2024}}
    $$
    $$
    P_{\text{total}} \leq 2000, \quad \text{with equality whenever enough papers are available}
    $$

    Since AI research is evolving rapidly, **we place greater emphasis on collecting papers from 2022-2024** to capture **state-of-the-art methodologies and breakthroughs**.


  - **2. API Querying & Pagination**
    Each request retrieves up to **100 papers** (API limit), so pagination is necessary. We use the `offset` parameter to **incrementally fetch results** until the required number of papers is collected or no more are available.  

    - **Rate Limiting Handling**: If the API responds with **429 Too Many Requests**, the script **pauses for 10 seconds** before retrying the same page.  
    - **Error Handling**: A **504 Gateway Timeout** is retried after 5 seconds, and network errors are caught, logged, and retried after 10 seconds, so a transient failure does not abort the crawl.  

  - **3. Constants**
 
      Several constants define the scraping behavior:  

      $$
      \begin{aligned}
      \text{RESEARCH\_TOPICS} = \{ & \text{"Foundation Models"},\ \text{"Generative Models"}, \\
                                   & \text{"LLM"},\ \text{"VLM"},\ \text{"Diffusion Models"} \}
      \end{aligned}
      $$

      $$
      \text{YEAR\_RANGES} = \{ (2017, 2021), (2022, 2024) \}
      $$

      $$
      P_{\text{total}} = 2000 \quad \text{(papers per topic, if available)}
      $$

      $$
      \text{OUTPUT\_DIR} = \text{"scraped\_data"}
      $$

      We ensure that if older papers are insufficient, **recent papers (2022-2024) fill the gap**, reinforcing the importance of the latest research.


  - **4. Multithreading for Efficiency**
  
      Fetching data sequentially for multiple topics would be **slow** due to API rate limits and network latency. To overcome this, we use **multithreading**:  

      - Each topic is **assigned a separate thread** to run the `fetch_and_save_with_compensation()` function concurrently.  
      - This allows multiple API requests to run **in parallel**, significantly reducing total execution time.  
      - Threads are synchronized using `thread.join()` to ensure all data is fetched before the script completes.  

      Let **\( T \)** be the number of research topics and **\( N \)** be the number of threads:  
      $$
      T = 5, \quad N = 5
      $$
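      $$
      \text{Total Execution Time} \approx \frac{T \times \text{Single Topic Fetch Time}}{N}
      $$
      With \( T = N \), the whole crawl therefore takes roughly as long as fetching a single topic, instead of \( T \) times as long when run sequentially.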

      Since research in AI is **progressing rapidly**, prioritizing the latest papers (2022-2024) ensures our dataset reflects cutting-edge developments.



## Database

In [None]:
import os
import json
import sqlite3

# Constants
# Path of the SQLite database file opened by create_database() and insert_data().
DB_FILE = "papers.db"
# Directory scanned for *.json files by process_json_files(); same value as
# the crawler's OUTPUT_DIR.
INPUT_DIR = "scraped_data"

# Function to create the database and table
def create_database():
    """Create the `papers` table in DB_FILE if it does not exist yet.

    The table holds one row per paper: an auto-incremented `id`, the paper's
    `title`, `abstract`, `authors`, `citations`, `references_count` and
    `year`, plus the `topic` and `year_range` it was collected under.
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS papers (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                abstract TEXT,
                authors TEXT,
                citations INTEGER,
                references_count INTEGER,
                year INTEGER,
                topic TEXT,
                year_range TEXT
            )
        """)
        conn.commit()
    finally:
        # Close the connection even when the statement or commit fails.
        conn.close()

    print("✅ Database and table created successfully.")

# Helper: build the stored author string for one paper record
def _format_authors(paper):
    """Return the paper's author names joined with ", ".

    A missing or null `authors` value yields "Unknown", and a null name
    inside the list is replaced by "Unknown", so the join cannot fail on
    nulls. An empty list still yields an empty string.
    """
    authors = paper.get("authors", ["Unknown"])
    if authors is None:
        authors = ["Unknown"]
    return ", ".join("Unknown" if name is None else name for name in authors)


# Function to insert data into the database
def insert_data(papers, topic, year_range):
    """Insert `papers` into the `papers` table of DB_FILE in one transaction.

    Args:
        papers: Iterable of dicts with the keys `title`, `abstract`,
            `authors`, `citations`, `references` and `year`; missing keys
            fall back to placeholder values.
        topic: Topic label stored with every row.
        year_range: Year-range label stored with every row.

    All rows are committed together; if anything fails, nothing is
    committed and the connection is still closed.
    """
    # Build every row before opening the database, so a malformed record
    # fails early without leaving a connection or transaction open.
    rows = [
        (
            paper.get("title", "No Title"),
            paper.get("abstract", "No Abstract"),
            _format_authors(paper),
            paper.get("citations", 0),
            paper.get("references", 0),  # JSON key "references" -> column references_count
            paper.get("year", None),
            topic,
            year_range,
        )
        for paper in papers
    ]

    conn = sqlite3.connect(DB_FILE)
    try:
        cursor = conn.cursor()
        cursor.executemany("""
            INSERT INTO papers (title, abstract, authors, citations, references_count, year, topic, year_range)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, rows)
        conn.commit()
    finally:
        conn.close()

    print(f"✅ Inserted {len(rows)} papers into database for topic '{topic}' ({year_range}).")

# Function to read all JSON files and save data to database
def process_json_files():
    """Load every `.json` file in INPUT_DIR into the database.

    The topic and year range are recovered from the file name: the text
    after the last underscore is the year range, and the text before it
    (underscores turned back into spaces) is the topic. Empty files are
    skipped with a warning; an error in one file is printed and the
    remaining files are still processed.
    """
    if not os.path.exists(INPUT_DIR):
        print(f"❌ Error: Directory '{INPUT_DIR}' not found!")
        return

    for entry in os.listdir(INPUT_DIR):
        if not entry.endswith(".json"):
            continue

        source_path = os.path.join(INPUT_DIR, entry)

        try:
            with open(source_path, "r", encoding="utf-8") as handle:
                records = json.load(handle)

            if not records:
                print(f"⚠️ Warning: No data in {entry}")
                continue

            # Split "<topic_with_underscores>_<year range>" at the last underscore.
            stem = entry.replace(".json", "")
            topic_slug, _, year_range = stem.rpartition("_")
            topic = topic_slug.replace("_", " ")

            insert_data(records, topic, year_range)

        except Exception as err:
            print(f"❌ Error processing {entry}: {err}")

# Run the process
# Entry point: create the table if needed, then load every JSON file.
# NOTE: the final message is printed unconditionally. process_json_files()
# only prints per-file errors (and returns early when the input directory is
# missing), so this line does not prove that every file was inserted.
if __name__ == "__main__":
    create_database()
    process_json_files()
    print("✅ All JSON files processed successfully.")


✅ Database and table created successfully.
✅ Inserted 1000 papers into database for topic 'Diffusion Models' (2017-2021).
✅ Inserted 1000 papers into database for topic 'Diffusion Models' (2022-2024).
✅ Inserted 1000 papers into database for topic 'Foundation Models' (2017-2021).
✅ Inserted 1000 papers into database for topic 'Foundation Models' (2022-2024).
✅ Inserted 1000 papers into database for topic 'Generative Models' (2017-2021).
✅ Inserted 1000 papers into database for topic 'Generative Models' (2022-2024).
✅ Inserted 957 papers into database for topic 'LLM' (2017-2021).
✅ Inserted 1000 papers into database for topic 'LLM' (2022-2024).
✅ Inserted 391 papers into database for topic 'VLM' (2017-2021).
✅ Inserted 1000 papers into database for topic 'VLM' (2022-2024).
✅ All JSON files processed successfully.
