In [None]:
import os
import time
from ddgs import DDGS
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment so the API key does not have to be hardcoded in the notebook.
load_dotenv()
# The key is read from the environment variable literally named "API";
# os.getenv returns None when that variable is not set.
client = OpenAI(api_key=os.getenv("API"))

def _scholar_user_id(text):
    """Return the value of the first ``user=`` parameter found in *text*, or None."""
    import re

    match = re.search(r"\buser=([\w-]+)", text or "")
    return match.group(1) if match else None


def get_google_scholar_link(prof_name, university, department):
    """Find the Google Scholar profile URL for a professor.

    Runs a DuckDuckGo text search, keeps result URLs that look like
    individual Scholar profiles, and asks the chat model to pick the one
    that matches the given person.

    Args:
        prof_name: Full name of the professor.
        university: Institution the professor is affiliated with.
        department: Department within that institution.

    Returns:
        One of the candidate URLs returned by the search, or the string
        "No profile found." when the search yields no profile link or the
        model's reply cannot be matched to any candidate.
    """
    query = f"google scholar {prof_name} {university} {department}"

    # 1. Search for candidates
    candidates = []
    with DDGS() as ddgs:
        for hit in ddgs.text(query, max_results=5):
            # Tolerate hits that carry no 'href' instead of raising KeyError.
            url = hit.get("href") or ""
            # Filter for individual profile pages; skip exact duplicates.
            if "scholar.google" in url and "user=" in url and url not in candidates:
                candidates.append(url)

    if not candidates:
        return "No profile found."

    # 2. Use LLM to pick the best match (handles cases with common names)
    candidates_str = "\n".join(candidates)
    prompt = f"""
    Pick the official Google Scholar profile for:
    Name: {prof_name}
    University: {university}
    Department: {department}

    Candidates:
    {candidates_str}

    Return ONLY the raw URL.
    """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    # The message content can be None; treat that as an empty reply.
    answer = (response.choices[0].message.content or "").strip()

    # 3. Only ever return a URL that came from the search. Matching on the
    # profile's user id tolerates replies wrapped in Markdown/backticks or
    # with reordered query parameters, while rejecting invented URLs.
    picked_id = _scholar_user_id(answer)
    if picked_id is not None:
        for url in candidates:
            if _scholar_user_id(url) == picked_id:
                return url

    return "No profile found."

# --- Execution ---
# Inputs identifying the professor whose profile should be located.
prof_name = "Inder Sekhar Yadav"
university = "IIT Kharagpur"
department = "Humanities and Social Sciences"

# result_url is read by the scraping cell further down.
result_url = get_google_scholar_link(
    prof_name=prof_name,
    university=university,
    department=department,
)
print(f"Output: {result_url}")

Output: https://scholar.google.com/citations?user=aol7UFwAAAAJ&hl=en


In [34]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_scholar(url):
    """Scrape basic profile data from a Google Scholar profile page.

    Args:
        url: URL of the profile page to fetch.

    Returns:
        On success, a dict with the keys "name", "research_fields",
        "total_citations", "h_index" and "paper_titles". Statistics that
        are missing from the page are reported as "N/A".
        On failure (network error, invalid URL, non-200 status, or a page
        without the profile-name element), a dict with a single "error"
        key describing the problem.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    # requests has no default timeout, so set one to avoid hanging the
    # kernel; connection problems and malformed URLs are reported in the
    # same error-dict style as a bad status code.
    try:
        response = requests.get(url, headers=headers, timeout=15)
    except requests.RequestException as exc:
        return {"error": f"Failed to fetch page: {exc}"}
    if response.status_code != 200:
        return {"error": f"Failed to fetch page, status code: {response.status_code}"}

    soup = BeautifulSoup(response.text, 'html.parser')

    # 1. Basic Info & Research Fields
    name_tag = soup.find("div", id="gsc_prf_in")
    if name_tag is None:
        # A 200 response without the name element is not a profile page
        # (possibly a CAPTCHA or consent page).
        return {"error": "Profile name not found; the page does not look like a Google Scholar profile."}
    name = name_tag.text
    interests = [a.text for a in soup.find_all("a", class_="gsc_prf_inta")]

    # 2. Stats (Citations, H-index)
    # The stats table has two value columns per row: All | Since [Year].
    # Flattened, the cells are therefore:
    #   0=Citations (All), 1=Citations (Since), 2=h-index (All), ...
    stats_cells = soup.find_all("td", class_="gsc_rsb_std")
    citations = stats_cells[0].text if len(stats_cells) > 0 else "N/A"
    h_index = stats_cells[2].text if len(stats_cells) > 2 else "N/A"

    # 3. Research Paper Topics (Titles)
    papers = []
    for row in soup.find_all("tr", class_="gsc_a_tr"):
        title_link = row.find("a", class_="gsc_a_at")
        # Skip rows that carry no title link instead of crashing.
        if title_link is not None:
            papers.append(title_link.text)

    # Construct JSON data
    return {
        "name": name,
        "research_fields": interests,
        "total_citations": citations,
        "h_index": h_index,
        "paper_titles": papers
    }

# Usage
url = result_url  # Use the URL obtained from previous code
result = scrape_scholar(url)

if "error" in result:
    # scrape_scholar signals failure with an {"error": ...} dict; report it
    # instead of saving it as if it were profile data and claiming success.
    print(f"Scraping failed: {result['error']}")
else:
    # Save to JSON file
    with open('scholar_data.json', 'w') as f:
        json.dump(result, f, indent=4)

    print("Scraping complete. Data saved to scholar_data.json")

Scraping complete. Data saved to scholar_data.json
