In [1]:
!pip install scholarly

Collecting scholarly
  Downloading scholarly-1.7.11-py3-none-any.whl.metadata (7.4 kB)
Collecting arrow (from scholarly)
  Using cached arrow-1.3.0-py3-none-any.whl.metadata (7.5 kB)
Collecting bibtexparser (from scholarly)
  Downloading bibtexparser-1.4.3.tar.gz (55 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting deprecated (from scholarly)
  Downloading Deprecated-1.2.18-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting fake-useragent (from scholarly)
  Downloading fake_useragent-2.0.3-py3-none-any.whl.metadata (17 kB)
Collecting free-proxy (from scholarly)
  Downloading free_proxy-1.1.3.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting python-dotenv (from scholarly)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting selenium (from scholarly)
  Downloading selenium-4.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting sphinx-rtd-theme (from scholarly)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metad

In [None]:
from scholarly import ProxyGenerator

# Set up a ProxyGenerator object to use free proxies
# This needs to be done only once per session
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)

# Now search Google Scholar from behind a proxy
search_query = scholarly.search_pubs('Perception of physical stability and center of mass of 3D objects')
scholarly.pprint(next(search_query))

In [8]:
try:
    # Retrieve and fill the author's profile using the scholar_id.
    author = scholarly.search_author_id('GLnX3MkAAAAJ')
    author = scholarly.fill(author)
except Exception as e:
    print(f"Error retrieving author with ID {'GLnX3MkAAAAJ'}: {e}")

# Get the list of publications from the profile
publications = author.get('publications', [])

def get_year(pub):
    # Attempt to extract the publication year; if not available, use 0.
    try:
        return int(pub['bib'].get('year', 0))
    except Exception:
        return 0

# Sort publications by publication year in descending order (most recent first)
sorted_pubs = sorted(publications, key=get_year, reverse=True)

In [12]:
sorted_pubs[0]

{'container_type': 'Publication',
 'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 'AUTHOR_PUBLICATION_ENTRY'>,
 'bib': {'title': 'Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback',
  'pub_year': '2022',
  'citation': 'arXiv preprint arXiv:2204.05862, 2022'},
 'filled': False,
 'author_pub_id': 'GLnX3MkAAAAJ:9yKSN-GCB0IC',
 'num_citations': 1706,
 'citedby_url': 'https://scholar.google.com/scholar?oi=bibs&hl=en&cites=11199782510491151350,3181880679113735441,4889875407025449859,14427135269327430373,4405664652634518466',
 'cites_id': ['11199782510491151350',
  '3181880679113735441',
  '4889875407025449859',
  '14427135269327430373',
  '4405664652634518466']}

In [None]:
from scholarly import scholarly

def get_first_and_second_authors(scholar_id, max_recent_papers_to_search=5):
    """
    Given a scholar_id, fetch the scholar's profile, then from the most recent
    `max_recent_papers_to_search` publications, extract the first and second authors
    (only if the publication has at least 3 authors). Returns a non-repeating list of names.
    """
    try:
        # Retrieve and fill the author's profile using the scholar_id.
        author = scholarly.search_author_id(scholar_id)
        author = scholarly.fill(author)
    except Exception as e:
        print(f"Error retrieving author with ID {scholar_id}: {e}")
        return []

    # Get the list of publications from the profile
    publications = author.get('publications', [])
    
    def get_year(pub):
        # Attempt to extract the publication year; if not available, use 0.
        try:
            return int(pub['bib'].get('year', 0))
        except Exception:
            return 0

    # Sort publications by publication year in descending order (most recent first)
    sorted_pubs = sorted(publications, key=get_year, reverse=True)
    print(sorted_pubs)
    # Limit to the max_recent_papers_to_search number of publications
    selected_pubs = sorted_pubs[:max_recent_papers_to_search]

    authors_set = set()  # Use a set to avoid duplicates

    for pub in selected_pubs:
        # Ensure the publication has an 'author' field in its bibliography details
        bib = pub.get('bib', {})
        if 'author' in bib:
            # The authors string is expected to be a comma-separated list of names.
            authors_list = [a.strip() for a in bib['author'].split(',')]
            if len(authors_list) >= 3:
                # Extract the first and second authors and add to the set.
                authors_set.add(authors_list[0])
                authors_set.add(authors_list[1])
    
    # Convert the set to a list before returning
    return list(authors_set)


# Example usage:
if __name__ == "__main__":
    # Replace this with a valid Google Scholar ID.
    example_scholar_id = "GLnX3MkAAAAJ"
    max_recent = 5  # Set the number of recent papers to search

    result_authors = get_first_and_second_authors(example_scholar_id, max_recent)
    print("Unique first and second authors from recent papers:")
    for author_name in result_authors:
        print(author_name)


{'title': 'Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback', 'pub_year': '2022', 'citation': 'arXiv preprint arXiv:2204.05862, 2022'}
{'title': 'In-context Learning and Induction Heads', 'pub_year': '2022', 'citation': 'Transformer Circuits Thread, 2022'}
{'title': 'A Mathematical Framework for Transformer Circuits', 'pub_year': '2021', 'citation': 'Transformer Circuits Thread, 2021'}
{'title': 'Progress Measures For Grokking Via Mechanistic Interpretability', 'pub_year': '2023', 'citation': 'ICLR 2023 Spotlight, 2023'}
{'title': 'Predictability and surprise in large generative models', 'pub_year': '2022', 'citation': 'Proceedings of the 2022 ACM Conference on Fairness, Accountability, and …, 2022'}
Unique first and second authors from recent papers:


In [3]:
import csv

scholar_names = ['Javier Ferrando', 
                #  'Oscar Obeso', 
                #  'Senthooran Rajamanoharan'
                 ]

list_field_to_match = [
    'Machine Learning',
    'Deep Learning',
    'Natural Language Processing',' Interpretability','Mechanistic Interpretability'
]

# -----------------------------
# Helper Functions
# -----------------------------
def matches_field(interests, fields):
    """
    Check if any of the research interests contains one of the fields (case-insensitive).
    """
    for field in fields:
        for interest in interests:
            if field.lower() in interest.lower():
                return True
    return False

def get_matching_scholar_ids(scholar_names, list_field_to_match):
    """
    For each scholar name, search Google Scholar, fill in details,
    and if any research interest matches one of the fields, return the scholar_id.
    """
    matching_ids = []
    
    for name in scholar_names:
        print(f"Searching for '{name}' ...")
        search_query = scholarly.search_author(name)
        for author in search_query:
            try:
                # Fill in the full profile details for this candidate
                author_filled = scholarly.fill(author)
            except Exception as e:
                print(f"  Error fetching details for '{name}': {e}")
                continue

            interests = author_filled.get('interests', [])
            if interests and matches_field(interests, list_field_to_match):
                scholar_id = author_filled.get('scholar_id')
                if scholar_id:
                    matching_ids.append(scholar_id)
                    print(f"  Match found: {author_filled.get('name', 'Unknown')} (ID: {scholar_id})")
        print()  # Blank line for readability between names
    
    return matching_ids

def get_recent_papers(author, num_papers=3):
    """
    Extract the titles of the most recent `num_papers` from the author's publications.
    Publications are sorted by publication year (if available).
    """
    publications = author.get('publications', [])
    
    # Helper to extract year; if missing or invalid, use 0 so they sort last.
    def get_year(pub):
        try:
            return int(pub['bib'].get('year', 0))
        except Exception:
            return 0

    # Sort publications by year (most recent first)
    sorted_pubs = sorted(publications, key=get_year, reverse=True)
    # Extract the titles of the top `num_papers` publications
    recent_papers = [pub['bib'].get('title', 'N/A') for pub in sorted_pubs[:num_papers]]
    return recent_papers

def create_csv(scholar_ids, csv_filename="scholars.csv"):
    """
    For each scholar_id, fetch the full profile details and extract:
      - Name
      - Citation count
      - Affiliation
      - Most recent 3 papers (titles)
    Then sort all records by citation count (ascending) and write them to a CSV file.
    """
    data = []
    
    for scholar_id in scholar_ids:
        print(f"Fetching details for scholar_id {scholar_id} ...")
        try:
            # Retrieve the author's profile by their scholar_id
            author = scholarly.search_author_id(scholar_id)
            author = scholarly.fill(author)
        except Exception as e:
            print(f"Error retrieving author with id {scholar_id}: {e}")
            continue
        
        name = author.get('name', 'N/A')
        affiliation = author.get('affiliation', 'N/A')
        citations = author.get('citedby', 0)
        recent_papers = get_recent_papers(author)
        # Join the recent papers into a single string (separated by semicolons)
        recent_papers_str = "; ".join(recent_papers)
        
        data.append({
            "Name": name,
            "Citations": citations,
            "Affiliation": affiliation,
            "Most Recent 3 Papers": recent_papers_str
        })
    
    # Sort the data by citation count (ascending: lowest to highest)
    data_sorted = sorted(data, key=lambda x: x["Citations"])
    
    # Write the sorted data to a CSV file
    fieldnames = ["Name", "Citations", "Affiliation", "Most Recent 3 Papers"]
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data_sorted:
            writer.writerow(row)
    
    print(f"\nCSV file created: {csv_filename}")

# -----------------------------
# Main Execution
# -----------------------------
if __name__ == "__main__":
    # Step 1: Get matching scholar IDs based on names and research interests
    matching_ids = get_matching_scholar_ids(scholar_names, list_field_to_match)
    print("Matching Scholar IDs:", matching_ids)
    
    # Step 2: Fetch full profiles, sort by citation count, and write to CSV
    create_csv(matching_ids)


Searching for 'Javier Ferrando' ...
  Match found: Javier Ferrando (ID: ZNsw8ZUAAAAJ)

Matching Scholar IDs: ['ZNsw8ZUAAAAJ']
Fetching details for scholar_id ZNsw8ZUAAAAJ ...

CSV file created: scholars.csv
