In [1]:
import requests
from bs4 import BeautifulSoup
import time
import csv
from datetime import datetime
import os
import re

def parse_citations(url):
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title
        title_element = soup.find('div', id='gsc_oci_title')
        title = title_element.text.strip() if title_element else "Title not found"

        # Extract citation data from bars
        citation_links = soup.find_all('a', class_='gsc_oci_g_a')
        year_count_map = {}
        for link in citation_links:
            href = link.get('href', '')
            match = re.search(r'as_ylo=(\d+)&as_yhi=(\d+)', href)
            if match:
                year = match.group(1)
                count_span = link.find('span', class_='gsc_oci_g_al')
                count = int(count_span.text.strip()) if count_span else 0
                year_count_map[year] = count

        if not year_count_map:
            print("⚠️ No citation data found.")
            return []

        # Fill in zeroes for missing years
        min_year = min(map(int, year_count_map))
        max_year = max(map(int, year_count_map))
        citation_data = {str(y): year_count_map.get(str(y), 0) for y in range(min_year, max_year + 1)}

        # Prepare long-format rows
        rows = []
        for year, count in sorted(citation_data.items()):
            rows.append({'Title': title, 'Year': year, 'Citations': count})

        return rows

    except Exception as e:
        print(f"❌ Error parsing {url}: {e}")
        return []



In [2]:
def parse_all_from_file(input_file='urls.txt', output_dir='.'):
    date_str = datetime.now().strftime('%Y-%m-%d')
    output_file = os.path.join(output_dir, f"citations_{date_str}.csv")

    all_rows = []

    with open(input_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    for i, url in enumerate(urls, 1):
        print(f"\n➡️ Parsing {i}/{len(urls)}: {url}")
        rows = parse_citations(url)
        all_rows.extend(rows)
        time.sleep(10)

    # Write combined CSV
    if all_rows:
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Title', 'Year', 'Citations'])
            writer.writeheader()
            writer.writerows(all_rows)
        print(f"\n✅ Saved all results to: {output_file}")
    else:
        print("⚠️ No data to write.")



In [4]:
# Example usage:
if __name__ == "__main__":
    parse_all_from_file()



➡️ Parsing 1/38: google scholar link
❌ Error parsing google scholar link: Invalid URL 'google scholar link': No scheme supplied. Perhaps you meant https://google scholar link?

➡️ Parsing 2/38: https://scholar.google.com/citations?view_op=view_citation&hl=en&user=ZfF1SeUAAAAJ&citation_for_view=ZfF1SeUAAAAJ:7H_MAutzIkAC

➡️ Parsing 3/38: https://scholar.google.com/citations?view_op=view_citation&hl=en&user=ZfF1SeUAAAAJ&cstart=20&pagesize=80&citation_for_view=ZfF1SeUAAAAJ:wKETBy42zhYC

➡️ Parsing 4/38: https://scholar.google.com/citations?view_op=view_citation&hl=en&user=ZfF1SeUAAAAJ&citation_for_view=ZfF1SeUAAAAJ:vDZJ-YLwNdEC

➡️ Parsing 5/38: https://scholar.google.com/citations?view_op=view_citation&hl=en&user=ZfF1SeUAAAAJ&citation_for_view=ZfF1SeUAAAAJ:1taIhTC69MYC

➡️ Parsing 6/38: https://scholar.google.com/citations?view_op=view_citation&hl=en&user=f-rQ8e4AAAAJ&cstart=20&pagesize=80&citation_for_view=f-rQ8e4AAAAJ:f2IySw72cVMC

➡️ Parsing 7/38: https://scholar.google.com/citations