In [54]:
import pandas as pd
df = pd.read_csv("data/0510-citation-pub-data.csv")

  df = pd.read_csv("data/0510-citation-pub-data.csv")


In [55]:
set(df['School'].tolist())

{'CUHK', 'CityU', 'HKU', 'HKUST', 'NTU', 'NUS', 'SMU'}

In [74]:
df = df[df['School'].isin(['CityU', 'CUHK', 'HKU', 'HKUST'])]

In [75]:
print(df.shape)

(62594, 117)


In [None]:
# friendliness (camaraderie)
# we vs. i
# risk-averse (I don't want trouble, focused on advancing own career, personal gain)

In [26]:
from scholarly import scholarly

# use the Google Scholar page for a researcher to figure out which year this person first published
def get_first_publication_year(name: str):
    search_query = scholarly.search_author(name)
    author = next(search_query, None)

    if author:
        author = scholarly.fill(author)
        publications = author.get('publications', [])
        if publications:
            first_pub_year = min(pub.get('bib', {}).get('pub_year', float('inf')) for pub in publications)
            return first_pub_year if first_pub_year != float('inf') else None
    return None

In [44]:
from dotenv import load_dotenv
import os
from openai import Client
import json

load_dotenv(".env")

client = Client(
    api_key=os.getenv("OPENAI_API_KEY"),
)

def chatgpt_get_univ_name_from_text(text: str):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "For this webpage, provide the name of the University that this individual likely attended for their undergraduate studies in English in a JSON format: {'undergrad_university': 'University Name'}"},
            {"role": "user", "content": text},
        ],
        response_format={"type": "json_object"}
    )
    json_obj = json.loads(response.choices[0].message.content)
    return json_obj["undergrad_university"]

In [81]:
import requests
from markdownify import markdownify as md
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

def get_university_from_name_google_scholar(name, timeout=10):
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout)
    
    try:
        search_results = scholarly.search_author(name)
        author = next(search_results, None)
        
        if author:
            author_details = scholarly.fill(author)
            homepage_url = author_details.get("homepage", None)
            if not(homepage_url):
                return "NA: Homepage URL not found"

            response = requests.get(homepage_url)
            if response.status_code != 200:
                return f"NA: Homepage {homepage_url} returned error status code"
            html = response.text
            markdown_text = md(html)
            return chatgpt_get_univ_name_from_text(markdown_text)
        else:
            return "NA: Author not found"
    except TimeoutException:
        return "NA: timeout"
    except Exception as e:
        print(e)
        return "NA: exception"
    finally:
        signal.alarm(0)

In [82]:
import requests
from bs4 import BeautifulSoup

def get_university_from_name_ieee(name):
    # Get biography from ieee explore
    url = f"https://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText={name}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the first result link
    first_result = soup.find('a', class_='result-item-title')
    if first_result:
        bio_url = "https://ieeexplore.ieee.org" + first_result['href']
        bio_response = requests.get(bio_url)
        bio_soup = BeautifulSoup(bio_response.text, 'html.parser')
        
        # Extract the biography section
        bio_section = bio_soup.find('div', class_='author-info-container')
        if bio_section:
            bio_text = bio_section.get_text(separator=' ', strip=True)
            return chatgpt_get_univ_name_from_text(bio_text)
        else:
            return "NA: Biography section not found"
    else:
        return "NA: No results found"

In [83]:
def get_university_from_name(name):
    univ = get_university_from_name_google_scholar(name)
    if univ.startswith("NA"):
        univ = get_university_from_name_ieee(name)
    return univ

In [84]:
all_authors = set([])

for authors in df['Author List'].tolist():
    # print(authors)
    if not authors:
        continue
    if not isinstance(authors, str):
        continue
    authors = authors.replace("[", "").replace("]", "").replace("'", "").split(",")

    all_authors.add(authors[0].strip()) # Usually the advised PhD student

print(list(all_authors)[:10])
print(len(all_authors))
# Main idea here is that the first/second authors are the advised PhD students.
    

['杨景峰， 陈松林， 徐亘博， 苏鹏志， 田永胜， 翟介明， 李波', 'Robert Sabo', 'Ke Zhang', 'Ran Han', 'WR Fahrner', 'Huaisheng Wang', 'Yuanmei Wang', 'Qixiang Sun', 'Aiwei Liu', 'Kei M Lau']
16205


In [85]:
subset_authors = list(all_authors)[:50]
for author in subset_authors:
    print(author, get_university_from_name(author))

杨景峰， 陈松林， 徐亘博， 苏鹏志， 田永胜， 翟介明， 李波 NA: No results found
Robert Sabo NA: No results found
Ke Zhang NA: No results found
Ran Han NA: No results found
WR Fahrner NA: No results found
Huaisheng Wang NA: No results found
Yuanmei Wang NA: No results found
Qixiang Sun NA: No results found
Aiwei Liu Nanjing University
HTTPSConnectionPool(host='www.ece.ust.hk', port=443): Max retries exceeded with url: /eekmlau (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))
Kei M Lau NA: No results found
Philip CH Chan NA: No results found
Zhengxiang Zhou NA: No results found
Kai Zhang Beijing Normal University
Yuting Liang Utrecht University
X HU NA: No results found
HTTPConnectionPool(host='www.panoskalnis.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x31e0f6f30>: Failed to resolve 'www.panoskalnis.com' ([Errno 8] 