In [1]:
import pandas as pd

# Load the two scraped homepage tables.
df = pd.read_csv("homepage_data.csv")
df2 = pd.read_csv("homepage_data_3.csv")

# Flag rows of the first table whose URL ends in ".sg" so they can be excluded.
# NOTE(review): with an object-dtype column a missing URL yields NaN here, and
# astype(bool) turns NaN into True, so such rows would be dropped as well --
# confirm that this is intended.
ends_with_sg = df['Personal Website'].str.endswith('.sg').astype(bool)

# Stack the remaining rows on top of the second table with a fresh index.
df = pd.concat([df.loc[~ends_with_sg], df2], ignore_index=True)
df.head()

Unnamed: 0,Name,Personal Website,About Us Text
0,Albert Chi-Shing CHUNG,https://cse.hkust.edu.hk/admin/people/faculty/...,Albert Chung is a Full Professor at the Depart...
1,Andrew Wing-On POON,https://eeawpoon.people.ust.hk/,Professor Poon received his BA degree from the...
2,Antoni Bert CHAN,https://www.cs.cityu.edu.hk/~abchan/,Dr. Antoni Chan is a Professor at the City Uni...
3,Brian Kan-Wing MAK,https://cse.hkust.edu.hk/admin/people/faculty/...,Dr. Mak is currently an Associate Professor in...
4,CHAN Chee Yong,https://sgps.cuhk.edu.hk/profile/professor-cha...,Professor Chee Hon Chan is an Assistant Profes...


In [2]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import requests
from markdownify import markdownify as md
import json
from tqdm import tqdm
from prompts import SYSTEM_PROMPT_ANALYZE_HOMEPAGES
from concurrent.futures import ThreadPoolExecutor

# Load environment variables from the local .env file; the API key read below
# is expected to be defined there.
load_dotenv(".env")

# Parallel lists: model_tags[i] is the short label paired with models[i] when
# requests are built and results are keyed.
models = ["google/gemini-2.0-flash-001"]
model_tags = ["gemini-flash"]

# How many times each model is queried for the same text.
iterations = 20

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

def process_homepage_text(text: str):
    """Query every configured model ``iterations`` times about ``text``.

    One chat-completion request is submitted per (iteration, model) pair to a
    thread pool so the requests run concurrently. Each reply is expected to be
    a JSON object; its entries are flattened into one dict whose keys have the
    form ``"<key>_<model_tag>_<iteration>"``.

    Args:
        text: Homepage text sent as the user message.

    Returns:
        dict mapping the flattened keys to the values returned by the models.
        A reply that cannot be parsed as a JSON object contributes no entries;
        a message describing the failure is printed instead.

    Raises:
        Any exception raised by the API request itself propagates unchanged
        from ``future.result()``.
    """
    all_responses = {}
    with ThreadPoolExecutor() as executor:
        # Submit everything up front and keep each run's metadata next to its
        # future so results can be labelled when they are collected.
        submitted = []
        for iteration in range(iterations):
            for model_tag, model_name in zip(model_tags, models):
                future = executor.submit(
                    client.chat.completions.create,
                    messages=[{
                        "role": "system",
                        "content": SYSTEM_PROMPT_ANALYZE_HOMEPAGES
                    }, {
                        "role": "user",
                        "content": text
                    }],
                    response_format={
                        "type": "json_object"
                    },
                    temperature=0.5,
                    model=model_name
                )
                submitted.append((future, model_tag, iteration))

        # Iterating the list directly (rather than enumerate(...)) lets tqdm
        # read its length and display a total.
        for future, model_tag, iteration in tqdm(submitted, desc="Processing"):
            response = future.result()

            try:
                parsed = json.loads(response.choices[0].message.content)
                # A JSON list/string/number would otherwise be indexed as if it
                # were a mapping and could produce meaningless keys.
                if not isinstance(parsed, dict):
                    raise ValueError(
                        f"expected a JSON object, got {type(parsed).__name__}"
                    )
            except Exception as exc:
                # Best effort: skip this reply, but say which run failed and
                # why. `Exception` (not a bare except) keeps KeyboardInterrupt
                # and SystemExit working during long notebook runs.
                print(
                    f"Error parsing message for {model_tag} iteration "
                    f"{iteration} ({exc!r}), leaving elements blank"
                )
                continue

            for key, value in parsed.items():
                all_responses[f"{key}_{model_tag}_{iteration}"] = value
    return all_responses


In [3]:
# Collects one result dict per researcher; the processing loop appends to it.
researcher_profile = list()

In [None]:
from rich import print

# TODO: append university profile w/ personal website text
# TODO: make sure the results aren't just correlated with text length (e.g. rated more personable only because they wrote more)


def process_row(row):
    """Analyze one researcher's "About Us Text" and label the result.

    Args:
        row: Mapping-like record providing the 'About Us Text' and 'Name'
            entries.

    Returns:
        The dict produced by ``process_homepage_text`` with a "name" entry
        added, or ``{"name": ..., "error": "No text found"}`` when the text is
        not a string or contains only whitespace.
    """
    about_text = row['About Us Text']

    # Only a non-blank string is worth sending to the models.
    if isinstance(about_text, str) and about_text.strip():
        analysis = process_homepage_text(about_text)
        analysis["name"] = row['Name']
        return analysis

    return {"name": row["Name"], "error": "No text found"}

# Start from an empty list so that re-running this cell does not append a
# second copy of every researcher to the results.
researcher_profile.clear()

# Process the rows one at a time; `total` is passed explicitly because
# iterrows() is a generator and has no length for tqdm to read.
for _, row in tqdm(df.iterrows(), total=len(df)):
    researcher_profile.append(process_row(row))

  0%|          | 0/164 [00:00<?, ?it/s]

In [None]:
# Show a count and a short preview instead of dumping every profile into the
# notebook output, which would bloat the saved file and bury the narrative.
print(f"{len(researcher_profile)} researcher profiles collected; showing the first 3:")
print(researcher_profile[:3])

In [None]:
# Write every collected profile to disk as a single JSON array.
output_path = "homepage_data_analyzed_multiple.json"
with open(output_path, "w+") as out_file:
    json.dump(researcher_profile, out_file)

In [None]:
# TODO: save to 0721-month-analysis-data, save to new file