In [16]:
import pandas as pd

# Load both scraped homepage datasets.
df = pd.read_csv("homepage_data.csv")
df2 = pd.read_csv("homepage_data_3.csv")

# Keep only rows of the first dataset whose website does not end in ".sg",
# then stack the second dataset underneath with a fresh 0..n-1 index.
# NOTE(review): for an object-dtype column a missing website yields NaN from
# .str.endswith, which astype(bool) coerces to True, so such rows are
# presumably dropped as well -- confirm that this is intended.
df = pd.concat(
    [df.loc[~df['Personal Website'].str.endswith('.sg').astype(bool)], df2],
    ignore_index=True,
)
df.head()

Unnamed: 0,Name,Personal Website,About Us Text
0,Albert Chi-Shing CHUNG,https://cse.hkust.edu.hk/admin/people/faculty/...,Albert Chung is a Full Professor at the Depart...
1,Andrew Wing-On POON,https://eeawpoon.people.ust.hk/,Professor Poon received his BA degree from the...
2,Antoni Bert CHAN,https://www.cs.cityu.edu.hk/~abchan/,Dr. Antoni Chan is a Professor at the City Uni...
3,Brian Kan-Wing MAK,https://cse.hkust.edu.hk/admin/people/faculty/...,Dr. Mak is currently an Associate Professor in...
4,CHAN Chee Yong,https://sgps.cuhk.edu.hk/profile/professor-cha...,Professor Chee Hon Chan is an Assistant Profes...


In [17]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import requests
from markdownify import markdownify as md
import json
from tqdm import tqdm
from prompts import SYSTEM_PROMPT_ANALYZE_HOMEPAGES
from concurrent.futures import ThreadPoolExecutor

# Load environment variables from the local .env file; the API key used
# below is read from the environment.
load_dotenv(".env")

# Parallel lists: model_tags[i] is the short label that appears in result
# keys for the model identifier models[i].
models = [
    "openai/gpt-4o-2024-11-20",
]
model_tags = [
    "gpt-4o",
]

# How many times each model is queried for every homepage text.
iterations = 5

# OpenAI-compatible client pointed at the OpenRouter endpoint.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

def process_homepage_text(text: str) -> dict:
    """Ask every configured model to analyse one homepage text, several times.

    For each of the ``iterations`` rounds and each ``(model_tag, model_name)``
    pair from ``model_tags`` / ``models``, a chat completion is requested with
    ``SYSTEM_PROMPT_ANALYZE_HOMEPAGES`` as the system message and ``text`` as
    the user message. All requests are submitted to a thread pool up front so
    they run concurrently.

    Args:
        text: The homepage / "about" text to analyse.

    Returns:
        A flat dict. Every top-level key of each JSON reply is stored under
        ``f"{key}_{model_tag}_{iteration}"``. A request that fails, or a reply
        that cannot be parsed into a JSON object, contributes no entries; the
        problem is printed and the remaining requests are still processed.
    """
    all_responses = {}
    with ThreadPoolExecutor() as executor:
        # Keep each future together with its run metadata instead of
        # maintaining a separate list and lookup dict.
        pending = [
            (
                executor.submit(
                    client.chat.completions.create,
                    messages=[{
                        "role": "system",
                        "content": SYSTEM_PROMPT_ANALYZE_HOMEPAGES
                    }, {
                        "role": "user",
                        "content": text
                    }],
                    response_format={
                        "type": "json_object"
                    },
                    temperature=0.5,
                    model=model_name
                ),
                model_tag,
                iteration,
            )
            for iteration in range(iterations)
            for (model_tag, model_name) in zip(model_tags, models)
        ]

        # Iterating the list directly lets tqdm infer the total and draw a
        # real progress bar.
        for future, model_tag, iteration in tqdm(pending, desc="Processing"):
            # Future.result() re-raises whatever the API call raised. Handle
            # it per request so one failed call does not abort the whole run.
            try:
                completion = future.result()
            except Exception as exc:
                print(
                    f"Error calling model {model_tag} (iteration {iteration}), "
                    "leaving elements blank:",
                    exc,
                )
                continue

            # `except Exception` (not a bare except) keeps KeyboardInterrupt
            # and SystemExit able to stop a long run.
            try:
                parsed = json.loads(completion.choices[0].message.content)
                if not isinstance(parsed, dict):
                    raise TypeError("model reply is not a JSON object")
            except Exception:
                print("Error parsing message, leaving elements blank:", completion)
                continue

            for key, value in parsed.items():
                all_responses[f"{key}_{model_tag}_{iteration}"] = value
    return all_responses


In [18]:
# Accumulates one result dict per researcher. The processing loop appends to
# this list, so re-run this cell first if that loop is executed again.
researcher_profile = list()

In [19]:
from rich import print

# TODO: append university profile w/ personal website text
# TODO: check that results aren't simply correlated with text length (e.g. rated more personable only because they wrote more)


def process_row(row):
    """Analyse the 'About Us Text' of one researcher row.

    Args:
        row: Mapping-like record providing the 'Name' and 'About Us Text'
            entries (e.g. a DataFrame row).

    Returns:
        ``{"name": ..., "error": "No text found"}`` when the text is not a
        string or contains only whitespace; otherwise the dict returned by
        ``process_homepage_text`` with a ``"name"`` entry added.
    """
    about_text = row['About Us Text']

    # Non-string values and blank strings both count as "no text".
    has_text = isinstance(about_text, str) and len(about_text.strip()) != 0
    if not has_text:
        return {
            "name": row["Name"],
            "error": "No text found"
        }

    analysis = process_homepage_text(about_text)
    analysis["name"] = row['Name']
    return analysis

# Analyse every researcher in turn and collect the per-row result dicts.
# The explicit total lets tqdm draw a full progress bar over the lazy iterator.
for index, row in tqdm(df.iterrows(), total=len(df)):
    researcher_profile.append(process_row(row))

Processing: 5it [00:17,  3.53s/it]it/s]
Processing: 5it [00:13,  2.73s/it]5, 17.64s/it]
Processing: 5it [00:14,  2.91s/it]0, 15.31s/it]
Processing: 5it [00:12,  2.53s/it]2, 14.98s/it]
Processing: 5it [00:15,  3.11s/it]2, 14.08s/it]
Processing: 5it [00:17,  3.44s/it]4, 14.62s/it]
Processing: 5it [00:15,  3.04s/it]9, 15.50s/it]
Processing: 5it [00:12,  2.59s/it]0, 15.41s/it]
Processing: 5it [00:18,  3.75s/it]1, 11.10s/it]
Processing: 5it [00:24,  4.91s/it]33, 13.08s/it]
Processing: 5it [00:21,  4.25s/it]11, 16.15s/it]
Processing: 5it [00:15,  3.20s/it]31, 17.58s/it]
Processing: 5it [00:19,  3.98s/it]07, 17.13s/it]
Processing: 5it [00:14,  2.83s/it]51, 17.94s/it]
Processing: 5it [00:16,  3.29s/it]51, 16.85s/it]
Processing: 5it [00:16,  3.37s/it]18, 16.75s/it]
Processing: 5it [00:12,  2.49s/it]07, 16.78s/it]
Processing: 5it [00:24,  4.85s/it]42, 15.50s/it]
Processing: 5it [00:16,  3.37s/it]46, 18.11s/it]
Processing: 5it [00:14,  2.89s/it]34, 17.74s/it]
Processing: 5it [00:12,  2.50s/it]57,

Processing: 5it [00:19,  3.89s/it]
 96%|█████████▋| 158/164 [43:54<02:02, 20.49s/it]

Processing: 5it [00:16,  3.35s/it]
Processing: 5it [00:25,  5.05s/it]:36, 19.39s/it]
Processing: 5it [00:17,  3.45s/it]:24, 21.15s/it]
Processing: 5it [00:13,  2.76s/it]:59, 19.99s/it]
Processing: 5it [00:20,  4.10s/it]:36, 18.14s/it]
Processing: 5it [00:17,  3.50s/it]:18, 18.85s/it]
100%|██████████| 164/164 [45:45<00:00, 16.74s/it]


In [20]:
import os
import json

# Find the first numeric suffix that is not taken yet, so earlier result
# files are never overwritten.
count = 0
while os.path.exists(f"homepage_data_analyzed_multiple_{count}.json"):
    count += 1

# Write all collected researcher results to that new file.
with open(f"homepage_data_analyzed_multiple_{count}.json", "w+") as f:
    json.dump(researcher_profile, f)

In [21]:
# TODO: save to 0721-month-analysis-data, save to new file