In [115]:
!python3 -m pip install --upgrade pip
!python3 -m pip install openai
!python3 -m pip install requests
!python3 -m pip install beautifulsoup4
!clear

[H[2J

In [None]:
from openai import OpenAI
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from bs4 import BeautifulSoup, Comment
import re

client = OpenAI(
    api_key="",
    base_url="https://api.x.ai/v1/",
)

df = pd.read_csv("most_recent_total_detail_profiles_with_homepage.csv")
print(len(df))

40662


In [126]:
def return_text(soup):
    NOISE_ID_RE = re.compile(
        r"^(actionbar|jp-|jetpack|wpgroho|grofiles|wp-emoji|comment-|bilmur|follow-bubble)",
        re.I,
    )

    GENERIC_TAGS = [
        "div", "section", "article", "main",
        "header", "footer", "nav", "span", "p"
    ]

    # Common "noisy" attributes added by frameworks/trackers/renderers
    DROP_ATTR_PREFIXES = ("data-", "aria-")
    DROP_ATTR_NAMES = {
        "role", "rel", "target", "tabindex",
        # "itemprop", "itemscope", "itemtype",
        "contenteditable", "draggable", "spellcheck",
        "loading", "decoding", "fetchpriority",
        "srcset", "sizes", "integrity", "crossorigin",
        "referrerpolicy", "nonce",
        "width", "height",
    }
    ON_EVENT_ATTR_RE = re.compile(r"^on[a-z]+$", re.I)

    def strip_noisy_nodes_and_attrs(root):
        # Remove comments
        for c in root.find_all(string=lambda s: isinstance(s, Comment)):
            c.extract()

        # Remove whole noisy tags
        for t in root.find_all(["script", "style", "noscript", "template", "svg", "canvas"]):
            t.decompose()

        # Remove noisy attributes
        for t in root.find_all(True):
            attrs = list(t.attrs.keys())
            for k in attrs:
                kl = k.lower()

                # Drop inline event handlers like onclick/onload/...
                if ON_EVENT_ATTR_RE.match(kl):
                    t.attrs.pop(k, None)
                    continue

                # Drop data-*, aria-*
                if any(kl.startswith(p) for p in DROP_ATTR_PREFIXES):
                    t.attrs.pop(k, None)
                    continue

                # Drop known noisy attributes
                if kl in DROP_ATTR_NAMES:
                    t.attrs.pop(k, None)
                    continue

    def remove_class_style_regex(html: str) -> str:
        # Remove class/style in both quote styles
        html = re.sub(r"\sclass=\"[^\"]*\"", "", html)
        html = re.sub(r"\sclass='[^']*'", "", html)
        html = re.sub(r"\sstyle=\"[^\"]*\"", "", html)
        html = re.sub(r"\sstyle='[^']*'", "", html)
        return html

    def shrink_html_tokens(html) -> str:
        # First pass: remove noisy nodes/attrs (framework/tracker bloat)
        strip_noisy_nodes_and_attrs(html)

        # Second pass: remove Jetpack/WordPress-ish noisy blocks by id/class pattern
        for t in list(html.find_all(True)):
            if t is None:
                continue

            tid = t.get("id") or ""
            if tid and NOISE_ID_RE.match(tid):
                t.decompose()
                continue

            classes = " ".join(t.get("class", [])) if t.get("class") else ""
            if classes and NOISE_ID_RE.search(classes):
                t.decompose()
                continue

        s = str(html)

        # Normalize whitespace
        s = s.replace("\t", "")
        s = re.sub(r"\n+", "\n", s)
        s = re.sub(r"[ ]{2,}", " ", s)

        # Collapse generic tags
        for tag in GENERIC_TAGS:
            s = re.sub(fr"<{tag}(\s[^>]*)?>", "<>", s)
            s = re.sub(fr"</{tag}>", "</>", s)

        # Collapse empty generic pairs
        s = re.sub(r"<>\s*</>", "<>", s)

        # Remove inter-tag whitespace
        s = re.sub(r">\s+<", "><", s)

        return s.strip()

    # Usage
    body = soup.find("body")
    sp = shrink_html_tokens(body)
    sp = remove_class_style_regex(sp)
    
    return sp

prompt = """
You are an information extraction assistant.

Below is a raw HTML document of a person's profile website.
Your task is to extract the following attributes **only if they are explicitly present in the HTML**.
Do NOT infer, guess, or hallucinate any information.

If an attribute cannot be found with reasonable certainty, return `None` for that field.

### Attributes to extract
- email: A contact email address of the owner found in the HTML (e.g., mailto links or visible text). only the owner's email address, not other people.
- related_links: a list of links that appear to be related to the exact person. (e.g. linkedin, github, cv, blog, sns, etc.). Do not include every single paper, citation, project, labs or company links.
- bio: A short biography or description text describing the person, company, or project. Write it in the first person. At leat 3 sentences, at most 6 sentences.
e.g. I am the researcher who is interested in ...
- page_type: If current html is not a profile page, return the type of the page. e.g. "company", "blog", "other", "labs", etc.
If is not a profile page, return None for all the other attributes.
- company_experiences: A list of company experiences of the owner. include the company name, title(Role), start date, end date.
ex) 
"company_experiences": [
    {
      "company_name": "Company A",
      "title": "Reseach Scientist, TTS end LLM optimization",
      "start_date": "2020-07",
      "end_date": "2021-04"
    },{
      "company_name": "Los University",
      "title": "Assistant professor",
      "start_date": "2019-10",
      "end_date": "2018-02"
    }, etc
]
- education: A list of education experiences of the owner. include the school name, degree, start date, end date. only include BS, MS, PhD. no 
ex)
"education": [
    {
      "school_name": "School A",
      "degree": "Ph.D. in Computer Science",
      "start_date": "2018-11",
      "end_date": "2022-01"
    }
]

Write date in format "YYYY-MM"

### Output format
Return a **valid JSON object** exactly in the following format:

{
  "email": string | None,
  "related_links": list[string] | None,
  "bio": string | None,
  "page_type": string,
  "company_experiences": list[dict] | None,
  "education": list[dict] | None,
}

### Rules
- Use `None` (not null, not empty string) if the value is missing
- For `related_links`, return `None` if no relevant links are found
- Do not include duplicate links
- Do not include navigation-only or irrelevant links (e.g., privacy policy, terms)
- Preserve the original text as-is (do not paraphrase)
- Do not add any explanations or extra text outside the JSON
- Only include in related_links of a list of links that appear to be related to the exact person. (e.g. linkedin, github, cv, blog, sns, etc.). Do not include every single paper, citation, project, labs or company links.

""" 

def calc_cost_with_cache(
    usage,
    input_price_per_1k,
    output_price_per_1k,
    input_price_per_cached_1k,
):
    # prompt
    total_prompt = usage.prompt_tokens
    cached = usage.prompt_tokens_details.cached_tokens or 0
    billable_prompt = max(total_prompt - cached, 0)

    # completion
    completion = usage.completion_tokens

    input_cost = billable_prompt / 1000 * input_price_per_1k + cached / 1000 * input_price_per_cached_1k
    output_cost = completion / 1000 * output_price_per_1k

    return {
        "prompt_tokens_total": total_prompt,
        "prompt_tokens_cached": cached,
        "prompt_tokens_billable": billable_prompt,
        "completion_tokens": completion,
        "input_cost_usd": input_cost,
        "output_cost_usd": output_cost,
        "total_cost_usd": input_cost + output_cost,
    }


In [None]:
from tqdm import tqdm
        
PRICES = {
    "input": 0.0002,
    "output": 0.0005,
    "cached": 0.00005,
}

total_dollars = 0
for i in tqdm(range(10, 30)):
    data = df.iloc[i]
    hl = data['home_link']

    if not pd.isna(hl):
        url = hl
        print(data)

        try:
            if "linkedin.com" in url:
                output = {
                    "email": '',
                    "related_links": [url],
                    "bio": "",
                    "page_type": "",
                    "company_experiences": [],
                    "education": []
                }
                print("Put it in linkedin.")
                continue
            resp = requests.get(
                url,
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=30,
            )
            resp.raise_for_status()

            soup = BeautifulSoup(resp.text, "html.parser")
            sp = return_text(soup)
        
        except Exception as e:
            print("\n\nFailed to get page info\n\n")
            continue

        prompt = prompt + f"""
### owner's name
{data['name']}

### HTML Document
{sp}
"""

        response = client.chat.completions.create(
            model="grok-4-1-fast-non-reasoning",
            messages=[
                {"role": "system", "content": "You are a html extractor"},
                {"role": "user", "content": prompt}
            ],
        )
        output = response.choices[0].message.content

        cost = calc_cost_with_cache(response.usage, PRICES["input"], PRICES["output"], PRICES["cached"])
        dollars = cost['total_cost_usd']
        total_dollars += dollars


  0%|          | 0/20 [00:00<?, ?it/s]

nan
nan
http://junsukim.blogspot.kr/
author_id                                                         iPSHyTYAAAAJ
name                                                                 Junsu Kim
affiliations                                      Korea Polytechnic University
email                                    Verified email at ieee.org - Homepage
interests                                        ['Radio resource management']
image_thumbnail              https://scholar.google.com/citations/images/av...
articles                     [{'title': 'Mimicking full-duplex relaying usi...
total_citation_count                                                     751.0
since_2020_citation_count                                                147.0
h_index                                                                   17.0
home_link                                         http://junsukim.blogspot.kr/
Name: 12, dtype: object


 15%|█▌        | 3/20 [00:09<00:52,  3.10s/it]

https://hyeonjeong1.notion.site/Hyeonjeong-Shin-36a8827d5e804c388ce0be8afb9fd182
author_id                                                         GwF6FmQAAAAJ
name                                                           Hyeonjeong Shin
affiliations                                               M.S. Student, KAIST
email                                 Verified email at kaist.ac.kr - Homepage
interests                                                      ['Data Mining']
image_thumbnail              https://scholar.google.comhttps://scholar.goog...
articles                     [{'title': 'Weather4cast at neurips 2022: Supe...
total_citation_count                                                      31.0
since_2020_citation_count                                                 31.0
h_index                                                                    3.0
home_link                    https://hyeonjeong1.notion.site/Hyeonjeong-Shi...
Name: 13, dtype: object


 20%|██        | 4/20 [00:15<01:05,  4.07s/it]

https://ntunedl.weebly.com/
author_id                                                         mjwO4sUAAAAJ
name                                                                 Munho Kim
affiliations                 Associate Professor (Tenured) of EEE at Nanyan...
email                                  Verified email at ntu.edu.sg - Homepage
interests                    ['Wide bandgap semiconductor', 'Electronics/Op...
image_thumbnail              https://scholar.google.comhttps://scholar.goog...
articles                     [{'title': 'High-performance green flexible el...
total_citation_count                                                    3687.0
since_2020_citation_count                                               2928.0
h_index                                                                   28.0
home_link                                          https://ntunedl.weebly.com/
Name: 14, dtype: object


 25%|██▌       | 5/20 [00:31<01:56,  7.79s/it]

nan
https://www.linkedin.com/in/park-min-bae-b97543262/
author_id                                                         72iRjT8AAAAJ
name                                                               Minbae Park
affiliations                                                Hanyang university
email                               Verified email at hanyang.ac.kr - Homepage
interests                             ['RAG', 'LLM', 'GNN', 'Link prediction']
image_thumbnail              https://scholar.google.com/citations/images/av...
articles                     [{'title': 'ProgRAG: Hallucination-Resistant P...
total_citation_count                                                       NaN
since_2020_citation_count                                                  NaN
h_index                                                                    NaN
home_link                    https://www.linkedin.com/in/park-min-bae-b9754...
Name: 16, dtype: object
nan
nan
nan
nan
https://jkume0107.wordpress.com/
au

100%|██████████| 20/20 [00:31<00:00,  1.57s/it]

nan
nan
nan
nan
nan
https://www.linkedin.com/in/haewookjang/
author_id                                                         XHSGWDMAAAAJ
name                                                              Jang Haewook
affiliations                                                    Meteor Biotech
email                                   Verified email at snu.ac.kr - Homepage
interests                            ['Bioengineering', 'AI', 'Spatial-omics']
image_thumbnail              https://scholar.google.comhttps://scholar.goog...
articles                     [{'title': 'Blood culture-free ultra-rapid ant...
total_citation_count                                                      93.0
since_2020_citation_count                                                 92.0
h_index                                                                    4.0
home_link                             https://www.linkedin.com/in/haewookjang/
Name: 27, dtype: object
https://www.linkedin.com/in/jin-s-heo-5043845b




In [63]:
import pandas as pd

df = pd.read_csv('test_with_one.csv')
print(len(df))
df2 = pd.read_csv('test_with_two.csv')
print(len(df2))

merged_df = pd.concat([df, df2])
print(len(merged_df))

merged_df.to_csv('only_home_links.csv', index=False)

21993
19498
41491
