In [11]:
from IPython.display import Markdown, display
import PyPDF2
import requests
import os
from PyPDF2 import PdfReader
from openai import OpenAI
from dotenv import load_dotenv

In [12]:
def load_api_key():
    """Load OPENAI_API_KEY (via .env / environment) and print a sanity-check verdict.

    The checks are ordered from most to least specific so the printed hint
    matches the actual problem: surrounding whitespace is tested *before* the
    prefix, otherwise a key such as " sk-proj-..." would be reported as having
    the wrong prefix instead of stray whitespace.

    Returns:
        The value of OPENAI_API_KEY exactly as found (``None`` when it is not
        set). The value is returned even when a check fails; the checks only
        print guidance.
    """
    load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY")

    # Check the key

    if not api_key:
        print(
            "No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!"
        )
    elif api_key.strip() != api_key:
        # Must come before the prefix test: leading whitespace also breaks startswith().
        print(
            "An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook"
        )
    elif not api_key.startswith("sk-proj-"):
        print(
            "An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook"
        )
    else:
        print("API key found and looks good so far!")
    return api_key

In [13]:
# Run the key check first: load_api_key() also calls load_dotenv(), so any .env
# values are in the process environment before the client is constructed.
api_key = load_api_key()
# The client is built without an explicit key; presumably it reads OPENAI_API_KEY
# from the environment itself (`api_key` is not passed on) — TODO confirm.
openai = OpenAI()

API key found and looks good so far!


In [14]:
# Folder that holds one crawled text dump per university site.
BASE_PATH = "../data/crawled_content"
# Site key to process; change the index to switch university (-1 = last entry).
UNIVERSITY_STR = ("uiw_edu", "stmary_edu", "klesse_utsa_edu")[-1]
# Dump files are named "<site key>.txt" inside BASE_PATH.
full_path = "/".join((BASE_PATH, UNIVERSITY_STR + ".txt"))

In [15]:
print(f"Attempting to open: {full_path}")

# errors='ignore' means bytes that are not valid UTF-8 are dropped silently
# instead of raising, so the text may be slightly lossy.
with open(full_path, mode='r', encoding='utf-8', errors='ignore') as crawl_file:
    raw_content = crawl_file.read()

# Pipes are swapped for dashes — presumably so the crawled text cannot be
# confused with the "|" column delimiters of the markdown table requested later.
content = raw_content.replace("|", "-")

Attempting to open: ../data/crawled_content/klesse_utsa_edu.txt


In [16]:
# OLLAMA_API = "http://localhost:11434/api/chat"
# HEADERS = {"Content-Type": "application/json"}
# MODEL = "llama3.2"


# def chat_with_ollama(prompt):
#     messages = [{"role": "user", "content": prompt}]
#     payload = {"model": MODEL, "messages": messages, "stream": False}
#     response = requests.post(OLLAMA_API, json=payload, headers=HEADERS)
#     return response.json()["message"]["content"]

In [17]:
# System message used as the default for chat_with_gpt(): it fixes the extraction
# rules (dedicated profile page wins, verbatim text, keep incomplete records) and
# names the twelve fields every extracted record should carry.
system_prompt = """
You are a highly skilled information extraction assistant specializing in extracting structured data from unstructured web content. Your goal is to extract accurate and detailed information about university faculty and staff while following these guidelines:

1. Prioritize information from an individual's dedicated profile page. If data exists in multiple sources, trust the dedicated page for accuracy.
2. Use department directories only to supplement missing information or confirm the department name.
3. Ensure no hallucinations; extract data verbatim from the content.
4. Include all possible profiles, even if some fields are incomplete. Leave empty fields blank instead of excluding the record.
5. Extract all publications and courses exactly as listed on the pages, word by word, without summarization or modification.

The extracted data should be structured into the following fields:
- Name (Full name of the individual)
- Title (Academic or professional title)
- Office Location (Room and building details)
- Phone (Contact number)
- Email (Official university email address)
- Major (Field of study or specialization)
- Department (Academic department)
- Research Interests (Explicitly mentioned research areas)
- Teaching Interests (Courses taught or explicitly mentioned teaching areas)
- Publications (Each publication listed verbatim)
- Courses (Each course listed verbatim, including titles, codes, or descriptions)
- Page Link (The URL of the source webpage)

Output the extracted information in a structured table format, ensuring no entries are missed, even if incomplete.
"""

In [18]:
# User message: restates the extraction rules, pins the exact table header, and
# embeds the whole crawled text. Because this is an f-string, `content` is
# interpolated when this cell runs — re-run it after loading a different file.
# Rule 4 asks for blank cells (not "NA") so it agrees with rule 4 of system_prompt.
user_prompt = f"""
Extract the information for all faculty and staff mentioned in the given university webpages. Ensure that:

1. Data from an individual's dedicated page is prioritized over department directories for accuracy.
2. Department directories are used only to fill in missing information, such as the department name.
3. All publications and courses are included verbatim, as listed.
4. Incomplete profiles are still included, leaving the field blank where data is unavailable.

Provide the extracted data in a table format with the following headers:
| Name | Title | Office Location | Phone | Email | Major | Department | Research Interests | Teaching Interests | Publications | Courses | Page Link |

Input content:
{content}

"""

In [19]:
# print(user_prompt)

In [20]:
def chat_with_gpt(message, system_prompt=system_prompt, model="gpt-4o-mini"):
    """Send a single system + user exchange to the chat completions API.

    Args:
        message: Text sent as the user message.
        system_prompt: Text sent as the system message. The default is the
            module-level ``system_prompt`` as it was when this cell was run
            (defaults are bound at definition time), so re-run this cell after
            editing the prompt.
        model: Name of the chat model to query; defaults to the model this
            notebook was originally hard-wired to.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]
    response = openai.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

In [21]:
# Single API round-trip for the whole crawl dump; per the prompts the reply is
# expected to be a markdown table, which is rendered here.
# NOTE(review): `verbs_list` holds that table text (it is later passed to
# save_as_csv) — the name looks like a leftover from another notebook.
verbs_list = chat_with_gpt(user_prompt)
display(Markdown(verbs_list))

| Name                         | Title                                                     | Office Location | Phone | Email | Major | Department                                          | Research Interests | Teaching Interests | Publications | Courses | Page Link                                                     |
|------------------------------|-----------------------------------------------------------|------------------|-------|-------|-------|-----------------------------------------------------|-------------------|-------------------|--------------|---------|--------------------------------------------------------------|
| Andres M. Aguirre-Mesa, Ph.D. | Assistant Professor of Instruction                        |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/aguirre-serrano-paola.html |
| Kiran Bhaganagar, Ph.D.      | Professor, Associate Fellow of AIAA                     |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/bhaganagar-kiran.html |
| Tanveer Hossain Bhuiyan, Ph.D. | Assistant Professor of Research                         |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/bhuiyan-tanveer-hossain.html |
| Mauricio Aristizabal Cano, Ph.D. | Assistant Professor of Research                      |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/ari-stizabal-cano.html |
| Yesh P. Singh, Ph.D., P.E.    | Professor Emeritus                                       |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/singh-hardev.html |
| Carl F. Popelar, Ph.D.        | Assistant Professor of Practice                          |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/polpelar-carl.html |
| Christopher S. Combs, Ph.D.   | Dee Howard Memorial Endowed Faculty Fellowship          |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/combs-christopher.html |
| Harry Millwater, Ph.D.        | Samuel G. Dawson Endowed Professor                      |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/millwater-harry.html |
| David Restrepo, Ph.D.         | Associate Professor                                      |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/restrepo-david.html |
| James D. Walker, Ph.D.      | Adjoint Professor, ASME and AIAA Fellow                |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/walker-james.html |
| Francisco Herbert, Ph.D.     | Klesse Endowed Fellow, Professor of Instruction        |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/herbert-francisco.html |
| Ender Finol, Ph.D.          | Zachry Mechanical Engineering Department Endowed Chair, Professor |                |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/finol-ender.html |
| Kyland S. Martinez Raismus    | Assistant Professor                                      |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/mart-rainus-kyland.html |
| F. Frank Chen, Ph.D.         | Lutcher Brown Distinguished Chair in Advanced Manufacturing |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/chen-f-frank.html |
| Krystel K. Castillo, Ph.D.   | Lutcher Brown Chair in Mechanical Engineering          |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/castillo-krystel.html |
| Yusheng Feng, Ph.D.         | Professor                                                |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/feng-yusheng.html |
| Hai-Chao Han, Ph.D.         | Professor, Fellow of ASME, AHA & AIMBE                  |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/han-hai-chao.html |
| Madhavrao “Rao” Govindaraju, Ph.D. | Professor of Instruction                         |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/govindaraju-madhavrao.html |
| Robert De Lorenzo, MD, MSM, MSCI, FACEP | Adjoint Professor                             |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/de-lorenzo-robert.html |
| Zhi-Gang Feng, Ph.D.        | Assistant Professor                                      |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/feng-zhi-gang.html |
| Ender Finol, Ph.D.          | Assistant Professor                                      |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/finol-ender.html |
| Kiran Bhaganagar, Ph.D.     | Professor                                               |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/bhaganagar-kiran.html |
| Xi’an Jiaotong University, China | Ph.D. (Jointly Trained)                         |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/han-hai-chao.html |
| Sooby, Elizabeth S.         | Associate Professor                                      |                  |       |       |       | Mechanical Engineering                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/sooby-elizabeth.html |
| Heather Longoria            | Project Specialist                                       |                  |       |       |       | Student Success Center                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/longoria-heather.html |
| Cindy Estrella Estrella, M.Ed. | Program Manager                                     |                  |       |       |       | Student Success Center                               |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/estrella-cindy.html |
| Ben Campos                   | Sr. Administrative Manager                              |                  |       |       |       | Klesse College                                        |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/campos-ben.html |
| Cayla Jimenez               | Sr. Program Coordinator                                  |                  |       |       |       | Klesse College                                        |                   |                   |              |         | https://klesse.utsa.edu/faculty/profiles/jimenez-cayla.html |

In [22]:
# verbs_list = chat_with_gpt(user_prompt)
# display(Markdown(verbs_list))

In [44]:
import pandas as pd
import re
def _split_table_row(line):
    """Split one markdown table line into a list of stripped cell strings."""
    row = line.strip()
    # Remove only the outer border pipes, so a row written without them keeps
    # all of its cells (a blind [1:-1] slice would throw real cells away).
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]


def _is_separator_row(cells):
    """Return True for a markdown delimiter row (cells such as ---, :--, --:)."""
    return bool(cells) and all(re.fullmatch(r":?-+:?", cell) for cell in cells)


def save_as_csv(text, file_name="output/professors_list.csv"):
    """Parse the first markdown table header and all table rows in ``text`` into a CSV.

    Every line containing ``|`` is treated as a table line. The first such line
    is the header; delimiter rows (``|---|---|``) and repeated header rows are
    skipped wherever they occur, so a reply without a delimiter row does not
    lose its first record. Empty cells are kept as empty strings and rows with
    fewer cells than the header are padded with empty strings.

    Args:
        text: Text containing a pipe-delimited markdown table.
        file_name: Destination CSV path; missing parent folders are created.

    Raises:
        ValueError: If ``text`` contains no table rows, or (raised by pandas)
            if a row has more cells than the header.
    """
    rows = [_split_table_row(line) for line in (text or "").splitlines() if "|" in line]
    rows = [cells for cells in rows if any(cells)]  # drop rows with no content at all
    if not rows:
        raise ValueError("No markdown table rows found in the given text.")

    header = rows[0]
    width = len(header)
    data = [
        cells + [""] * (width - len(cells))  # pad short rows; longer rows are left for pandas to reject
        for cells in rows[1:]
        if not _is_separator_row(cells) and cells != header
    ]

    # Create a DataFrame
    df = pd.DataFrame(data, columns=header)

    # Make sure the destination folder exists before writing.
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save to CSV with UTF-8 BOM
    df.to_csv(file_name, index=False, encoding="utf-8-sig")

    print(f"CSV file saved as '{file_name}' with UTF-8 BOM.")

In [45]:
# print(verbs_list)

In [46]:
# Write the model's table to one CSV per university, named after UNIVERSITY_STR.
# NOTE(review): the recorded output of this cell names uiw_edu.csv although
# UNIVERSITY_STR above selects klesse_utsa_edu, and the execution counts jump
# from 22 to 44 — the saved output looks stale; re-check with Restart & Run All.
save_as_csv(verbs_list, file_name=f"../data/professors_list/{UNIVERSITY_STR}.csv")

CSV file saved as ../data/professors_list/uiw_edu.csv' with UTF-8 BOM.
