In [1]:
from IPython.display import Markdown, display
import PyPDF2
import requests
import os
from PyPDF2 import PdfReader
from openai import OpenAI
from dotenv import load_dotenv

In [2]:
def load_api_key():
    """Read ``OPENAI_API_KEY`` and print one sanity-check message about it.

    ``load_dotenv()`` is called first, then the key is read with
    ``os.getenv``. Exactly one diagnostic line is printed; nothing is raised
    for a missing or odd-looking key.

    Returns:
        The value of ``OPENAI_API_KEY`` exactly as read (surrounding
        whitespace, if any, is NOT removed), or ``None`` when it is unset.
    """
    load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY")

    if not api_key:
        print(
            "No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!"
        )
    elif api_key.strip() != api_key:
        # Checked before the prefix test: a key with leading whitespace would
        # otherwise be reported as having the wrong prefix, hiding the real
        # problem (the stray whitespace).
        print(
            "An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook"
        )
    elif not api_key.startswith("sk-proj-"):
        print(
            "An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook"
        )
    else:
        print("API key found and looks good so far!")
    return api_key

In [3]:
# Validate the key (prints one diagnostic line) and build the client that
# chat_with_gpt() calls through the module-level name `openai`.
# NOTE(review): `api_key` is not passed to OpenAI() here; presumably the client
# reads OPENAI_API_KEY from the environment itself - confirm against the SDK docs.
api_key = load_api_key()
openai = OpenAI()

API key found and looks good so far!


In [4]:
# Crawled text dump to process in this run. Inputs used in earlier runs:
# path = "C:\\Users\\alire\\OneDrive\\personal_projects\\mma_ranking\\Notebooks\\crawled_content_stmary_edu_6.txt"
# path = "C:\\Users\\alire\\OneDrive\\personal_projects\\mma_ranking\\Notebooks\\crawled_content_uiw_edu_6.txt"
path = r"C:\Users\alire\OneDrive\personal_projects\mma_ranking\Notebooks\crawled_content_klesse_utsa_edu_1.txt"
print(f"Attempting to open: {path}")

# errors="ignore" drops bytes that are not valid UTF-8 instead of raising.
with open(path, mode="r", encoding="utf-8", errors="ignore") as file:
    raw_text = file.read()

# Every "|" becomes "-" before the text is embedded in the prompt.
# NOTE(review): presumably so pipes in the crawl cannot be confused with the
# markdown table delimiters the prompt asks for - confirm.
content = raw_text.replace("|", "-")

Attempting to open: C:\Users\alire\OneDrive\personal_projects\mma_ranking\Notebooks\crawled_content_klesse_utsa_edu_1.txt


In [5]:
# OLLAMA_API = "http://localhost:11434/api/chat"
# HEADERS = {"Content-Type": "application/json"}
# MODEL = "llama3.2"


# def chat_with_ollama(prompt):
#     messages = [{"role": "user", "content": prompt}]
#     payload = {"model": MODEL, "messages": messages, "stream": False}
#     response = requests.post(OLLAMA_API, json=payload, headers=HEADERS)
#     return response.json()["message"]["content"]

In [6]:
# System message for the extraction request: sets the extraction rules and the
# list of fields to return. chat_with_gpt() uses this value as the default for
# its `system_prompt` argument.
system_prompt = """
You are a highly skilled information extraction assistant specializing in extracting structured data from unstructured web content. Your goal is to extract accurate and detailed information about university faculty and staff while following these guidelines:

1. Prioritize information from an individual's dedicated profile page. If data exists in multiple sources, trust the dedicated page for accuracy.
2. Use department directories only to supplement missing information or confirm the department name.
3. Ensure no hallucinations; extract data verbatim from the content.
4. Include all possible profiles, even if some fields are incomplete. Leave empty fields blank instead of excluding the record.
5. Extract all publications and courses exactly as listed on the pages, word by word, without summarization or modification.

The extracted data should be structured into the following fields:
- Name (Full name of the individual)
- Title (Academic or professional title)
- Office Location (Room and building details)
- Phone (Contact number)
- Email (Official university email address)
- Major (Field of study or specialization)
- Department (Academic department)
- Research Interests (Explicitly mentioned research areas)
- Teaching Interests (Courses taught or explicitly mentioned teaching areas)
- Publications (Each publication listed verbatim)
- Courses (Each course listed verbatim, including titles, codes, or descriptions)
- Page Link (The URL of the source webpage)

Output the extracted information in a structured table format, ensuring no entries are missed, even if incomplete.
"""

In [7]:
# User message for the extraction request. This is an f-string, so `content`
# is interpolated once, when this cell runs: re-run the cell after loading a
# different input file, otherwise the prompt still carries the old text.
user_prompt = f"""
Extract the information for all faculty and staff mentioned in the given university webpages. Ensure that:

1. Data from an individual's dedicated page is prioritized over department directories for accuracy.
2. Department directories are used only to fill in missing information, such as the department name.
3. All publications and courses are included verbatim, as listed.
4. Incomplete profiles are still included, leaving blank fields where data is unavailable.

Provide the extracted data in a table format with the following headers:
| Name | Title | Office Location | Phone | Email | Major | Department | Research Interests | Teaching Interests | Publications | Courses | Page Link |

Input content:
{content}

"""

In [8]:
# print(user_prompt)

In [9]:
def chat_with_gpt(message, system_prompt=system_prompt, model="gpt-4o-mini"):
    """Send one system + user message pair through the module-level ``openai`` client.

    Args:
        message: Text sent as the ``user`` message.
        system_prompt: Text sent as the ``system`` message. The default is the
            value the module-level ``system_prompt`` had when this function
            was defined; reassigning that variable later does not change the
            default unless this cell is re-run.
        model: Model name passed to the API. Defaults to ``"gpt-4o-mini"``,
            the value that used to be hard-coded, so existing calls behave
            the same.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]
    response = openai.chat.completions.create(model=model, messages=conversation)
    return response.choices[0].message.content

In [10]:
# Run the extraction and render the model's reply as Markdown.
# NOTE(review): despite its name, `verbs_list` holds the reply text for the
# faculty-table prompt; the name is kept because later cells refer to it.
verbs_list = chat_with_gpt(user_prompt)
display(Markdown(verbs_list))

| Name                           | Title                                                                                                             | Office Location | Phone         | Email                          | Major          | Department               | Research Interests | Teaching Interests | Publications | Courses | Page Link                           |
|--------------------------------|-------------------------------------------------------------------------------------------------------------------|------------------|---------------|--------------------------------|------------------|--------------------------|--------------------|-------------------|--------------|---------|------------------------------------|
| Guillermo Araya, Ph.D.         | Wayne and Julie Fagan Endowed Professor, Associate Professor, Mechanical Engineering                               | AET 2.318        | 210-458-6479  | juan.araya@utsa.edu            |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/araya-guillermo.html |
| Keith M. Axler, Ph.D., PMP     | Professor of Practice, Mechanical Engineering                                                                      | EB 3.04.23       | 210-458-6431  | keith.axler@utsa.edu           |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/axler-keith.html    |
| Kiran Bhaganagar, Ph.D.       | Professor, Associate Fellow of AIAA, Mechanical Engineering                                                        | EB 3.04.16       | 210-458-6496  | kiran.bhaganagar@utsa.edu      |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/bhaganagar-kiran.html |
| Tanveer Hossain Bhuiyan, Ph.D. | Assistant Professor, Mechanical Engineering                                                                          |                  | 210-458-5567  | TanveerHossain.Bhuiyan@utsa.edu |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/bhuiyan-tanveer-hossain.html |
| Krystel K. Castillo, Ph.D.    | Lutcher Brown Chair in Mechanical Engineering, Professor, Director, Texas Sustainable Energy Research Institute (TSERI) | AET 2.303        | 210-458-8746  | krystel.castillo@utsa.edu      |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/castillo-krystel.html |
| F. Frank Chen, Ph.D.          | Lutcher Brown Distinguished Chair in Advanced Manufacturing, Professor, SME Fellow and IISE Fellow                 | EB 3.04.422      | 210-458-5382  | ff.chen@utsa.edu               |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/chen-f-frank.html    |
| Sidney Chocron, Ph.D.         | Adjoint Professor, Mechanical Engineering                                                                           |                  | 210-522-3698  | schocron@swri.edu              |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/chocron-sidney.html  |
| Christopher S. Combs, Ph.D.   | Dee Howard Memorial Endowed Faculty Fellowship in Mechanical Engineering, Graduate Advisor of Record: M.S. Aerospace Engineering |                  | 210-458-8288  | ccombs@utsa.edu                |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/combs-christopher.html |
| Alifer Crom, Ph.D.            | Assistant Professor of Instruction, Mechanical Engineering                                                          | EB 3.04.49       | 210-458-7083  | alifer.crom@utsa.edu           |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/crom-alifer.html     |
| Robert De Lorenzo, MD, MSM, MSCI, FACEP | Adjoint Professor, Mechanical Engineering                                                                     |                  | 210-567-0056  | DeLorenzo@uthscsa.edu          |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/de-lorenzo-robert.html |
| Zhi-Gang Feng, Ph.D.          | Associate Professor, Mechanical Engineering                                                                          | EB 3.04.14       | 210-458-5737  | zhigang.feng@utsa.edu          |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/feng-zhi-gang.html   |
| Cody Gonzalez, Ph.D.          | Assistant Professor, Mechanical Engineering                                                                          | EB 3.04.08       | 210-458-8058  | cody.gonzalez@utsa.edu         |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/gonzalez-cody.html    |
| Madhavrao “Rao” Govindaraju, Ph.D. | Professor of Instruction, Mechanical Engineering                                                                  | EB 3.04.46       | 210-458-8306  | madhavrao.govindaraju@utsa.edu |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/govindaraju-madhavrao.html |
| Hai-Chao Han, Ph.D.           | Professor, Fellow of ASME, AHA & AIMBE, Mechanical Engineering                                                     | EB 3.04.02       | 210-458-8189  | hchan@utsa.edu                 |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/han-hai-chao.html    |
| Francisco Herbert, Ph.D.      | Klesse Endowed Fellow, Professor of Instruction, Undergraduate Advisor of Record, Mechanical Engineering             | AET 2.343        |               | Francisco.Herbert@utsa.edu     |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/herbert-francisco.html |
| R. Lyle Hood, Ph.D.           | Associate Professor, ASME Fellow, Mechanical Engineering                                                           | EB 3.04.612      | 210-458-7909  | robert.hood@utsa.edu           |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/hood-r-lyle.html      |
| Ender Finol, Ph.D.            | Zachry Mechanical Engineering Department Endowed Chair, Professor, Fellow of ASME and AHA, Chair of Mechanical Engineering | EB 3.04.02       | 210-258-4952  | ender.finol@utsa.edu           |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/finol-ender.html      |
| Morteza Seidi, Ph.D., P.E.    | Assistant Professor, Department of Mechanical Engineering                                                           | AET 1.368        |               | morteza.seidi@utsa.edu         |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/seidi-morteza.html     |
| Harry Millwater, Ph.D.        | Samuel G. Dawson Endowed Professor, Associate Chair for Research, Mechanical Engineering                            | AET 2.338        | 210-458-4481  | harry.millwater@utsa.edu       |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/millwater-harry.html   |
| Ashok Nedungadi, Ph.D.        | Professor of Instruction, Mechanical Engineering                                                                      | EB 3.04.06       | 210-458-5591  | ashok.nedungadi@utsa.edu       |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/nedungadi-ashok.html   |
| Daniel I. Pineda, Ph.D.      | Assistant Professor, Mechanical Engineering                                                                          | EB 3.04.12       | 210-458-5511  | daniel.pineda@utsa.edu         |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/pineda-daniel.html     |
| Hardev Singh                  | Assistant Professor of Practice, Mechanical Engineering                                                             |                  | 210-458-6570  | Hardev.Singh@utsa.edu          |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/singh-hardev.html      |
| David Restrepo, Ph.D.         | Associate Professor, Mechanical Engineering                                                                          | EB 3.04.562      | 210-458-7614  | david.restrepo@utsa.edu        |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/restrepo-david.html     |
| Randall D. Manteufel, Ph.D.   | Associate Professor, ASME Fellow, Mechanical Engineering                                                           | EB 3.04.58       | 210-458-5522  | rmanteufel@utsa.edu            |                  | Mechanical Engineering    |                    |                   |              |         | https://klesse.utsa.edu/faculty/profiles/manteufel-randall.html  |

Note: Most fields such as Research Interests, Teaching Interests, Publications, and Courses are left blank as they were not available in the extracted content.

In [11]:
# verbs_list = chat_with_gpt(user_prompt)
# display(Markdown(verbs_list))

In [12]:
import pandas as pd
import re
# Markdown table -> CSV export. Empty cells are kept as empty strings.
def _split_markdown_row(line):
    """Return the whitespace-stripped cell texts of one markdown table row."""
    row = line.strip()
    # Outer pipes are optional in markdown tables; remove them only when they
    # are present so a row without a trailing "|" keeps its last cell.
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]


def _is_separator_row(cells):
    """Tell whether every cell looks like a header separator (``---``, ``:---:``)."""
    return bool(cells) and all(
        "-" in cell and set(cell) <= {"-", ":"} for cell in cells
    )


def save_as_csv(text, file_name="output/professors_list.csv"):
    """Extract the markdown table found in ``text`` and write it to a CSV file.

    Every line of ``text`` that contains ``"|"`` is treated as a table line:
    the first one is the header, a following ``---`` separator line (if any)
    is skipped, and the rest are data rows. Lines without ``"|"`` (prose
    before or after the table) are ignored.

    Args:
        text: Text containing one markdown table.
        file_name: Destination CSV path. Missing parent directories are
            created.

    Returns:
        None. The CSV is written with a UTF-8 byte-order mark and a
        confirmation line is printed.

    Raises:
        ValueError: If ``text`` contains no table lines. pandas also raises
            ``ValueError`` when a data row has more cells than the header.
    """
    table_lines = [line for line in text.splitlines() if "|" in line]
    if not table_lines:
        raise ValueError("No markdown table found in text.")

    header = _split_markdown_row(table_lines[0])

    body_lines = table_lines[1:]
    # Drop the header separator only when it really is one, so a table that
    # was emitted without it does not lose its first data row.
    if body_lines and _is_separator_row(_split_markdown_row(body_lines[0])):
        body_lines = body_lines[1:]
    data = [_split_markdown_row(line) for line in body_lines]

    df = pd.DataFrame(data, columns=header)

    # to_csv does not create directories; make sure the target folder exists.
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # "utf-8-sig" writes a BOM at the start of the file.
    df.to_csv(file_name, index=False, encoding="utf-8-sig")

    print(f"CSV file saved as '{file_name}' with UTF-8 BOM.")

In [15]:
# print(verbs_list)

In [14]:
# Export the table in the model's reply (`verbs_list`) to the UTSA CSV file.
save_as_csv(verbs_list, file_name="output/professors_list_utsa.csv")

CSV file saved as output/professors_list_utsa.csv' with UTF-8 BOM.
