In [1]:
import openai, json, os, pandas as pd
from WebScraping import WebScraping
from User import User
from pprint import pprint
from dotenv import load_dotenv
import Universities

def connectOpenAI():
    load_dotenv()
    openai.api_key = os.getenv("API_KEY")
connectOpenAI()


In [2]:
def GetDataFromGPT(page):
    connectOpenAI()
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": os.getenv("prompt")},
                  {"role": "user", "content": page}],
        temperature=0
    )

    data = response.choices[0].message.content
    return data



In [3]:
#Create the WebScraping class
ws = WebScraping()

In [3]:
#Create the Test Researchers
zheng = User("Zheng Xiang", "Virginia Tech")
sean = User("Sean Agnew", "University of Virginia")

ws.initial_search(zheng)
ws.initial_search(sean)

#Scrape links
w1 = ws.scrape_webpage(zheng.initial_search_links[0], zheng)
w2 = ws.scrape_webpage(sean.initial_search_links[0], sean)


True 3 Instituion found | Researcher name found | University website verified
True 3 Instituion found | Researcher name found | University website verified


In [4]:
GetDataFromGPT(w1)

'{\n  "name": "Zheng (Phil) Xiang",\n  "institution": "Virginia Polytechnic Institute and State University",\n  "research_fields": [\n    "Travel behavior",\n    "e-Tourism",\n    "Information technology",\n    "Tourism analytics"\n  ],\n  "research_focus": "Travel information search, social media marketing, and business analytics for the tourism and hospitality industries",\n  "expertise": "N/A",\n  "emails": [\n    "philxz@vt.edu"\n  ],\n  "appointments": [\n    {\n      "position": "Department Head",\n      "department": "Hospitality and Tourism Management",\n      "institution": "Virginia Tech"\n    },\n    {\n      "position": "Associate Professor",\n      "department": "Hospitality and Tourism Management",\n      "institution": "Virginia Tech"\n    }\n  ],\n  "awards": [\n    "Emerging Scholar of Distinction award by the International Academy for the Study of Tourism",\n    "Best Research Paper of the Year Award at the ICHRIE Conference in New Orleans, LA in July 2019",\n    "Cer

In [5]:
results1 = json.loads(GetDataFromGPT(w1))
results2 = json.loads(GetDataFromGPT(w2))

In [6]:
results1

{'name': 'Zheng (Phil) Xiang',
 'institution': 'Virginia Polytechnic Institute and State University',
 'research_fields': ['Travel behavior',
  'e-Tourism',
  'Information technology',
  'Tourism analytics'],
 'research_focus': 'Travel information search, social media marketing, and business analytics for the tourism and hospitality industries',
 'expertise': 'N/A',
 'emails': ['philxz@vt.edu'],
 'appointments': [{'position': 'Department Head',
   'department': 'Hospitality and Tourism Management',
   'institution': 'Virginia Tech'},
  {'position': 'Associate Professor',
   'department': 'Hospitality and Tourism Management',
   'institution': 'Virginia Tech'}],
 'awards': ['Emerging Scholar of Distinction award by the International Academy for the Study of Tourism',
  'Best Research Paper of the Year Award at the ICHRIE Conference in New Orleans, LA in July 2019',
  'Certificate of Teaching Award for the department by Pamplin College of Business, 2017-2018',
  'Best Paper Award (1st Pl

In [8]:
results2

{'name': 'Sean R. Agnew',
 'institution': 'University of Virginia School of Engineering and Applied Science',
 'research_fields': 'Materials Science and Engineering',
 'research_focus': 'Metals analysis, including magnesium alloy formability, intermetallic behaviors, and aluminum alloy fatigue',
 'expertise': 'Surface and Interface Science and Engineering, Metallurgy, Materials Characterization, Nanomaterials and nanomanufacturing, Advanced materials for transportation applications',
 'emails': 'sra4p@virginia.edu',
 'appointments': 'William G. Reynolds Professor of Materials Science, Professor of Materials Science and Engineering',
 'awards': 'N/A',
 'gender': 'Male',
 'domain': 'Academia',
 'personal_website': 'N/A'}

In [9]:
df = pd.DataFrame([results1, results2])
df
# df.to_csv('test.csv', index=False)


Unnamed: 0,name,institution,research_fields,research_focus,expertise,emails,appointments,awards,gender,domain,personal_website
0,Zheng (Phil) Xiang,Virginia Polytechnic Institute and State Unive...,"[Travel behavior, e-Tourism, Information techn...","Travel information search, social media market...",,[philxz@vt.edu],"[{'position': 'Department Head', 'department':...",[Emerging Scholar of Distinction award by the ...,Male,Academia,
1,Sean R. Agnew,University of Virginia School of Engineering a...,Materials Science and Engineering,"Metals analysis, including magnesium alloy for...","Surface and Interface Science and Engineering,...",sra4p@virginia.edu,William G. Reynolds Professor of Materials Sci...,,Male,Academia,


In [4]:
# Testing accuracy on web scrape data given to us on 3/13
#alexellery = User("Alex Ellery", "Carleton College")
zheyuz = User("Zheyu Zhang", "Clemson University")
yurygogotsi = User("Yury Gogotsi", "Drexel University")
douglasb = User("Douglas A. Bristow", "Missouri University of Science and Technology")

#ws.initial_search(alexellery)
ws.initial_search(zheyuz)
ws.initial_search(yurygogotsi)
ws.initial_search(douglasb)

#w1 = ws.scrape_webpage(alexellery.initial_search_links[0], alexellery)
w2 = ws.scrape_webpage(zheyuz.initial_search_links[0], zheyuz)
w3 = ws.scrape_webpage(yurygogotsi.initial_search_links[0], yurygogotsi)
w4 = ws.scrape_webpage(douglasb.initial_search_links[0], douglasb)


True 3 Instituion found | Researcher name found | University website verified
True 3 Instituion found | Researcher name found | University website verified
True 3 Instituion found | Researcher name found | University website verified


In [6]:
GetDataFromGPT(w2)
#GetDataFromGPT(w3)
GetDataFromGPT(w4)

'{\n    "name": "Yury Gogotsi",\n    "institution": "Drexel Engineering",\n    "research fields": "Materials Science and Engineering, Nanomaterials",\n    "research focus": "Synthesis and surface modification of inorganic nanomaterials, energy-related and other applications of materials",\n    "expertise": "N/A",\n    "emails": "gogotsi@drexel.edu",\n    "appointments": "Distinguished University and Charles T. and Ruth M. Bach Professor, Director of A.J. Drexel Nanomaterials Institute",\n    "awards": [\n        "2021 MRS Medal, Materials Research Society",\n        "2021 Materials Today Innovation Award, Elsevier",\n        "2021 Horizon Prize, Royal Society of Chemistry (RSC)",\n        "2021 Honorary Doctorate, Sumy State University, Ukraine",\n        "2021 American Chemical Society (ACS) Award in the Chemistry of Materials",\n        "2020 World Academy of Ceramics International Ceramics Prize",\n        "'

In [7]:
# results1 = json.loads(GetDataFromGPT(w1))
results2 = json.loads(GetDataFromGPT(w2))
#results3 = json.loads(GetDataFromGPT(w3))
results4 = json.loads(GetDataFromGPT(w4))

In [9]:
df = pd.DataFrame([results2, results4])
df

Unnamed: 0,name,institution,research fields,research focus,expertise,emails,appointments,awards,gender,domain,personal website,research_fields,research_focus,personal_website
0,Zheyu Zhang,"Clemson University, South Carolina",Electrical and Computer Engineering,"Power Electronics for electric propulsion, ele...","Wide band-gap based power electronics, modular...",zheyuz@clemson.edu,Warren H. Owen - Duke Energy Assistant Profess...,Two prize paper awards from the IEEE Industry ...,Unknown,Academia,,,,
1,Douglas Bristow,Missouri S&T,,,,dbristow@mst.edu,[{'position': 'Associate Professor of Mechanic...,[{'award': 'Teachers Ranked as Excellent by St...,male,Academia,,Mechanical and Aerospace Engineering,Dynamic modeling and control of micro- and nan...,
