In [12]:
from WebScraping import WebScraping
from User import User
from GPT import GPT
ws = WebScraping()

In [13]:
chris = User("Chris Williams", "Virginia Tech")
ws.initial_search(chris)
chris.initial_search_links = chris.initial_search_links[:4]
chris.initial_search_links

['https://vtx.vt.edu/articles/2022/02/eng-williams-nai-senior.html',
 'https://dreams.mii.vt.edu/people/christopher-williams.html',
 'https://objexunlimited.com/portfolio-item/dr-chris-williams-ph-d-and-the-virginia-tech-dreamslab/',
 'https://www.researchgate.net/profile/Christopher-Williams-64']

## **.scrape_researcher()** scrapes all websites in the .initial_search_links

In [15]:
await ws.scrape_researcher(chris, 2)

Scraping researcher method
PAGE SCRAPED:PAGE SCRAPED: https://dreams.mii.vt.edu/people/christopher-williams.html
 https://vtx.vt.edu/articles/2022/02/eng-williams-nai-senior.html
PAGE SCRAPED: https://objexunlimited.com/portfolio-item/dr-chris-williams-ph-d-and-the-virginia-tech-dreamslab/
PAGE SCRAPED: https://www.researchgate.net/profile/Christopher-Williams-64


## **Dedictionaryify** example

In [4]:
chris.research_data['additional_websites']
dedict_websites = chris.dedictionaryify(chris.research_data['additional_websites'])
dedict_websites

['Google Scholar https://scholar.google.com/citations?user=JZJZJLwAAAAJ&hl=en',
 'https://scholar.google.com/citations?user=JZJjJLwAAAAJ&hl=en',
 'https://www.linkedin.com/in/christopher-williams-7a1a5a5/',
 'https://www.mae.vt.edu/people/faculty/williams.html',
 'DREAMS Lab https://www.dreams.me.vt.edu/',
 'Virginia Tech Mechanical Engineering Faculty Page https://www.me.vt.edu/people/faculty/williams-chris.html']

## **.deduplicate** or make your own

In [13]:
chris.deduplicate(dedict_websites)

['Google Scholar https://scholar.google.com/citations?user=JZJZJLwAAAAJ&hl=en',
 'https://www.linkedin.com/in/christopher-williams-7a1a5a5/',
 'https://www.mae.vt.edu/people/faculty/williams.html',
 'DREAMS Lab https://www.dreams.me.vt.edu/']

In [11]:
chris.deduplicate(dedict_websites,threshold=97)

['Google Scholar https://scholar.google.com/citations?user=JZJZJLwAAAAJ&hl=en',
 'https://www.linkedin.com/in/christopher-williams-7a1a5a5/',
 'https://www.mae.vt.edu/people/faculty/williams.html',
 'DREAMS Lab https://www.dreams.me.vt.edu/',
 'Virginia Tech Mechanical Engineering Faculty Page https://www.me.vt.edu/people/faculty/williams-chris.html']

## note the difference above

In [14]:
chris.research_data['awards']

[{'name': 'Senior member, National Academy of Inventors', 'year': '2022'},
 {'name': '1st Place, Society of Manufacturing Engineers (SME) Digital Manufacturing Challenge, Faculty Advisor',
  'year': '2017'},
 {'name': '2nd Place ASME Student Manufacturing Design Competition, Faculty Advisor',
  'year': '2017'},
 {'name': 'Best Paper Award, International Solid Freeform Fabrication Symposium',
  'year': '2017'},
 {'name': '1st Place, America Makes Innovation Sprint: Smart Structures',
  'year': '2016'},
 {'name': 'SXSW Best Use of User Generated Content: VT DreamVendor, Juice Pharma Worldwide',
  'year': '2016'},
 {'name': 'Best Poster Award, International Solid Freeform Fabrication Symposium',
  'year': '2015'},
 {'name': 'Faculty Fellow, Virginia Tech College of Engineering',
  'year': '2015'},
 {'name': 'Outstanding Paper Award, Emerald Publishing Rapid Prototyping Journal',
  'year': '2014'},
 {'name': 'Best Paper Award, ASME IDETC 19th Design for Manufacturing and the Life Cycle Con

In [21]:
chris.research_data

{'name': ['Chris', 'Williams'],
 'institution': ['V', 'i', 'r', 'g', 'n', 'a', ' ', 'T', 'e', 'c', 'h'],
 'gender': ['M', 'a', 'l', 'e'],
 'domain': ['A', 'c', 'd', 'e', 'm', 'i'],
 'emails': ['cbwill@vt.edu'],
 'additional_websites': ['http://afinia.com/virginia-tech-dreamslab/',
  {'title': 'DREAMS Lab', 'url': 'https://www.dreams.me.vt.edu/'},
  {'title': 'Virginia Tech Mechanical Engineering Faculty',
   'url': 'https://www.me.vt.edu/people/faculty/williams-chris.html'},
  'https://www.linkedin.com/in/christopher-williams-7a5a3a5/',
  'https://scholar.google.com/citations?user=JZJZJLcAAAAJ&hl=en'],
 'department': ['Department of Engineering Education'],
 'research_focus': ['STEM Education, K-12 Outreach, Design for Additive Manufacturing',
  'Innovation and industry partnerships in 3D printing',
  'Innovations in AM processes and materials',
  'Design methodologies and tools to guide AM use',
  'Cyber-physical security for AM'],
 'research_fields': ['Engineering Education, Additive

In [19]:
from fuzzywuzzy import fuzz
def is_content_similar(name1, name2, threshold=80):
        similarity = fuzz.token_set_ratio(name1, name2)
        return similarity >= threshold

def deduplicate(names, threshold=85):
    unique_vals = []
    for name in names:
        if not any(is_content_similar(name, unique_val, threshold) for unique_val in unique_vals):
            unique_vals.append(name)
    return unique_vals

for fields in chris.research_data:
     chris.research_data[fields] = deduplicate(chris.research_data[fields])

In [20]:
chris.research_data

{'name': ['Chris', 'Williams'],
 'institution': ['V', 'i', 'r', 'g', 'n', 'a', ' ', 'T', 'e', 'c', 'h'],
 'gender': ['M', 'a', 'l', 'e'],
 'domain': ['A', 'c', 'd', 'e', 'm', 'i'],
 'emails': ['cbwill@vt.edu'],
 'additional_websites': ['http://afinia.com/virginia-tech-dreamslab/',
  {'title': 'DREAMS Lab', 'url': 'https://www.dreams.me.vt.edu/'},
  {'title': 'Virginia Tech Mechanical Engineering Faculty',
   'url': 'https://www.me.vt.edu/people/faculty/williams-chris.html'},
  'https://www.linkedin.com/in/christopher-williams-7a5a3a5/',
  'https://scholar.google.com/citations?user=JZJZJLcAAAAJ&hl=en'],
 'department': ['Department of Engineering Education'],
 'research_focus': ['STEM Education, K-12 Outreach, Design for Additive Manufacturing',
  'Innovation and industry partnerships in 3D printing',
  'Innovations in AM processes and materials',
  'Design methodologies and tools to guide AM use',
  'Cyber-physical security for AM'],
 'research_fields': ['Engineering Education, Additive