In [2]:
import requests
from bs4 import BeautifulSoup
import json
import os
import pandas as pd
import numpy as np

In [40]:
# Initial scrape of articles.
# Reuses the cached scrape in faq_raw.json when it exists and parses; otherwise
# crawls the FAQ site (landing page -> subsection pages -> article pages) and
# writes the cache.
FAQ_CACHE_PATH = 'faq_raw.json'
FAQ_BASE_URL = "https://faq.enrollment.cornell.edu"
REQUEST_TIMEOUT = 30  # seconds; requests applies no timeout unless one is given


def _fetch_soup(session, page_url):
    """Download `page_url` and return it parsed with html.parser.

    Raises requests.HTTPError for 4xx/5xx responses so an error page is never
    parsed as if it were FAQ content.
    """
    page = session.get(page_url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')


faq_article_items = None
if os.path.exists(FAQ_CACHE_PATH):
    try:
        with open(FAQ_CACHE_PATH, 'r', encoding='utf-8') as file:
            faq_article_items = json.load(file)
        print("json file loaded successfully")
    except json.JSONDecodeError as e:
        # An unreadable cache must not leave faq_article_items as None for the
        # cells that follow; report it and rebuild the cache below.
        print(f"Error decoding json: {e}")

if faq_article_items is None:
    with requests.Session() as session:
        landing_soup = _fetch_soup(session, FAQ_BASE_URL + '/kb/')

        # Collect (breadcrumb, article URL) pairs from every subsection page.
        article_urls = []
        for section_link in landing_soup.find_all('a', class_='hf-vertically-top-aligned-container hf-subsection_title-link'):
            subgroup_soup = _fetch_soup(session, FAQ_BASE_URL + section_link['href'])
            breadcrumb_links = subgroup_soup.select('#main a')
            breadcrumb_text = ' > '.join(link.get_text(strip=True) for link in breadcrumb_links)
            for article_link in subgroup_soup.find_all('a', class_='hf-article-item-link'):
                article_urls.append((breadcrumb_text, FAQ_BASE_URL + article_link['href']))

        # Distinct loop names keep the base URL and the landing-page soup from
        # being silently rebound while iterating over articles.
        faq_article_items = []
        for crumb, article_url in article_urls:
            article_soup = _fetch_soup(session, article_url)
            title_tag = article_soup.find('div', class_='hf-article_title')
            article_tag = article_soup.find('div', class_='hf-article_content')
            if title_tag is None or article_tag is None:
                # find() returns None when the div is absent; skip with a
                # message instead of failing on None.get_text().
                print(f"Skipping {article_url}: article title or content not found")
                continue
            title = "[" + crumb + "] " + title_tag.get_text(strip=True)
            article_text = article_tag.get_text(separator=" ", strip=True)
            article_html = article_tag.prettify()
            faq_article_items.append({
                'Question': title,
                'Answer': article_text,
                'Source': article_url,
                'Embedded': title + " " + article_text,
                'html': article_html,
            })

    # Never cache an empty scrape: later runs would load it and skip scraping.
    if not faq_article_items:
        raise RuntimeError("No FAQ articles were scraped; not writing " + FAQ_CACHE_PATH)

    with open(FAQ_CACHE_PATH, 'w', encoding='utf-8') as f:
        json.dump(faq_article_items, f, indent=4)

    print("FAQ data saved to faq_raw.json")

json file loaded successfully


In [39]:
# Load the scraped articles into a DataFrame and remember the starting row count
df = pd.DataFrame(data=faq_article_items)
initial_row_count = len(df.index)
df.head(n=5)

Unnamed: 0,Question,Answer,Source,Embedded,html
0,[Home > About Cornell > Academics] What is the...,Cornell undergraduates get to know their profe...,https://faq.enrollment.cornell.edu/kb/article/...,[Home > About Cornell > Academics] What is the...,"<div class=""hf-article_content"">\n <p>\n <spa..."
1,[Home > About Cornell > Academics] Can I doubl...,Cornell offers nearly 80 majors and more than ...,https://faq.enrollment.cornell.edu/kb/article/...,[Home > About Cornell > Academics] Can I doubl...,"<div class=""hf-article_content"">\n <p>\n <spa..."
2,[Home > About Cornell > Academics] Does Cornel...,Several of the academic programs at Cornell of...,https://faq.enrollment.cornell.edu/kb/article/...,[Home > About Cornell > Academics] Does Cornel...,"<div class=""hf-article_content"">\n <p>\n <spa..."
3,[Home > About Cornell > Academics] What is Cor...,Cornell undergraduates can get to know their p...,https://faq.enrollment.cornell.edu/kb/article/...,[Home > About Cornell > Academics] What is Cor...,"<div class=""hf-article_content"">\n <p>\n <spa..."
4,[Home > About Cornell > Academics] How do I ge...,Cornell University’s Courses of Study is publi...,https://faq.enrollment.cornell.edu/kb/article/...,[Home > About Cornell > Academics] How do I ge...,"<div class=""hf-article_content"">\n <p>\n <spa..."


In [42]:
### Preprocessing
# Handle Unicode characters
def replace_unicode(text):
    """Return `text` with selected typographic Unicode characters replaced.

    Each character listed below is swapped for its plain-text stand-in; every
    other character is left as it is. The degree sign maps to itself, so it
    passes through unchanged.
    """
    # Every key is a single character and no replacement contains another
    # key, so one translate() pass gives the same result as replacing the
    # characters one after another.
    substitutions = str.maketrans({
        '\u00a0': ' ',      # no-break space
        '\u2019': "'",      # right single quotation mark
        '\u2014': '--',     # em dash
        '\u2022': '*',      # bullet
        '\u00a9': '(c)',    # copyright sign
        '\u2026': '...',    # horizontal ellipsis
        '\u2018': "'",      # left single quotation mark
        '\u201C': '"',      # left double quotation mark
        '\u201D': '"',      # right double quotation mark
        '\u00AE': '(R)',    # registered sign
        '\u202F': ' ',      # narrow no-break space
        '\u00B7': '.',      # middle dot
        '\u2039': '<',      # single left-pointing angle quotation mark
        '\u203A': '>',      # single right-pointing angle quotation mark
        '\u00B0': '°',      # degree sign (kept as is)
    })
    return text.translate(substitutions)

# Run the character replacement over the 'Embedded' column only and store the result back
embedded_normalized = df['Embedded'].apply(replace_unicode)
df['Embedded'] = embedded_normalized

In [43]:
# Export the processed frame to faq_processed.json, then preview the first rows
df.to_json(path_or_buf="faq_processed.json", indent=4)
df.head(n=5)

Unnamed: 0,Question,Answer,Source,Embedded,html
0,[Home > About Cornell > Academics] What is the...,Cornell undergraduates get to know their profe...,https://faq.enrollment.cornell.edu/kb/article/...,[Home > About Cornell > Academics] What is the...,"<div class=""hf-article_content"">\n <p>\n <spa..."
1,[Home > About Cornell > Academics] Can I doubl...,Cornell offers nearly 80 majors and more than ...,https://faq.enrollment.cornell.edu/kb/article/...,[Home > About Cornell > Academics] Can I doubl...,"<div class=""hf-article_content"">\n <p>\n <spa..."
2,[Home > About Cornell > Academics] Does Cornel...,Several of the academic programs at Cornell of...,https://faq.enrollment.cornell.edu/kb/article/...,[Home > About Cornell > Academics] Does Cornel...,"<div class=""hf-article_content"">\n <p>\n <spa..."
3,[Home > About Cornell > Academics] What is Cor...,Cornell undergraduates can get to know their p...,https://faq.enrollment.cornell.edu/kb/article/...,[Home > About Cornell > Academics] What is Cor...,"<div class=""hf-article_content"">\n <p>\n <spa..."
4,[Home > About Cornell > Academics] How do I ge...,Cornell University’s Courses of Study is publi...,https://faq.enrollment.cornell.edu/kb/article/...,[Home > About Cornell > Academics] How do I ge...,"<div class=""hf-article_content"">\n <p>\n <spa..."
