In [58]:
import requests
from bs4 import BeautifulSoup
import json
import os
import pandas as pd
import numpy as np

In [59]:
# Initial scrape of articles.
# Reuse the cached scrape when the cache file exists and parses as JSON;
# otherwise (cache missing or corrupt) scrape the help center again and
# rewrite the cache.
CACHE_PATH = 'highrise_faq.json'
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given

faq_article_items = None
if os.path.exists(CACHE_PATH):
    try:
        with open(CACHE_PATH, 'r', encoding='utf-8') as file:
            faq_article_items = json.load(file)
        print("json file loaded successfully")
    except json.JSONDecodeError as e:
        # Leave faq_article_items as None so the scrape below rebuilds the
        # cache instead of leaving the name undefined for later cells.
        print(f"Error decoding json: {e}")

if faq_article_items is None:
    url = "https://support.highrise.game/en/"
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Landing page -> collection pages -> article links.
    article_urls = []
    for collection in soup.select('.collection-link'):
        col_url = collection['href']
        col_resp = requests.get(col_url, timeout=REQUEST_TIMEOUT)
        col_resp.raise_for_status()
        col_soup = BeautifulSoup(col_resp.content, 'html.parser')
        for article in col_soup.select('.duration-250'):
            article_urls.append(article['href'])

    faq_article_items = []
    for article_url in article_urls:
        resp = requests.get(article_url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        article_soup = BeautifulSoup(resp.content, 'html.parser')
        title_tag = article_soup.select_one('.mb-1')
        article_tag = article_soup.find('article')
        if title_tag is None or not title_tag.contents or article_tag is None:
            # A page without the expected markup should not abort the whole scrape.
            print(f"Skipping {article_url}: missing title or <article> element")
            continue
        faq_article_items.append({
            'title': title_tag.contents[0],
            'html': article_tag.prettify(),
            'text': article_tag.get_text(),
            'link': article_url,
        })

    with open(CACHE_PATH, 'w', encoding='utf-8') as f:
        json.dump(faq_article_items, f, indent=4)

    print("FAQ data saved to highrise_faq.json")

json file loaded successfully


In [60]:
# Auto-annotate question/answer pairs: every <h2>/<h3> is treated as a question
# and the <p> tags that follow it (up to the next <h2>/<h3>) as its answer.
# Source URL, raw paragraph HTML and the article title are carried along.
def _answer_paragraphs(heading):
    """Collect the <p> elements between `heading` and the next <h2>/<h3>.

    Returns a `(texts, html)` pair of parallel lists: the stripped text of
    each paragraph and its serialized markup.
    """
    texts, html = [], []
    for element in heading.find_all_next():
        if element.name in ('h2', 'h3'):
            break
        if element.name == 'p':
            texts.append(element.get_text(strip=True))
            html.append(str(element))
    return texts, html


faq_data = []
for item in faq_article_items:
    url = item['link']
    soup = BeautifulSoup(item['html'], 'html.parser')
    related_articles = soup.find('section', class_='jsx-62724fba150252e0 related_articles my-6')
    if related_articles:
        related_articles.decompose()

    # All <h2> first, then all <h3>: later cells edit rows by positional label,
    # so the record order must stay exactly as it was.
    headings = soup.find_all('h2') + soup.find_all('h3')
    if headings:
        for heading in headings:
            paragraphs, paragraphs_html = _answer_paragraphs(heading)
            faq_data.append({
                "Question": heading.get_text(strip=True),
                "Answer": " ".join(paragraphs),
                "Source": url,
                "html": paragraphs_html,
                # Set for <h3> rows too; without it the value is NaN and a
                # later dropna() discards the row.
                "Embedded": item['title'],
            })
    else:
        # No headings: the whole article answers its own title.
        faq_data.append({
            "Question": item['title'],
            "Answer": soup.get_text(separator=" ", strip=True),
            "Source": url,
            # Store markup as a list of strings like the heading rows do,
            # not a live BeautifulSoup object.
            "html": [str(soup)],
            "Embedded": "notitle",
        })
    


In [61]:
# Build the working DataFrame from the annotated question/answer records
df = pd.DataFrame(data=faq_data)
# Keep the starting row count so a later cell can tell whether rows were removed
initial_row_count = len(df.index)
df.head()

Unnamed: 0,Question,Answer,Source,html,Embedded
0,What is Pinning?,Imagine being a wizard in your own app kingdom...,https://support.highrise.game/en/articles/8894...,[<p>\n Imagine being a wizard in your own ap...,📌Pinning
1,📍 Where can I use Pinning?,We've added Pinning to anywhere something can ...,https://support.highrise.game/en/articles/8894...,[<p>\n We've added Pinning to anywhere somet...,📌Pinning
2,Posts:,Want to show one of your posts at the top of y...,https://support.highrise.game/en/articles/8894...,[<p>\n Want to show one of your posts at the...,📌Pinning
3,Comments:,Has someone left you a nice comment you'd like...,https://support.highrise.game/en/articles/8894...,[<p>\n Has someone left you a nice comment y...,📌Pinning
4,Creations:,Fancy pinning some of your best work? No worri...,https://support.highrise.game/en/articles/8894...,[<p>\n Fancy pinning some of your best work?...,📌Pinning


In [62]:
### Preprocessing
def _to_ascii(value):
    """Drop non-ASCII characters from a string; return non-strings (e.g. NaN) unchanged."""
    if isinstance(value, str):
        return value.encode("ascii", "ignore").decode("ascii")
    return value


# Remove non-ASCII characters
for column in ("Question", "Answer", "Embedded"):
    df[column] = df[column].apply(_to_ascii)

# Strip whitespace
df["Question"] = df["Question"].str.strip().str.replace(r"^(?:Q:)?[\n\t]+", " ", regex=True)
df["Answer"] = df["Answer"].str.strip().str.replace(r"^(?:A:)?[\n\t]+", " ", regex=True)
df["Embedded"] = df["Embedded"].str.strip().str.replace(r"[\n\t]+", " ", regex=True)

# Drop rows with missing or empty values. The .copy() makes the filtered frame
# independent, so the column assignments below do not write to a slice of the
# original frame.
df = df.dropna()
df = df[(df["Question"] != "") & (df["Answer"] != "")].copy()
df["Embedded"] = df["Embedded"].str.replace("notitle", "", regex=True)

# Semi-manual edits from this point -- mostly question titles (to avoid duplicates).
# Keys are row labels from the unfiltered frame (the index has not been reset).
question_overrides = {
    2: "How do I Pin posts?",
    3: "How do I Pin comments?",
    4: "How do I Pin creations?",
    5: "How do I Pin showcases?",
    10: "How does the Delete Option work?",
    336: "What is Error 113?",
    337: "What is Error 114?",
    338: "What is Error 116?",
    339: "What is Error 118?",
    340: "What is Error 119?",
    342: "What is Error 120?",
    343: "What is Connection Error?",
    370: "More info for Highrise Concepts?",
    402: "Style Challenge tips?",
    403: "How to participate in Prank Events?",
    404: "How does scoring work in Prank Events?",
    405: "Tips for Prank Events?",
    407: "How to participate in Donate Events?",
    409: "How does scoring work in Donate Events?",
    410: "Tips for Donate Events?",
}
for row_label, question in question_overrides.items():
    if row_label in df.index:
        df.loc[row_label, 'Question'] = question
    else:
        # Assigning through .loc to a missing label would append a new,
        # mostly-NaN row, so report the skipped edit instead.
        print(f"Skipping manual edit for row {row_label}: row not present after cleaning")

# Text to embed: "[<article title>] <question> <answer>". The bracketed prefix
# is omitted when the title is empty, which avoids a leading space and leaves
# any literal "[]" inside the question or answer untouched.
title_prefix = ("[" + df["Embedded"] + "] ").where(df["Embedded"] != "", "")
df["Embedded"] = title_prefix + df["Question"] + " " + df["Answer"]

In [63]:
# Remove row label 9 by hand, but only when the row count is below the count
# recorded when the frame was first built.
# NOTE(review): the reason row 9 is excluded is not recorded in the notebook -- confirm.
# errors="ignore" keeps the cell re-runnable: without it, a second run (or a
# run where row 9 was already filtered out) raises KeyError.
if df.shape[0] < initial_row_count:
    df = df.drop(index=[9], errors="ignore")


In [64]:
# Export: renumber the rows from 0, write the frame to disk as JSON, then preview it
df = df.reset_index(drop=True)
df.to_json(path_or_buf="faq_processed.json", indent=4)
df.head()

Unnamed: 0,Question,Answer,Source,html,Embedded
0,What is Pinning?,Imagine being a wizard in your own app kingdom...,https://support.highrise.game/en/articles/8894...,[<p>\n Imagine being a wizard in your own ap...,[Pinning] What is Pinning? Imagine being a wiz...
1,Where can I use Pinning?,We've added Pinning to anywhere something can ...,https://support.highrise.game/en/articles/8894...,[<p>\n We've added Pinning to anywhere somet...,[Pinning] Where can I use Pinning? We've added...
2,How do I Pin posts?,Want to show one of your posts at the top of y...,https://support.highrise.game/en/articles/8894...,[<p>\n Want to show one of your posts at the...,[Pinning] How do I Pin posts? Want to show one...
3,How do I Pin commments?,Has someone left you a nice comment you'd like...,https://support.highrise.game/en/articles/8894...,[<p>\n Has someone left you a nice comment y...,[Pinning] How do I Pin commments? Has someone ...
4,How do I Pin creations?,Fancy pinning some of your best work? No worri...,https://support.highrise.game/en/articles/8894...,[<p>\n Fancy pinning some of your best work?...,[Pinning] How do I Pin creations? Fancy pinnin...
