This project is an attempt to load data from a MyBB-based online special-interest forum and run questions against the result using ChatGPT.

The project is based on the "Building a RAG application from scratch" video tutorial by YouTube user Underfitted - https://www.youtube.com/watch?v=BrsocJb-fAo

Step 1: Install Dependencies

In [None]:
pip install langchain langchain_openai langchain_core

In [None]:
pip install bs4

In [None]:
pip  install -U docarray

In [None]:
pip  install pydantic==1.10.9

Step 2: Import libraries

In [None]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain.prompts import ChatPromptTemplate
from langchain_community.vectorstores import DocArrayInMemorySearch

In [None]:
from bs4 import BeautifulSoup, SoupStrainer
from datetime import datetime
import requests

Step 3: Define Constants

MAX_FORUMS, MAX_THREADS_PER_FORUM - limit the breadth of the data load for testing purposes. A value of -1 loads the full collection.
OPENAI_API_KEY - key to the ChatGPT API - to be procured and supplied by the user.
BASE_URL - the base URL of the forum in question. The actual value is not included for fair-use reasons and must be supplied by the user. During testing, an open online special-interest automotive forum was used.

In [None]:
# Caps on how much of the forum is loaded while testing; -1 means "no cap".
MAX_FORUMS = MAX_THREADS_PER_FORUM = -1

In [None]:
import os

# Base URL of the forum to crawl, without a trailing slash: paths such as
# '/forums/' are appended to it directly.
BASE_URL = ""
# OpenAI API key. Taken from the OPENAI_API_KEY environment variable when it is
# set, so the secret does not have to be saved inside the notebook; otherwise
# replace the empty fallback below with your key.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# Prompt text for the chat model. The {context} and {question} placeholders are
# substituted when the prompt template is rendered.
template = '''
Answer the following question based on provided context. If you can't answer the question, reply "I don't know."

Context: {context}

Question: {question}

'''

Step 4: Initialize a model

In [None]:
# Building blocks of the chain: the prompt rendered from `template`, the
# OpenAI chat model, and a parser that returns the model reply as a string.
prompt = ChatPromptTemplate.from_template(template)
parser = StrOutputParser()
model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)

Step 5: Define scraping code and scrape the pages

In [None]:
def scrape_page(url, timeout=30):
    """Download a page and return the text of the post elements found on it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            crawl indefinitely.

    Returns:
        A list of strings: the text of every ``div`` with class
        ``bbCodeBlock-content`` followed by the text of every ``div`` with
        class ``bbWrapper``.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when ``timeout`` expires.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    posts = []
    # Order matters to callers that rely on the original layout: all
    # bbCodeBlock-content matches first, then all bbWrapper matches.
    for css_class in ("bbCodeBlock-content", "bbWrapper"):
        # find_all is the supported name; findAll is a deprecated alias.
        matches = soup.find_all("div", attrs={"class": css_class})
        posts.extend(div.get_text() for div in matches)
    return posts

def find_links(base_url, page_url, prefix, timeout=30):
    """Collect the distinct links on a page whose href contains ``prefix``.

    Args:
        base_url: String prepended to every matching href to build the
            returned URL.
        page_url: Address of the page whose anchors are inspected.
        prefix: Substring an href must contain to be kept. An href that is
            exactly equal to ``prefix`` is skipped.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the crawl forever.

    Returns:
        A list of ``base_url + href`` strings without duplicates, in the
        order the links first appear on the page.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when ``timeout`` expires.
    """
    response = requests.get(page_url, timeout=timeout)
    # Parse once, and only the <a> tags: nothing else on the page is needed.
    anchors = BeautifulSoup(response.text, "html.parser", parse_only=SoupStrainer('a'))
    # A dict keeps insertion order, giving de-duplication with a stable,
    # reproducible ordering (a set would return links in arbitrary order).
    unique_links = {}
    for link in anchors:
        if link.has_attr('href') and prefix in link['href'] and link['href'] != prefix:
            unique_links[base_url + link['href']] = None
    return list(unique_links)

Run the scraping process
This cell takes 1hr+ to run, so do not rerun it unless needed

In [None]:
# Crawl the forum: list the forums, gather thread links from each forum, then
# scrape every thread into `posts`. Failures on single threads are reported
# and skipped so one bad page does not abort the whole run.
start_dt = datetime.now()
print("Start time: ", start_dt)
forums_list = find_links(BASE_URL, BASE_URL + '/forums/', '/forums/')
threads_list = []
forums_count = 0
for forum in forums_list:
    # `>=` so that exactly MAX_FORUMS forums are visited (the check runs
    # before the counter is incremented).
    if MAX_FORUMS > 0 and forums_count >= MAX_FORUMS:
        break
    forums_count += 1
    print("Loading links from " + forum)
    result = find_links(BASE_URL, forum, '/threads/')
    # Apply the thread cap to each forum separately, as the constant's name
    # states, rather than to the combined list of all forums.
    if MAX_THREADS_PER_FORUM > 0:
        result = result[:MAX_THREADS_PER_FORUM]
    threads_list += result
posts = []
threads_count = 0
for thread in threads_list:
    threads_count += 1
    try:
        posts += scrape_page(thread)
    except Exception as exc:
        # Not a bare `except:` so Ctrl-C / kernel interrupt still stops the run.
        print("Scraping of " + thread + " failed: " + repr(exc))
    else:
        print("Scraping " + thread)
end_dt = datetime.now()
# The difference of two datetimes is measured in seconds, not milliseconds.
elapsed_seconds = (end_dt - start_dt).total_seconds()
print(len(posts), " posts in ", elapsed_seconds, "s, end time:", end_dt)

Step 6: Convert scraped data to embeddings and save in a doc array

In [None]:
# Embed every scraped post with the OpenAI embedding model and keep the
# resulting vectors in an in-memory store.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectorstore1 = DocArrayInMemorySearch.from_texts(
    texts=posts,
    embedding=embeddings,
)
print("embdeddings stored")

Step 7: Set up retriever

In [None]:
# The retriever receives the question and supplies the "context" value, while
# the question itself is passed through unchanged; both feed the prompt.
retriever1 = vectorstore1.as_retriever()
setup = RunnableParallel(
    {
        "context": retriever1,
        "question": RunnablePassthrough(),
    }
)
chain = setup | prompt | model | parser

Step 8: Define a question and run the chain

In [None]:
# Question passed to the chain. The original literal was missing its closing
# quote, which made the cell a syntax error.
question = "How hard is it to replace a head gasket?"

In [None]:
# Run the question through the chain. As the last expression in the cell, the
# value returned by invoke() is what the notebook displays.
chain.invoke(question)