# 1 Setup

In [1]:
# System imports
import os
import re

# Environment variable management
from dotenv import load_dotenv

# HTTP requests
import requests

# Web scraping
from bs4 import BeautifulSoup

# NLP processing
import spacy

# Load the spacy model
nlp = spacy.load('en_core_web_sm')

# Ranking algorithm
from rank_bm25 import BM25Okapi

# OpenAI API
from openai import OpenAI

# User interface
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the environment variables
load_dotenv()

# Retrieve the API key from the environment variable
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [3]:
# Define API endpoints
list_posts_url = 'https://www.techinasia.com/wp-json/techinasia/2.0/posts'
search_posts_url = 'https://www.techinasia.com/wp-json/techinasia/2.0/articles?query='

# 2 Data Retrieval

## 2.1 Fetch Post by Page

In [4]:
# Function to fetch posts from API
def fetch_posts(page):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(list_posts_url, params={'page': page}, headers=headers)

        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                print(f'Error parsing response JSON: {e}')
                return []
        else:
            print(f'Error fetching posts: HTTP {response.status_code}')
            return []
    
    except requests.exceptions.RequestException as e:
        print(f'Error fetching posts: {e}')
        return []

## 2.2 Search Post by Keywords

In [5]:
# Function to search posts from API
def search_posts(keywords):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(search_posts_url, params={'query': keywords}, headers=headers)

        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                print(f'Error parsing response JSON: {e}')
                return []
        else:
            print(f'Error fetching posts: HTTP {response.status_code}')
            return []
    
    except requests.exceptions.RequestException as e:
        print(f'Error fetching posts: {e}')
        return []

# 3 Preprocessing

## 3.1 Determining Which API Call to Use

In [6]:
# Define entity types to ignore
ignore_entities = {'DATE', 'TIME'}

def classify_query(query):
    # Process the question with spaCy's NER model
    doc = nlp(query)
    # Debugging: Print detected entities and their labels
    for ent in doc.ents:
        print(f"Entity: {ent.text}, Label: {ent.label_}")
    # Check if any entities are present that are not in the ignore list
    if any(ent.label_ not in ignore_entities for ent in doc.ents):
        return "specific query"
    else:
        return "general query"

# Example questions
queries = [
    "Tell me about Grab’s profitability.",
    "Give me a history of HonestBee.",
    "What are the top startups in Indonesia?",
    "What are the upcoming Events at Tech in Asia?",
    "What's happening today?"
]

# Classify each question
classified_queries = {query: classify_query(query) for query in queries}

# Display the results
for query, category in classified_queries.items():
    print(f"Query: {query}\nCategory: {category}\n")

Entity: Grab, Label: NORP
Entity: HonestBee, Label: ORG
Entity: Indonesia, Label: GPE
Entity: Asia, Label: LOC
Entity: today, Label: DATE
Query: Tell me about Grab’s profitability.
Category: specific query

Query: Give me a history of HonestBee.
Category: specific query

Query: What are the top startups in Indonesia?
Category: specific query

Query: What are the upcoming Events at Tech in Asia?
Category: specific query

Query: What's happening today?
Category: general query



## 3.2 Extracting Keywords from User Query

In [7]:
def preprocess_query(query):
    doc = nlp(query)
    keywords = []
    
    for token in doc:
        if not token.is_stop and not token.is_punct:
            # Add named entities and nouns to keywords list
            if token.ent_type_ or token.pos_ in ['NOUN', 'PROPN', 'ADJ']:
                keywords.append(token.lemma_)
    
    return keywords

In [8]:
# Example query
query = "What’s happening today?"
keywords = preprocess_query(query)
print("Extracted Keywords:", keywords)
classify_query(query)

# query = "Tell me about Grab’s profitability."
# keywords = preprocess_query(query)
# print("Extracted Keywords:", keywords)

# query = "Give me a history of HonestBee."
# keywords = preprocess_query(query)
# print("Extracted Keywords:", keywords)

# query = "What are the top startups in Indonesia?"
# keywords = preprocess_query(query)
# print("Extracted Keywords:", keywords)

# query = "What are the upcoming Events at Tech in Asia?"
# keywords = preprocess_query(query)
# print("Extracted Keywords:", keywords)

Extracted Keywords: ['today']
Entity: today, Label: DATE


'general query'

## 3.3 Extracting Relevant Articles

### 3.3.1 Retrieving All Articles

In [9]:
# Function to clean up the article content
def preprocess_article(article):
    # Remove HTML tags
    article = BeautifulSoup(article, 'html.parser').get_text()

    # Remove URLs
    article = re.sub(r'http\S+', '', article)

    # Handle special characters
    article = re.sub(r'&amp;', '&', article)

    # Normalise whitespace
    article = re.sub(r'\s+', ' ', article)

    # Remove source and citation tags
    article = re.sub(r'\[.*?\]+', '', article)

    return article

In [10]:
def fetch_and_preprocess_articles(query_type, search_results):
    # Initialize an empty list to hold the articles
    articles = []

    if query_type == "specific query":
        # Display the search results
        if search_results and 'posts' in search_results and 'hits' in search_results['posts']:
            search_posts = search_results['posts']['hits']

            # Append each article's title and content to the articles list
            articles = [post['title'] + post['content'] for post in search_posts[:30]]

            # Apply preprocessing to each article
            cleaned_articles = [preprocess_article(article) for article in articles]
        else:
            cleaned_articles = ['No relevant posts found.']
    else:
         # Display the search results
        if search_results and 'posts' in search_results:
            search_posts = search_results['posts']

            # Append each article's title and content to the articles list
            articles = [post['title'] + post['content'] for post in search_posts[:30]]

            # Apply preprocessing to each article
            cleaned_articles = [preprocess_article(article) for article in articles]
        else:
            cleaned_articles = ['No relevant posts found.']


    return cleaned_articles

In [11]:
# Count the number of articles in the search results
def count_articles(search_results):
    if search_results and 'posts' in search_results and 'hits' in search_results['posts']:
        articles_count = len(search_results['posts']['hits'])
        return articles_count
    return 0

In [12]:
# Display articles
def display_articles(articles):
    for idx, article in enumerate(articles):
        print(f"Article {idx + 1}: {article}")

In [13]:
if classify_query(query) == "specific query":
    # Search search 
    search_results = search_posts(keywords)
    query_type = "specific query"
else:
    # Fetch search
    search_results = fetch_posts(page=1)
    query_type = "general query"

# Count the number of articles
articles_count = count_articles(search_results)
print(f"Number of articles found: {articles_count}")

# Fetch and preprocess articles
cleaned_articles = fetch_and_preprocess_articles(query_type, search_results)

# Display the preprocessed articles
# display_articles(cleaned_articles)

Entity: today, Label: DATE
Number of articles found: 0


### 3.3.2 Ranking Articles with BM25

In [14]:
def rank_articles(cleaned_articles, keywords):
    if cleaned_articles and cleaned_articles[0] != 'No relevant posts found.':
        # Tokenize the articles
        tokenized_articles = [article.split() for article in cleaned_articles]

        # Initialize BM25
        bm25 = BM25Okapi(tokenized_articles)

        # Tokenize the query
        tokenized_query = " ".join(keywords).split()

        # Get BM25 scores
        scores = bm25.get_scores(tokenized_query)
        
        # Get the indices of the top 5 articles
        top_5_indices = scores.argsort()[::-1][:5]
        
        # Select the top 5 articles based on BM25 scores
        top_5_articles = [cleaned_articles[i] for i in top_5_indices]

        # Display the top 5 articles
        for idx, article in enumerate(top_5_articles):
            # Display first 100 characters for brevity
            print(f"Rank {idx + 1} Article (Score: {scores[top_5_indices[idx]]}): {article[:100]}")

        return top_5_articles
    else:
        print("No relevant articles found.")
        return []

In [15]:
# Rank articles
top_5_articles = rank_articles(cleaned_articles, keywords)

Rank 1 Article (Score: 2.8908982794513416): Vietnam’s tech future isn’t waiting. Are you?Vietnam’s tech scene is on the cusp of something big. D
Rank 2 Article (Score: 1.2375700325868246): Singapore’s new PM has his work cut out for himSign up for the Daily Newsletter, sent exclusively to
Rank 3 Article (Score: 0.0): Glasswall: The things anonymous founders say about VCs In the startup world, choosing the right inve
Rank 4 Article (Score: 0.0): These are the most active investors in Israel’s startupsWhich investors are the most active in Israe
Rank 5 Article (Score: 0.0): India’s Zypp Electric nets $15m to expand EV fleet Photo credit: Zypp Electric India-based Zypp Elec


## 3.4 Comparing Retrieval Methods

# 4 RAG Implementation

## 4.1 Generate Context for LLM

In [16]:
def generate_background_knowledge(top_articles):
    # Ensure top_articles is a list of strings
    if not isinstance(top_articles, list):
        raise TypeError("top_articles must be a list")
    if not all(isinstance(article, str) for article in top_articles):
        raise TypeError("All elements in top_articles must be strings")
    
    # Combine the content of the top articles into a single string
    background_knowledge = "\n\n".join(top_articles)
    
    return background_knowledge

In [17]:
# Generate background knowledge
background_knowledge = generate_background_knowledge(top_5_articles)

## 4.2 Generate Response via OpenAI

In [18]:
# Function to generate responses using OpenAI
def generate_response(query, background_knowledge):
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Answer the question based on the question asked and background knowledge provided below"},
                {"role": "user", "content": f"Question: {query}\nBackground Knowledge: {background_knowledge}\nAnswer:"}
            ],
            max_tokens=150
        )
        return response.choices[0].message.content
    
    except Exception as e:
        print(f'Error generating response: {e}')
        return None

In [19]:
# generate_response(query, background_knowledge)

## 4.3 Gradio Interface

In [21]:
# Build the Gradio interface
def rag_interface(query):
    keywords = preprocess_query(query)

    if classify_query(query) == "specific query":
        # Search search 
        search_results = search_posts(keywords)
        query_type = "specific query"
    else:
        # Fetch search
        search_results = fetch_posts(page=1)
        query_type = "general query"

    cleaned_articles = fetch_and_preprocess_articles(query_type, search_results)
    top_5_articles = rank_articles(cleaned_articles, keywords)
    background_knowledge = generate_background_knowledge(top_5_articles)
    
    return generate_response(query, background_knowledge)

iface = gr.Interface(fn=rag_interface, inputs="text", outputs="text", title='Tech in Asia RAG System', description='Ask a question about Tech in Asia and get an answer based on the context of the latest posts.')
# iface.launch()