# Setup

In [1]:
# System imports
import os
import re

# Environment variable management
from dotenv import load_dotenv

# HTTP requests
import requests

# Web scraping
from bs4 import BeautifulSoup

# NLP processing
import spacy

# Ranking algorithm
from rank_bm25 import BM25Okapi

# OpenAI API
from openai import OpenAI

# User interface
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the environment variables
load_dotenv()

# Retrieve the API key from the environment variable
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [3]:
# Define API endpoints
list_posts_url = 'https://www.techinasia.com/wp-json/techinasia/2.0/posts'
search_posts_url = 'https://www.techinasia.com/wp-json/techinasia/2.0/articles?query='

# Fetch Posts from API

In [4]:
# Function to fetch posts from API
def fetch_posts_from_api(page):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(list_posts_url, params={'page': page}, headers=headers)
        response.raise_for_status()
        print("Status Code:", response.status_code)
        # print("Response Headers:", response.headers)
        # print("Response Text:", response.text)

        post_data = response.json()

        return post_data['posts']
    
    except requests.exceptions.RequestException as e:
        print(f'Error fetching posts: {e}')
        return None
    except ValueError as e:
        print(f'Error parsing response: {e}')
        return None

In [5]:
# Fetch posts
page = 1
posts = fetch_posts_from_api(page=page)
print(f"Fetched {len(posts)} posts.")

# # Display the first post to verify the structure
# if posts:
#     print(posts[0])

Status Code: 200
Fetched 30 posts.


# Search Posts from API

In [6]:
# # Function to search posts from API
# def search_posts(query):
#     try:
#         headers = {
#             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
#         }
#         response = requests.get(search_posts_url, params={'query': query}, headers=headers)

#         if response.status_code == 200:
#             return response.json()
#         else:
#             return None
    
#     except requests.exceptions.RequestException as e:
#         print(f'Error fetching posts: {e}')
#         return None
#     except ValueError as e:
#         print(f'Error parsing response: {e}')
#         return None

In [7]:
# Function to search posts from API
def search_posts(keywords):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(search_posts_url, params={'query': keywords}, headers=headers)

        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                print(f'Error parsing response JSON: {e}')
                return []
        else:
            print(f'Error fetching posts: HTTP {response.status_code}')
            return []
    
    except requests.exceptions.RequestException as e:
        print(f'Error fetching posts: {e}')
        return []

# Preprocessing

## Query Preprocessing

In [8]:
# Load the spacy model
nlp = spacy.load('en_core_web_sm')

In [9]:
def preprocess_query(query):
    doc = nlp(query)
    keywords = []
    
    for token in doc:
        if not token.is_stop and not token.is_punct:
            # Add named entities and nouns to keywords list
            if token.ent_type_ or token.pos_ in ['NOUN', 'PROPN', 'ADJ']:
                keywords.append(token.lemma_)
    
    return keywords

In [10]:
# Example query
user_query = "Tell me about Grab’s profitability and recent news."
keywords = preprocess_query(user_query)
print("Extracted Keywords:", keywords)

Extracted Keywords: ['Grab', 'profitability', 'recent', 'news']


## Retrieving Articles

In [11]:
def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Handle special characters
    text = re.sub(r'&amp;', '&', text)

    # Normalise whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove source and citation tags
    text = re.sub(r'\[.*?\]+', '', text)

    return text

In [13]:
# Fetch search results
search_results = search_posts(keywords)

# Count the number of articles
if search_results and 'posts' in search_results and 'hits' in search_results['posts']:
    articles_count = len(search_results['posts']['hits'])
    print(f"Number of articles found: {articles_count}")
else:
    print("No articles found.")

# Initialize an empty list to hold the articles
articles = []

# Display the search results
if search_results and 'posts' in search_results and 'hits' in search_results['posts']:
    search_posts = search_results['posts']['hits']

    # Append each article's title and content to the articles list
    articles = [post['title'] + post['content'] for post in search_posts[:30]]

    # Apply preprocessing to each article
    cleaned_articles = [preprocess_text(article) for article in articles]
else:
    cleaned_articles = ['No relevant posts found.']

# Example to show how to use the preprocessed articles list
for idx, article in enumerate(cleaned_articles):
    print(f"Article {idx + 1}: {article}")

Number of articles found: 30
Article 1: Podcast: Rounding up SEA tech news with Momentum Works Sea and Grab recently released earnings results and as always, any financial news regarding the two Southeast Asian giants gets a lot of attention. No wonder then that the first episode of the new Momentum Works podcast focused on the two firms. The venture builder and research firm recently launched Impulso, a podcast that dives into tech trends in Asia. Featuring CEO Jianggang Li, engagement manager Sabrina Chong, and insights lead Saniya Ramchandani, the episode highlights the competition Sea faces in the ecommerce space, as well as stagnating gross merchandise value for Grab. The news roundup also touches on regulations for generative AI, and Luckin Coffee’s attempted comeback. Here are four highlights we picked out from the episode: 4:45 – Why Momentum Works’ analysts estimate that Shopee is operating with a 10% take rate 16:59 – Why Grab will probably achieve its goal of profitability i

## Selecting Most Relevant Articles using BM25

In [14]:
# Check if there are any preprocessed articles to rank
if cleaned_articles and cleaned_articles[0] != 'No relevant posts found.':
    # Tokenize the articles
    tokenized_articles = [article.split() for article in cleaned_articles]

    # Initialize BM25
    bm25 = BM25Okapi(tokenized_articles)

    # Tokenize the query
    tokenized_query = " ".join(keywords).split()

    # Get BM25 scores
    scores = bm25.get_scores(tokenized_query)
    
    # Get the indices of the top 5 articles
    top_5_indices = scores.argsort()[::-1][:5]
    
    # Select the top 5 articles based on BM25 scores
    top_5_articles = [cleaned_articles[i] for i in top_5_indices]
    
    # Display the top 5 articles
    for idx, article in enumerate(top_5_articles):
        print(f"Rank {idx + 1} Article (Score: {scores[top_5_indices[idx]]}): {article[:500]}")  # Display first 500 characters for brevity
else:
    print("No relevant posts found.")

Rank 1 Article (Score: 8.340306071774805): Podcast: Rounding up SEA tech news with Momentum Works Sea and Grab recently released earnings results and as always, any financial news regarding the two Southeast Asian giants gets a lot of attention. No wonder then that the first episode of the new Momentum Works podcast focused on the two firms. The venture builder and research firm recently launched Impulso, a podcast that dives into tech trends in Asia. Featuring CEO Jianggang Li, engagement manager Sabrina Chong, and insights lead Saniya 
Rank 2 Article (Score: 5.700262820441736): Did you expect anything but more Grab news?Sign up for the Daily Newsletter, sent exclusively to our Premium subscribers. We break down the big and messy topics of Asia’s tech and startup community. Get the newsletter in your inbox everyday with a Premium subscription. Hello readers, When scouting for players to sign on Football Manager, a simulation game that I play, my scouts will sometimes report that a pla

## Comparing Retrieval Methods

# Generate Response

In [None]:
# Function to process search results and generate background knowledge
def generate_context(search_results):
    if search_results and 'posts' in search_results and 'hits' in search_results['posts']:
        search_posts = search_results['posts']['hits']

        # background knowledge with title and content
        raw_articles = "".join([post['title'] + post['content'] for post in search_posts[:3]])
    else:
        raw_articles = 'No relevant posts found.'
    return raw_articles

In [None]:
# Function to generate responses using OpenAI
def generate_response(search_results, query):
    try:
        background_knowledge = generate_context(search_results)

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Answer the question based on the question asked and background knowledge provided below, and if the question can't be answered based on the context, say \"I don't know\"\n\n"},
                {"role": "user", "content": f"Question: {query}\nBackground Knowledge: {background_knowledge}\nAnswer:"}
            ],
            max_tokens=100
        )
        return response.choices[0].message.content
    
    except Exception as e:
        print(f'Error generating response: {e}')
        return None

In [None]:
# # Test the response generation
# query = 'How is Grab performing in the market?'

# search_results = search_posts(query)
# generate_response(search_results, query)

# Interface

In [None]:
# # Build the Gradio interface
# def rag_interface(query):
#     search_results = search_posts(query)
#     return generate_response(search_results, query)

# iface = gr.Interface(fn=rag_interface, inputs="text", outputs="text", title='Tech in Asia RAG System', description='Ask a question about Tech in Asia and get an answer based on the context of the latest posts.')
# iface.launch()