# 1 Setup

In [1]:
# System imports
import os
import re

# Environment variable management
from dotenv import load_dotenv

# HTTP requests
import requests

# Web scraping
from bs4 import BeautifulSoup

# NLP processing
import spacy

# Ranking algorithm
from rank_bm25 import BM25Okapi

# OpenAI API
from openai import OpenAI

# User interface
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the environment variables
load_dotenv()

# Retrieve the API key from the environment variable
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [3]:
# Define API endpoints
list_posts_url = 'https://www.techinasia.com/wp-json/techinasia/2.0/posts'
search_posts_url = 'https://www.techinasia.com/wp-json/techinasia/2.0/articles?query='

# 2 Data Retrieval

## 2.1 Fetch Post by Page

In [4]:
# Function to fetch posts from API
def fetch_posts_from_api(page):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(list_posts_url, params={'page': page}, headers=headers)
        response.raise_for_status()
        print("Status Code:", response.status_code)
        # print("Response Headers:", response.headers)
        # print("Response Text:", response.text)

        post_data = response.json()

        return post_data['posts']
    
    except requests.exceptions.RequestException as e:
        print(f'Error fetching posts: {e}')
        return None
    except ValueError as e:
        print(f'Error parsing response: {e}')
        return None

In [5]:
# Fetch posts
page = 1
posts = fetch_posts_from_api(page=page)
print(f"Fetched {len(posts)} posts.")

# # Display the first post to verify the structure
# if posts:
#     print(posts[0])

Status Code: 200
Fetched 30 posts.


## 2.2 Search Post by Keywords

In [6]:
# Function to search posts from API
def search_posts(keywords):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(search_posts_url, params={'query': keywords}, headers=headers)

        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                print(f'Error parsing response JSON: {e}')
                return []
        else:
            print(f'Error fetching posts: HTTP {response.status_code}')
            return []
    
    except requests.exceptions.RequestException as e:
        print(f'Error fetching posts: {e}')
        return []

# 3 Preprocessing

## 3.1 Extracting Keywords from User Query

In [7]:
# Load the spacy model
nlp = spacy.load('en_core_web_sm')

In [8]:
def preprocess_query(query):
    doc = nlp(query)
    keywords = []
    
    for token in doc:
        if not token.is_stop and not token.is_punct:
            # Add named entities and nouns to keywords list
            if token.ent_type_ or token.pos_ in ['NOUN', 'PROPN', 'ADJ']:
                keywords.append(token.lemma_)
    
    return keywords

In [9]:
# Example query
query = "What’s happening today?"
keywords = preprocess_query(query)
print("Extracted Keywords:", keywords)

query = "Tell me about Grab’s profitability."
keywords = preprocess_query(query)
print("Extracted Keywords:", keywords)

# query = "Give me a history of HonestBee."
# keywords = preprocess_query(query)
# print("Extracted Keywords:", keywords)

# query = "What are the top startups in Indonesia?"
# keywords = preprocess_query(query)
# print("Extracted Keywords:", keywords)

# query = "What are the upcoming Events at Tech in Asia?"
# keywords = preprocess_query(query)
# print("Extracted Keywords:", keywords)

Extracted Keywords: ['today']
Extracted Keywords: ['Grab', 'profitability']


## 3.2 Extracting Relevant Articles

### 3.2.1 Retrieving All Articles

In [10]:
# Function to clean up the article content
def preprocess_article(article):
    # Remove HTML tags
    article = BeautifulSoup(article, 'html.parser').get_text()

    # Remove URLs
    article = re.sub(r'http\S+', '', article)

    # Handle special characters
    article = re.sub(r'&amp;', '&', article)

    # Normalise whitespace
    article = re.sub(r'\s+', ' ', article)

    # Remove source and citation tags
    article = re.sub(r'\[.*?\]+', '', article)

    return article

In [11]:
def fetch_and_preprocess_articles(search_results):
    # Initialize an empty list to hold the articles
    articles = []

    # Display the search results
    if search_results and 'posts' in search_results and 'hits' in search_results['posts']:
        search_posts = search_results['posts']['hits']

        # Append each article's title and content to the articles list
        articles = [post['title'] + post['content'] for post in search_posts[:30]]

        # Apply preprocessing to each article
        cleaned_articles = [preprocess_article(article) for article in articles]
    else:
        cleaned_articles = ['No relevant posts found.']

    return cleaned_articles

In [12]:
# Count the number of articles in the search results
def count_articles(search_results):
    if search_results and 'posts' in search_results and 'hits' in search_results['posts']:
        articles_count = len(search_results['posts']['hits'])
        return articles_count
    return 0

In [13]:
# Display articles
def display_articles(articles):
    for idx, article in enumerate(articles):
        print(f"Article {idx + 1}: {article}")

In [14]:
# Fetch search results
search_results = search_posts(keywords)

# Count the number of articles
articles_count = count_articles(search_results)
print(f"Number of articles found: {articles_count}")

# Fetch and preprocess articles
cleaned_articles = fetch_and_preprocess_articles(search_results)

# Display the preprocessed articles
# display_articles(cleaned_articles)

Number of articles found: 30


### 3.2.2 Ranking Articles with BM25

In [15]:
def rank_articles(cleaned_articles, keywords):
    if cleaned_articles and cleaned_articles[0] != 'No relevant posts found.':
        # Tokenize the articles
        tokenized_articles = [article.split() for article in cleaned_articles]

        # Initialize BM25
        bm25 = BM25Okapi(tokenized_articles)

        # Tokenize the query
        tokenized_query = " ".join(keywords).split()

        # Get BM25 scores
        scores = bm25.get_scores(tokenized_query)
        
        # Get the indices of the top 5 articles
        top_5_indices = scores.argsort()[::-1][:5]
        
        # Select the top 5 articles based on BM25 scores
        top_5_articles = [cleaned_articles[i] for i in top_5_indices]

        # Display the top 5 articles
        for idx, article in enumerate(top_5_articles):
            # Display first 100 characters for brevity
            print(f"Rank {idx + 1} Article (Score: {scores[top_5_indices[idx]]}): {article[:100]}")

        return top_5_articles
    else:
        print("No relevant articles found.")
        return []

In [16]:
# Rank articles
top_5_articles = rank_articles(cleaned_articles, keywords)

Rank 1 Article (Score: 2.9279612297320794): Grab posts $525m in Q1 revenue, eyes profitability by Q4 Grab's revenue in Q1 2023 jumped 130% year 
Rank 2 Article (Score: 2.404540115015261): Tracing Uber’s and Grab’s path to profitabilitySign up for the Daily Newsletter, sent exclusively to
Rank 3 Article (Score: 2.19098161571723): GoTo’s on-demand segment nears its profitability destinationSign up for the Daily Newsletter, sent e
Rank 4 Article (Score: 2.0630338974564078): GrabKitchen falls victim to super app’s profitability pushWelcome to the Opening Bell 🔔! Delivered e
Rank 5 Article (Score: 1.852005715144181): Indonesian beauty firm spills the beans on profitabilitySign up for the Daily Newsletter, sent exclu


## 3.3 Summarisation

In [17]:
# from transformers import pipeline

# # Load a summarization pipeline
# summarizer = pipeline("summarization")

# def summarize_article(article, max_length=150):
#     summary = summarizer(article, max_length=max_length, min_length=30, do_sample=False)
#     return summary[0]['summary_text']

# def generate_background_knowledge(top_articles):
#     """
#     Generates background knowledge by summarizing and combining the content of the top articles.
    
#     Parameters:
#     top_articles (list of str): List of articles' content.
    
#     Returns:
#     str: Combined content of the summarized top articles.
#     """
#     summarized_articles = [summarize_article(article) for article in top_articles]
#     background_knowledge = "\n\n".join(summarized_articles)
    
#     return background_knowledge


# background_knowledge = generate_background_knowledge(top_5_articles)
# print(background_knowledge)

## Comparing Retrieval Methods

# 4 RAG Implementation

## 4.1 Generate Context for LLM

In [18]:
def generate_background_knowledge(top_articles):
    # Ensure top_articles is a list of strings
    if not isinstance(top_articles, list):
        raise TypeError("top_articles must be a list")
    if not all(isinstance(article, str) for article in top_articles):
        raise TypeError("All elements in top_articles must be strings")
    
    # Combine the content of the top articles into a single string
    background_knowledge = "\n\n".join(top_articles)
    
    return background_knowledge

In [19]:
background_knowledge = generate_background_knowledge(top_5_articles)
print(background_knowledge)

Grab posts $525m in Q1 revenue, eyes profitability by Q4 Grab's revenue in Q1 2023 jumped 130% year on year to US$525 million, which the Singapore tech giant attributed to growth across all its segments. The company also reduced its adjusted losses for the quarter by 77% year on year, landing at US$66 million. "With five sequential quarters of adjusted EBITDA improvements, we remain on track on our path to profitability and to achieve group adjusted EBITDA breakeven in the fourth quarter of this year," said Anthony Tan, group CEO and co-founder of Grab. Deliveries were the star of the show for the company in Q1, contributing US$275 million in revenue. This represents a 217% leap over the same year-ago period. Meanwhile, segment gross merchandise value (GMV) dropped 4% year on year to US$2.3 billion. Grab said its growth in revenues comes mainly on the back of contributions from Jaya Grocer - which it acquired in 2021 - as well as a reduction in incentives. The company also said that Gr

## 4.2 Generate Response via OpenAI

In [20]:
# Function to generate responses using OpenAI
def generate_response(query, background_knowledge):
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Answer the question based on the question asked and background knowledge provided below, and if the question can't be answered based on the context, say \"I don't know\"\n\n"},
                {"role": "user", "content": f"Question: {query}\nBackground Knowledge: {background_knowledge}\nAnswer:"}
            ],
            max_tokens=100
        )
        return response.choices[0].message.content
    
    except Exception as e:
        print(f'Error generating response: {e}')
        return None

In [21]:
generate_response(query, background_knowledge)

"Grab's profitability has been improving, with the company posting a revenue of $525 million in Q1 2023 and reducing its adjusted losses by 77% compared to the previous year. The company aims to achieve group adjusted EBITDA breakeven in the fourth quarter of the same year. Grab has seen growth in various segments such as deliveries, mobility, and financial services, indicating a positive trajectory towards profitability."

## 4.3 Gradio Interface

In [22]:
# # Build the Gradio interface
# def rag_interface(query):
#     search_results = search_posts(query)
#     return generate_response(search_results, query)

# iface = gr.Interface(fn=rag_interface, inputs="text", outputs="text", title='Tech in Asia RAG System', description='Ask a question about Tech in Asia and get an answer based on the context of the latest posts.')
# iface.launch()