# Setup

In [1]:
# System imports
import os
import re
from dotenv import load_dotenv

# Python libraries
import requests

# Gradio imports
import gradio as gr

# OpenAI imports
from openai import OpenAI

# BeautifulSoup imports
from bs4 import BeautifulSoup

# Spacy imports
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load variables from the .env file into the process environment
load_dotenv()

# Build the OpenAI client from the key stored in OPENAI_API_KEY
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# Tech in Asia API endpoints; both live under the same versioned base path
_TECHINASIA_API_BASE = 'https://www.techinasia.com/wp-json/techinasia/2.0'
list_posts_url = _TECHINASIA_API_BASE + '/posts'
search_posts_url = _TECHINASIA_API_BASE + '/articles?query='

# Fetch Posts from API

In [None]:
# Function to fetch posts from API
def fetch_posts_from_api(page=1, timeout=10):
    """Fetch one page of posts from the list endpoint (``list_posts_url``).

    Args:
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        The value stored under the ``'posts'`` key of the JSON response, or
        ``None`` when the request fails, the body is not valid JSON, or the
        payload has no ``'posts'`` entry. Failures are reported with
        ``print`` rather than raised.
    """
    headers = {
        # Browser-style User-Agent sent with every request
        # (presumably the API rejects the default client UA; unverified)
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # An explicit timeout keeps a stalled server from blocking the
        # notebook forever; timeouts surface as RequestException below.
        response = requests.get(
            list_posts_url,
            params={'page': page},
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
        print("Status Code:", response.status_code)

        return response.json()['posts']

    except requests.exceptions.RequestException as e:
        print(f'Error fetching posts: {e}')
    except ValueError as e:
        print(f'Error parsing response: {e}')
    except (KeyError, TypeError) as e:
        # Valid JSON, but not the expected {'posts': ...} shape
        print(f'Unexpected response structure: {e!r}')

    return None

In [None]:
# Fetch the first page of posts
page = 1
posts = fetch_posts_from_api(page=page)

# fetch_posts_from_api returns None on failure, so guard before calling len()
if posts is None:
    print("Failed to fetch posts.")
else:
    print(f"Fetched {len(posts)} posts.")

# # Display the first post to verify the structure
# if posts:
#     print(posts[0])

# Search Posts from API

In [None]:
# Function to search posts from API
def search_posts(query, timeout=10):
    """Search articles through the search endpoint (``search_posts_url``).

    Args:
        query: Search text sent as the ``query`` parameter.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        The decoded JSON response, or ``None`` when the request fails or the
        body is not valid JSON. Failures are reported with ``print`` rather
        than raised.
    """
    headers = {
        # Browser-style User-Agent sent with every request
        # (presumably the API rejects the default client UA; unverified)
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # search_posts_url may already end in a query string ('?query='). requests
    # appends `params` to an existing query string, which would send the
    # parameter twice ('?query=&query=...'), so keep only the path part here.
    endpoint = search_posts_url.split('?', 1)[0]

    try:
        # An explicit timeout keeps a stalled server from blocking the
        # notebook forever; timeouts surface as RequestException below.
        response = requests.get(
            endpoint,
            params={'query': query},
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
        print("Status Code:", response.status_code)

        return response.json()

    except requests.exceptions.RequestException as e:
        print(f'Error fetching posts: {e}')
    except ValueError as e:
        print(f'Error parsing response: {e}')

    return None

# Preprocessing

## Query Preprocessing

In [4]:
# Load the spaCy pipeline that preprocess_query uses to analyse queries
SPACY_MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL_NAME)

In [5]:
def preprocess_query(query):
    """Extract keyword lemmas from a free-text query.

    The query is run through the module-level spaCy pipeline ``nlp``. Stop
    words and punctuation are dropped; a remaining token is kept when it
    belongs to a named entity or is tagged as a noun, proper noun or
    adjective.

    Args:
        query: The user's question as plain text.

    Returns:
        A list of lemmas in token order. Duplicates are not removed.
    """
    keyword_tags = ('NOUN', 'PROPN', 'ADJ')

    def is_keyword(tok):
        # Filter out noise first, then apply the entity / part-of-speech test
        if tok.is_stop or tok.is_punct:
            return False
        return bool(tok.ent_type_) or tok.pos_ in keyword_tags

    return [tok.lemma_ for tok in nlp(query) if is_keyword(tok)]

In [6]:
# Run the keyword extractor on a sample question
user_query = "Tell me about Grab’s profitability and recent news."
keywords = preprocess_query(user_query)
print(f"Extracted Keywords: {keywords}")

Extracted Keywords: ['Grab', 'profitability', 'recent', 'news']


In [None]:
# Keyword passed to search_posts() in the next cell
query = "Apple"

In [None]:
# Fetch search results
search_results = search_posts(query)

# Build the raw article text from the first three hits.
# The hits are bound to `search_hits`, not `search_posts`: reusing that name
# would replace the search_posts() function with a list, and every later call
# to it would fail with "'list' object is not callable".
if search_results and 'posts' in search_results and 'hits' in search_results['posts']:
    search_hits = search_results['posts']['hits']

    # Articles with title and content
    raw_articles = "".join([post['title'] + post['content'] for post in search_hits[:3]])
else:
    raw_articles = 'No relevant posts found.'

## Retrieving The Most Relevant Articles

## Articles Preprocessing

In [None]:
def preprocess_text(text):
    """Strip markup and noise from article text.

    Steps, in order: remove HTML tags, remove URLs, replace any literal
    ``&amp;`` with ``&``, remove square-bracketed tags, then collapse
    whitespace and trim both ends.

    Args:
        text: Raw article text, possibly containing HTML.

    Returns:
        The cleaned text as a single line with single spaces between words.
    """
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Handle special characters
    text = re.sub(r'&amp;', '&', text)

    # Remove source and citation tags. DOTALL lets a tag span line breaks,
    # which have not been collapsed yet at this point.
    text = re.sub(r'\[.*?\]+', '', text, flags=re.DOTALL)

    # Normalise whitespace last, so the gaps left by the removals above are
    # collapsed too, and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Clean the concatenated articles and show the result
cleaned_articles = preprocess_text(raw_articles)
print(cleaned_articles)

# Compare the text length before and after cleaning
for _label, _content in (("raw", raw_articles), ("cleaned", cleaned_articles)):
    print(f"Length of {_label} content: {len(_content)}")

## Comparing Retrieval Methods

# Generate Response

In [None]:
# Function to process search results and generate background knowledge
def generate_context(search_results, max_articles=3):
    """Build the background-knowledge string from search results.

    Args:
        search_results: Decoded search response; the hits are read from
            ``search_results['posts']['hits']``. May be ``None``.
        max_articles: Maximum number of leading hits to include.

    Returns:
        The title and content of each selected hit concatenated in order, or
        ``'No relevant posts found.'`` when there are no usable hits.
    """
    fallback = 'No relevant posts found.'

    # Walk the nested structure defensively: a failed search yields None and
    # an unexpected payload may lack either level.
    hits = None
    if isinstance(search_results, dict):
        posts = search_results.get('posts')
        if isinstance(posts, dict):
            hits = posts.get('hits')

    # Covers both a missing 'hits' entry and an empty result list
    if not hits:
        return fallback

    # background knowledge with title and content; a missing or None field
    # contributes an empty string instead of raising
    pieces = [
        (post.get('title') or '') + (post.get('content') or '')
        for post in hits[:max_articles]
        if isinstance(post, dict)
    ]
    return "".join(pieces) or fallback

In [None]:
# Function to generate responses using OpenAI
def generate_response(search_results, query):
    """Answer ``query`` with the OpenAI chat API, grounded in search results.

    The background knowledge comes from ``generate_context(search_results)``
    and is sent with the question to the module-level ``client``.

    Args:
        search_results: Search response passed on to ``generate_context``.
        query: The user's question.

    Returns:
        The text of the first completion choice, or ``None`` if anything
        raised along the way (the error is printed).
    """
    system_prompt = "Answer the question based on the question asked and background knowledge provided below, and if the question can't be answered based on the context, say \"I don't know\"\n\n"

    try:
        context = generate_context(search_results)
        user_prompt = f"Question: {query}\nBackground Knowledge: {context}\nAnswer:"

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=100,
        )
        answer = completion.choices[0].message.content
    except Exception as err:
        print(f'Error generating response: {err}')
        answer = None

    return answer

In [None]:
# # Test the response generation
# query = 'How is Grab performing in the market?'

# search_results = search_posts(query)
# generate_response(search_results, query)

# Interface

In [None]:
# Build the Gradio interface
def rag_interface(query):
    """Gradio callback: search for ``query`` and answer it from the results.

    Args:
        query: The question typed into the interface.

    Returns:
        Whatever ``generate_response`` returns for the search results.
    """
    return generate_response(search_posts(query), query)

# Text-in / text-out UI wired to rag_interface
iface = gr.Interface(
    fn=rag_interface,
    inputs="text",
    outputs="text",
    title='Tech in Asia RAG System',
    description='Ask a question about Tech in Asia and get an answer based on the context of the latest posts.',
)
iface.launch()