### Check a random url

In [23]:
import json
import random
import os

# List of URLs to check
check_urls = [
    "https://www.jewelchangiairport.com/en/tourist-perks-and-promotions.html",
    # Add more URLs as needed
]

# Path to scraped data
data_file = "scraped_data/changi_english_data.json"

# Read scraped data
if not os.path.exists(data_file):
    print(f"Error: {data_file} not found. Run the Scrapy spider first.")
    exit(1)

with open(data_file, "r", encoding="utf-8") as f:
    scraped_data = json.load(f)

# Get URLs from scraped data
scraped_urls = [page["url"] for page in scraped_data]

# Randomly select a URL to check
random_url = random.choice(check_urls)

# Check if random URL is in scraped data
if random_url in scraped_urls:
    print(f"Random URL {random_url} was FOUND in the scraped data.")
else:
    print(f"Random URL {random_url} was NOT found in the scraped data.")

Random URL https://www.jewelchangiairport.com/en/tourist-perks-and-promotions.html was FOUND in the scraped data.


## Data Preprocessing

### Filter out non English text

In [17]:
import json
import os
from langdetect import detect, DetectorFactory
import langdetect

# Ensure consistent language detection
DetectorFactory.seed = 0

# Input and output files
input_file = "scraped_data/changi_all_data.json"
output_file = "scraped_data/changi_english_data.json"

# Check if input file exists
if not os.path.exists(input_file):
    print(f"Error: {input_file} not found. Run the Scrapy spider first.")
    exit(1)

# Read scraped data
with open(input_file, "r", encoding="utf-8") as f:
    scraped_data = json.load(f)

# Filter English content
english_data = []
for page in scraped_data:
    url = page["url"]
    content = page["content"]
    english_content = []
    
    for text in content:
        try:
            # Detect language; keep only English text
            if detect(text) == "en":
                english_content.append(text)
        except langdetect.lang_detect_exception.LangDetectException:
            # Skip text too short or undetectable
            continue
    
    if english_content:  # Only include pages with English content
        english_data.append({"url": url, "content": english_content})

# Save English-only data
os.makedirs("scraped_data", exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(english_data, f, indent=2)
print(f"English content saved to {output_file} ({len(english_data)} pages)")

English content saved to scraped_data/changi_english_data.json (588 pages)


### Considering numbers

In [22]:
import json
import os
import re
from langdetect import detect, DetectorFactory
import langdetect

# Ensure consistent language detection
DetectorFactory.seed = 0

# Input and output files
input_file = "scraped_data/changi_all_data.json"
output_file = "scraped_data/changi_english_and_numbers_data.json"

# Check if input file exists
if not os.path.exists(input_file):
    print(f"Error: {input_file} not found. Run the Scrapy spider first.")
    exit(1)

# Read scraped data
with open(input_file, "r", encoding="utf-8") as f:
    scraped_data = json.load(f)

# Function to check if text is primarily numeric
def is_numeric(text):
    # Remove whitespace and check if text contains mostly numbers
    cleaned_text = text.strip()
    # Matches numbers, decimals, or numbers with common symbols (e.g., $12.99, 12:30)
    return bool(re.match(r'^[\d\s.,:;$-]+$', cleaned_text))

# Filter English and numeric content
english_data = []
for page in scraped_data:
    url = page["url"]
    content = page["content"]
    filtered_content = []
    
    for text in content:
        try:
            # Keep numeric text or English text
            if is_numeric(text) or detect(text) == "en":
                filtered_content.append(text)
        except langdetect.lang_detect_exception.LangDetectException:
            # Keep numeric text even if language detection fails
            if is_numeric(text):
                filtered_content.append(text)
    
    if filtered_content:  # Only include pages with filtered content
        english_data.append({"url": url, "content": filtered_content})

# Save filtered data
os.makedirs("scraped_data", exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(english_data, f, indent=2)
print(f"English and numeric content saved to {output_file} ({len(english_data)} pages)")

English and numeric content saved to scraped_data/changi_english_and_numbers_data.json (594 pages)


### Data Cleaning

In [1]:
import json
import os
import re

# Input and output files
input_file = "scraped_data/changi_english_data_2.json"
output_file = "scraped_data/changi_cleaned_data.json"

# Check if input file exists
if not os.path.exists(input_file):
    print(f"Error: {input_file} not found. Run the filtering script first.")
    exit(1)

# Read scraped data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Function to clean and normalize text
def clean_text(text):
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text.strip())
    # Remove special characters, keep alphanumeric, numbers, and common punctuation
    text = re.sub(r'[^\w\s.,:;$-]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Clean data
cleaned_data = []
for page in data:
    url = page["url"]
    content = page["content"]
    cleaned_content = []
    
    for text in content:
        # Clean and normalize text
        cleaned = clean_text(text)
        # Filter out short or irrelevant text (e.g., < 3 characters or single words)
        if len(cleaned) >= 3 and len(cleaned.split()) > 1:
            cleaned_content.append(cleaned)
    
    if cleaned_content:  # Only include pages with cleaned content
        cleaned_data.append({"url": url, "content": cleaned_content})

# Save cleaned data
os.makedirs("scraped_data", exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, indent=2)
print(f"Cleaned data saved to {output_file} ({len(cleaned_data)} pages)")

Cleaned data saved to scraped_data/changi_cleaned_data.json (579 pages)


### Chunking and Adding metadata

In [9]:
import os
import json
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

from dotenv import load_dotenv

In [42]:
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("your-pinecone-api-key")
PINECONE_ENV = "us-west1-gcp"
INDEX_NAME = "changiindex"  # change as needed

In [30]:
with open("scraped_data/changi_cleaned_data.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

In [31]:
# Combine content into one string per document
documents = []
for entry in raw_data:
    url = entry["url"]
    content = entry.get("content", [])
    text = " ".join(content).strip()
    if text:
        documents.append(Document(page_content=text, metadata={"source": url}))


In [32]:
# ✂️ Chunking the Documents
# ---------------------
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
)
split_docs = text_splitter.split_documents(documents)

print(f"✅ Loaded and split {len(split_docs)} chunks.")


✅ Loaded and split 3324 chunks.


In [40]:
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings

# Load MiniLM
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)


  embedding_model = HuggingFaceEmbeddings(model_name=model_name)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [34]:
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("changi")

In [43]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings


vectorstore = PineconeVectorStore.from_documents(
    documents=split_docs,
    embedding=embedding_model,
    index_name=INDEX_NAME,
    pinecone_api_key=PINECONE_API_KEY,
)

print("✅ Successfully indexed using Pinecone v3 + langchain-pinecone.")



✅ Successfully indexed using Pinecone v3 + langchain-pinecone.


In [44]:
index.describe_index_stats

<bound method Index.describe_index_stats of <pinecone.db_data.index.Index object at 0x0000016DA2DBA8A0>>

In [56]:
# Sample query
sample_vector = embedding_model.embed_query("changai airport holiday plans")
result = vectorstore.similarity_search(query="changai airport holiday plans",k=3)
print(result)


  return forward_call(*args, **kwargs)


[Document(id='12cd8641-2e1b-4af5-8af9-e8dd2279d39c', metadata={'source': 'https://www.changiairport.com/en/at-changi/facilities-and-services-directory/passenger-meeting-services.html'}, page_content='and departures, ensuring a memorable beginning and end to your journey in singapore. upon arrival, passengers can expect a personalised greeting at the gate, luggage and immigration assistance for a hassle-free process, and coordination of onward transportation as needed. for departures, our services include guidance through check-in and security, and escort to the departure gate. these passenger meeting services are designed to cater to the needs of all travellers, whether youre visiting for business or leisure, ensuring a smooth and enjoyable experience at changi airport. to request our meet and greet services, passengers or their representatives can easily make a booking through our official changi airport website or the changi airport mobile app. when booking, please fill in the'), Doc

### Rag pipeline

In [58]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Init embedding and vectorstore
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = PineconeVectorStore.from_existing_index(
    index_name="changiindex",
    embedding=embedding_model,
    
)

# Create the retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Init OpenAI Chat Model
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",  # or "gpt-4"
    temperature=0.2
)

# Build RetrievalQA chain (RAG pipeline)
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

# Run chatbot loop
while True:
    query = input("\n💬 You: ")
    if query.lower() in ["exit", "quit"]:
        break

    result = rag_chain(query)
    print(f"\n🤖 Bot: {result['result']}")
    
    # (Optional) Show sources
    print("\n📄 Sources:")
    for doc in result["source_documents"]:
        print("-", doc.metadata["source"])


  result = rag_chain(query)
  return forward_call(*args, **kwargs)



🤖 Bot: I'm sorry, I don't have enough information to understand your question. Can you please provide more context or clarify what you would like to know?

📄 Sources:
- https://www.changiairport.com/zh.html
- https://www.changiairport.com/zh/cookie-policy.html
- https://www.changiairport.com/en/corporate/partnering-us/advertising-and-sponsorship.html


  return forward_call(*args, **kwargs)



🤖 Bot: I don't have information on specific holiday plans offered by Changi Airport.

📄 Sources:
- https://www.changiairport.com/zh/help/changi-app/changi-pay/overseas-payment.html
- https://www.changiairport.com/en/corporate/partnering-us/airport-concessions/airport-retail.html
- https://www.changiairport.com/en/fly/transit-guide.html


  return forward_call(*args, **kwargs)



🤖 Bot: Changi Airport and Jewel offer a wide range of dining options, including home-grown eateries and international cuisines. You can easily make reservations through the Changi app for a comfortable dining experience. Additionally, there are deals available for discounts of up to 50% at selected restaurants. You can explore the dining options by scanning the QR code on the Changi app or by signing up for a Changi account to receive the latest updates.

📄 Sources:
- https://www.changiairport.com/en/help/changi-app/dine.html
- https://www.changiairport.com/en/fly/transit-guide.html
- https://www.jewelchangiairport.com/en/venue-hire.html


  return forward_call(*args, **kwargs)



🤖 Bot: Changi Airport offers a variety of entertainment options, including free movies, live performances, sports events, and more. Additionally, visitors can enjoy interactive games, immersive shows, and amazing projection experiences in the virtual world of Changi Airport. There are also 20 different interactive touchpoints with various content to explore and enjoy.

📄 Sources:
- https://www.changiairport.com/en/happenings/events-directory/sg60-outdoor-light-up.html?category=play-attractions
- https://www.changiairport.com/en/happenings/events-directory/sg60-outdoor-light-up.html
- https://www.jewelchangiairport.com/en/attractions/ces.html


  return forward_call(*args, **kwargs)



🤖 Bot: I'm sorry, I don't have enough context to understand your question. How can I assist you today?

📄 Sources:
- https://www.changiairport.com/zh.html
- https://www.changiairport.com/zh/cookie-policy.html
- https://www.changiairport.com/en/corporate/partnering-us/advertising-and-sponsorship.html
