In [18]:
#from google.colab import files
#files.upload()

In [6]:
!pip install -U -q PyDrive

In [8]:
import requests
from bs4 import BeautifulSoup
import re
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import files # Import files for uploading

# Function to extract text from a URL
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Remove scripts and styles
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text(separator=" ")  # Get plain text
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return text

# TfL Website URL
tfl_url = "https://tfl.gov.uk"
tfl_text = extract_text_from_url(tfl_url)

# Save text to file
file_path = "/content/drive/MyDrive/APPLICATION/TfL_Traffic_Data/tfl_raw_text.txt"
with open(file_path, "w", encoding="utf-8") as file:
    file.write(tfl_text)

print(f"✅ Text extracted and saved as {file_path}")



✅ Text extracted and saved as /content/drive/MyDrive/APPLICATION/TfL_Traffic_Data/tfl_raw_text.txt


## Data Preprocessing

In [9]:
import os
import re
import json
import nltk
from nltk.tokenize import sent_tokenize
from langchain.text_splitter import RecursiveCharacterTextSplitter
from google.colab import drive  # Use only if running in Google Colab

# Define file paths
drive_path = "/content/drive/MyDrive/APPLICATION/TfL_Traffic_Data/tfl_raw_text.txt"
output_path = "/content/drive/MyDrive/APPLICATION/TfL_Traffic_Data/tfl_cleaned_chunks.json"

# Step 2: Load raw TfL text data
with open(drive_path, "r", encoding="utf-8") as file:
    raw_text = file.read()




In [10]:
drive_path

'/content/drive/MyDrive/APPLICATION/TfL_Traffic_Data/tfl_raw_text.txt'

In [11]:
raw_text

"Keeping London moving - Transport for London Skip to navigation Skip to content Skip to footer Transport for London Search the site Fares Help & contacts Maps Plan a journey Status updates Travel information Improvements & projects Safety Stations, stops & piers Timetables Transport accessibility Visiting London Ways to get around Buses Coaches Cycling Dial-a-Ride DLR Driving Elizabeth line IFS Cloud Cable Car London Overground River Taxis & minicabs Trams Tube Walking Quick links Contactless and Oyster account Refunds and replacements Pay to drive in London Travel tools Plan a journey Journey preferences From location To location Leaving Arriving Date of departure Today Tomorrow Tue 04 Feb Wed 05 Feb Thu 06 Feb Fri 07 Feb Sat 08 Feb Sun 09 Feb Mon 10 Feb Tue 11 Feb Wed 12 Feb Thu 13 Feb Fri 14 Feb Sat 15 Feb Sun 16 Feb Mon 17 Feb Tue 18 Feb Wed 19 Feb Thu 20 Feb Fri 21 Feb Sat 22 Feb Sun 23 Feb Mon 24 Feb Tue 25 Feb Wed 26 Feb Thu 27 Feb Fri 28 Feb Sat 01 Mar Sun 02 Mar Mon 03 Mar Ti

In [15]:
# Step 3: Clean the text
import nltk
nltk.download('punkt')  # Download tokenizer if not already available
nltk.download('punkt_tab') # Download punkt_tab for sentence tokenization

clean_text = re.sub(r"http[s]?://\S+", "", raw_text)  # Remove links
clean_text = re.sub(r"\s+", " ", clean_text)  # Normalize whitespace
clean_text = re.sub(r"[^\w\s.,!?-]", "", clean_text)  # Remove unwanted symbols

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [16]:
# Step 4: Tokenize into sentences
sentences = sent_tokenize(clean_text)
# Step 5: Chunking into retrievable sections (512-1024 tokens per chunk)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
chunks = text_splitter.split_text("\n".join(sentences))




In [17]:
# Step 6: Save processed data as JSON for embedding
with open(output_path, "w", encoding="utf-8") as outfile:
    json.dump(chunks, outfile, indent=2)

print(f"✅ Preprocessing complete! {len(chunks)} chunks saved to Google Drive at: {output_path}")

✅ Preprocessing complete! 23 chunks saved to Google Drive at: /content/drive/MyDrive/APPLICATION/TfL_Traffic_Data/tfl_cleaned_chunks.json
