# 1. Import Required Modules

In [39]:
# !pip install langchain_community
import os
import random

from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import concurrent.futures

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
import matplotlib.pyplot as plt
import openai

import re
from concurrent.futures import ThreadPoolExecutor

from collections import Counter

# 2. Crawl data

In [9]:
# Set a browser-like USER_AGENT environment variable before any page is fetched.
# NOTE(review): presumably read by the web loader used below — confirm.
os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

# Take the OpenAI key from the environment; this is None when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Characters removed as "emoji". Compiled once at import time instead of on
# every call.
# NOTE: the \U00010000-\U0010ffff range covers every character outside the
# Basic Multilingual Plane, and \u2600-\u2B55 also covers arrows, check marks
# and similar symbols, so this strips far more than emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def clean_text(text):
    """Remove emoji, decorative characters and redundant whitespace from text.

    Args:
        text: The raw text. Falsy values (``None``, ``""``) are returned
            unchanged.

    Returns:
        The cleaned text with leading/trailing whitespace stripped. Single and
        double newlines are preserved so paragraph structure survives; three
        or more consecutive newlines become exactly two.
    """
    if not text:
        return text

    # Remove emoji and symbol characters.
    text = _EMOJI_PATTERN.sub('', text)

    # Remove unwanted decoration characters.
    text = re.sub(r'█+', '', text)      # block characters
    text = re.sub(r'-{3,}', '', text)   # horizontal rules / table borders
    text = re.sub(r',{2,}', ',', text)  # repeated commas -> single comma
    text = re.sub(r'\|+', '', text)     # table pipes

    # Collapse runs of horizontal whitespace to one space; newlines are
    # deliberately left alone here.
    text = re.sub(r'[ \t\f\v\r]+', ' ', text)

    # Strip spaces at the start/end of every line. This must happen BEFORE the
    # newline collapse below: otherwise whitespace-only lines ("\n \n \n")
    # hide the run of newlines from the \n{3,} pattern and survive as 3+
    # consecutive newlines.
    text = re.sub(r' *\n *', '\n', text)

    # Normalize paragraph breaks: 3+ newlines -> exactly 2.
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()

def fetch_page(url: str, timeout: float = 10.0) -> str:
    """Download a page and return its HTML, or "" when it cannot be fetched.

    Args:
        url: The address to GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body for an HTTP 200 response; an empty string for any
        other status code or for a network-level failure.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Callers already treat "" as "nothing fetched", so report and degrade
        # instead of propagating a connection error or timeout.
        print(f"Error fetching {url}: {e}")
        return ""

    if response.status_code == 200:
        return response.text

    return ""


def extract_links_from_page(url: str, base_url="https://docs.ray.io/en/latest") -> list[str]:
    """Collect the documentation links listed in a page's sidebar navigation.

    Fetches ``url``, looks up the ``<ul class="nav bd-sidenav">`` element and
    turns every ``./``-relative ``href`` inside it into an absolute URL by
    replacing the leading ``.`` with ``base_url``.

    Returns:
        The unique absolute URLs in first-seen order; an empty list when the
        page could not be fetched or has no such sidebar element.
    """
    page_html = fetch_page(url)
    if page_html == "":
        return []

    sidebar = BeautifulSoup(page_html, "html.parser").find("ul", class_="nav bd-sidenav")
    if not sidebar:
        return []

    hrefs = (anchor["href"] for anchor in sidebar.find_all("a", href=True))
    absolute_urls = (base_url + href[1:] for href in hrefs if href.startswith("./"))
    # dict keys keep insertion order, so this de-duplicates without reordering.
    return list(dict.fromkeys(absolute_urls))


# def load_and_process_url(url: str) -> list[Document]:
#     try:
#         loader = WebBaseLoader(url)
#         documents = loader.load()

#         for doc in documents:
#             doc.metadata.update({"source": url, "source_type": "ray_documentation"})

#         return documents
#     except Exception as e:
#         print(f"Error processing {url}: {e}")
#         return []

def load_and_process_url_v1(url: str) -> list[Document]:
    """Load one page with WebBaseLoader, parsing only the part that is needed.

    A SoupStrainer restricts parsing to ``<article class="bd-article">``.
    Every loaded document's metadata is then updated with the source URL, a
    source type and the extraction method.

    Returns:
        The loaded documents, or an empty list if anything raised (the error
        is printed rather than propagated).
    """
    try:
        from bs4 import SoupStrainer

        article_only = SoupStrainer("article", class_="bd-article")
        loaded_docs = WebBaseLoader(url, bs_kwargs={"parse_only": article_only}).load()

        extra_metadata = {
            "source": url,
            "source_type": "ray_documentation",
            "extraction_method": "article_only",
        }
        for loaded in loaded_docs:
            loaded.metadata.update(extra_metadata)

        return loaded_docs
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return []
def clean_documents(documents):
    """Return cleaned copies of the given Document objects.

    Each document's ``page_content`` is passed through ``clean_text``.
    Documents whose cleaned content is empty or at most 50 characters long are
    dropped. The survivors are rebuilt as new objects of the same type with a
    copy of the original metadata, so the inputs are not mutated.
    """
    kept = []

    for source_doc in documents:
        content = clean_text(source_doc.page_content)

        # Skip documents with no meaningful content left after cleaning.
        if not content or len(content.strip()) <= 50:
            continue

        metadata = source_doc.metadata.copy() if hasattr(source_doc, 'metadata') else {}
        kept.append(type(source_doc)(page_content=content, metadata=metadata))

    return kept

def chunk_documents(
    documents: list[Document], chunk_size: int = 1000, chunk_overlap: int = 200
) -> list[Document]:
    """Clean the documents, then split them into overlapping chunks.

    Args:
        documents: Documents to clean (via ``clean_documents``) and split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, same unit.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(clean_documents(documents))


def _sanitize_filename(filename: str) -> str:
    """Turn a document source (typically a URL) into a safe file-name stem."""
    # For URLs, keep only the part after the last "latest/".
    if 'latest/' in filename:
        filename = filename.split('latest/')[-1]

    # Drop a trailing .html extension.
    if filename.endswith('.html'):
        filename = filename[:-5]

    # Flatten path separators, then replace characters that are invalid in
    # file names on common file systems.
    filename = filename.replace('/', '_')
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)

    # Trim surrounding whitespace and turn inner whitespace runs into "_".
    filename = re.sub(r'\s+', '_', filename.strip())

    # Cap the length of the name.
    return filename[:200]


def _save_documents(documents: list[Document], output_dir: str) -> None:
    """Write each document's page_content to its own UTF-8 .txt file in output_dir."""
    os.makedirs(output_dir, exist_ok=True)

    for i, doc in enumerate(documents):
        try:
            # Name the file after the document's source, if it has one.
            source = doc.metadata.get('source', f'document_{i}')
            filename = _sanitize_filename(source)
            if not filename.endswith('.txt'):
                filename += '.txt'

            filepath = os.path.join(output_dir, filename)

            # Never overwrite an existing file: append _1, _2, ... until the
            # path is free.
            stem, ext = os.path.splitext(filepath)
            counter = 1
            while os.path.exists(filepath):
                filepath = f"{stem}_{counter}{ext}"
                counter += 1

            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(doc.page_content)

            print(f"Đã lưu: {filepath}")

        except Exception as e:
            # Saving is best-effort: one bad document must not abort the run.
            print(f"Lỗi khi lưu document {i}: {e}")

    print(f"Đã hoàn thành! Tổng cộng {len(documents)} documents được lưu trong thư mục '{output_dir}'")


def process_documents(
    start_url: str = "https://docs.ray.io/en/latest/",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    max_urls: int = 700,
    min_length: int = 50,
    sample_size: int | None = None,
    output_dir: str = "/content/all_docs",
) -> list[Document]:
    """Full pipeline: collect links, load pages, save raw text, chunk, filter.

    Args:
        start_url: Page whose sidebar navigation supplies the links to crawl.
        chunk_size: Maximum chunk length passed to ``chunk_documents``.
        chunk_overlap: Overlap between consecutive chunks.
        max_urls: Upper bound on the number of links loaded; a falsy value
            disables the limit.
        min_length: Minimum number of whitespace-separated words a chunk must
            contain to be kept.
        sample_size: If set and smaller than the number of filtered chunks,
            return a random sample of that many chunks. Previously this
            argument was accepted but ignored.
        output_dir: Directory that receives one .txt file per loaded page.
            Defaults to the path that used to be hard-coded.

    Returns:
        The filtered (and optionally sampled) chunk documents.
    """
    print(f"Extracting links from {start_url}")
    doc_links = extract_links_from_page(start_url)

    if max_urls and len(doc_links) > max_urls:
        doc_links = doc_links[:max_urls]

    print(f"Found {len(doc_links)} links, start extracting docs & processing...")
    all_docs = []

    # Page loading is network-bound, so fan it out over a small thread pool.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {
            executor.submit(load_and_process_url_v1, url): url for url in doc_links
        }
        for future in tqdm(
            concurrent.futures.as_completed(future_to_url),
            total=len(doc_links),
            desc="Loading documents",
        ):
            url = future_to_url[future]
            try:
                all_docs.extend(future.result())
            except Exception as e:
                print(f"{url} generaated an exception: {e}")

    # Keep a raw-text copy of every loaded page on disk.
    _save_documents(all_docs, output_dir)

    print(f"Loaded {len(all_docs)} documents")

    # Chunking step
    print(
        f"Chunking documents with chunk_size={chunk_size}, overlap={chunk_overlap}..."
    )
    chunked_docs = chunk_documents(all_docs, chunk_size, chunk_overlap)

    print(f"Created {len(chunked_docs)} documents")

    # Filter by length (word count, not characters)
    filtered_docs = [
        doc for doc in chunked_docs if len(doc.page_content.split()) >= min_length
    ]

    print(f"Filtered to {len(filtered_docs)} chunks with at least {min_length} words")

    # Sample if requested; with the default sample_size=None every chunk is
    # returned, exactly as before.
    if sample_size and sample_size < len(filtered_docs):
        docs_to_process = random.sample(filtered_docs, sample_size)
        print(f"Sampled {sample_size} chunks for processing")
    else:
        docs_to_process = filtered_docs
    return docs_to_process


# if __name__ == "__main__":
#     documents = extract_links_from_page("https://docs.ray.io/en/latest")
#     print(f"Final result: {documents} processed documents")


In [10]:
# Run the whole pipeline (crawl, save, chunk, filter) with its default settings.
docs_to_process = process_documents()


Extracting links from https://docs.ray.io/en/latest/
Found 605 links, start extracting docs & processing...


Loading documents: 100%|██████████| 605/605 [03:23<00:00,  2.97it/s]


Đã lưu: /content/all_docs/ray-overview_index.txt
Đã lưu: /content/all_docs/ray-air_getting-started.txt
Đã lưu: /content/all_docs/ray-overview_use-cases.txt
Đã lưu: /content/all_docs/ray-overview_ray-libraries.txt
Đã lưu: /content/all_docs/ray-overview_examples.txt
Đã lưu: /content/all_docs/ray-overview_installation.txt
Đã lưu: /content/all_docs/ray-overview_getting-started.txt
Đã lưu: /content/all_docs/ray-core_walkthrough.txt
Đã lưu: /content/all_docs/ray-core_key-concepts.txt
Đã lưu: /content/all_docs/ray-core_tasks_nested-tasks.txt
Đã lưu: /content/all_docs/ray-core_tasks.txt
Đã lưu: /content/all_docs/ray-core_user-guide.txt
Đã lưu: /content/all_docs/ray-core_tasks_generators.txt
Đã lưu: /content/all_docs/ray-core_actors.txt
Đã lưu: /content/all_docs/ray-core_actors_terminating-actors.txt
Đã lưu: /content/all_docs/ray-core_actors_async_api.txt
Đã lưu: /content/all_docs/ray-core_actors_named-actors.txt
Đã lưu: /content/all_docs/ray-core_actors_concurrency_group_api.txt
Đã lưu: /conte

In [11]:
# Number of chunks returned by the pipeline.
len(docs_to_process)

4670

In [12]:


# Once docs_to_process is available, build one row per chunk.
# global_chunk_id is the chunk's 0-based position in docs_to_process.
data = [
    {'global_chunk_id': i, 'text': doc.page_content}
    for i, doc in enumerate(docs_to_process)
]

# Build the DataFrame
df = pd.DataFrame(data)

# Save as CSV. The path lives in one variable so the log message below always
# names the file that was actually written (it used to say 'chunks_data.csv').
output_csv = 'chunks_data_cleaned.csv'
df.to_csv(output_csv, index=False, encoding='utf-8')

print(f"Đã lưu {len(docs_to_process)} chunks vào file '{output_csv}'")
print("Cấu trúc dữ liệu:")
print(df.head())

Đã lưu 4670 chunks vào file 'chunks_data.csv'
Cấu trúc dữ liệu:
   global_chunk_id                                               text
0                0  Overview#\nRay is an open-source unified frame...
1                1  For data scientists and machine learning pract...
2                2  For distributed systems engineers, Ray automat...
3                3  Ray framework#\n\nStack of Ray libraries - uni...
4                4  Ray AI Libraries\n\nBuild distributed applicat...


## 2.1 Explore and clean the data

In [40]:
# Reload the chunk table from the CSV file.
df= pd.read_csv(r'chunks_data_cleaned.csv')

In [41]:
# Display the DataFrame.
df

Unnamed: 0,global_chunk_id,text
0,0,Overview#\nRay is an open-source unified frame...
1,1,For data scientists and machine learning pract...
2,2,"For distributed systems engineers, Ray automat..."
3,3,Ray framework#\n\nStack of Ray libraries - uni...
4,4,Ray AI Libraries\n\nBuild distributed applicat...
...,...,...
4665,4665,Synchronous sampling#Sampling workers work in ...
4666,4666,Trainable#A Trainable is the interface that Ra...
4667,4667,Trainer#A Trainer is the top-level API to conf...
4668,4668,Training iteration#A partial training pass of ...


In [42]:
df.info()  # Overview: index range, columns, non-null counts, dtypes


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4670 entries, 0 to 4669
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   global_chunk_id  4670 non-null   int64 
 1   text             4670 non-null   object
dtypes: int64(1), object(1)
memory usage: 73.1+ KB


In [43]:
 # 5. Analyze the text (object-dtype) columns
text_cols = df.select_dtypes(include=['object']).columns

# For every object-dtype column: print length statistics and the five most
# frequent values. An empty column list simply prints nothing.
for col in text_cols:
    print(f"\nCột '{col}':")

    # Character-length statistics of the values rendered as strings.
    lengths = df[col].astype(str).str.len()
    shortest, longest, average = lengths.min(), lengths.max(), lengths.mean()
    print(f"  Độ dài: min={shortest}, max={longest}, mean={average:.1f}")

    # Most frequent values, each shortened to 50 characters for display.
    print(f"  Top 5 giá trị phổ biến:")
    for val, count in df[col].value_counts().head(5).items():
        shown = str(val)
        if len(shown) > 50:
            shown = shown[:50] + "..."
        print(f"    '{shown}' (xuất hiện {count} lần)")




Cột 'text':
  Độ dài: min=230, max=1000, mean=827.3
  Top 5 giá trị phổ biến:
    '# Tell the autoscaler the allowed node types and t...' (xuất hiện 3 lần)
    '# For more documentation on available fields, see
...' (xuất hiện 3 lần)
    'Note that the self.METADATA_FILE_NAME file is not ...' (xuất hiện 3 lần)
    'The main logic is to loop through all subcomponent...' (xuất hiện 3 lần)
    'Parameters:

path – The path to the directory to s...' (xuất hiện 3 lần)


In [44]:
# Count rows whose text repeats an earlier row (first occurrences are not counted).
total_duplicates = df['text'].duplicated().sum()
total_duplicates

np.int64(58)

In [45]:
# Same check for the id column: count ids that repeat an earlier row's id.
total_duplicates = df['global_chunk_id'].duplicated().sum()
total_duplicates

np.int64(0)

In [46]:
# Print the whole DataFrame as text for manual inspection.
print(df.to_string())

      global_chunk_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [47]:
# Remove rows with duplicate text in place, keeping the first occurrence of each.
df.drop_duplicates(subset='text', keep='first', inplace=True)

In [48]:
# Re-count duplicate texts to confirm the drop worked.
total_duplicates = df['text'].duplicated().sum()
total_duplicates

np.int64(0)

In [49]:
text_columns = df.select_dtypes(include=['object']).columns

# Regexes for leftover artifacts to look for in the text columns.
patterns = {
    'Block characters (█)': r'█+',
    'Multiple pipes (|||)': r'\|{2,}',
    'Multiple commas (,,,)': r',{3,}',
    'Multiple dashes (---)': r'-{5,}',
    'Emoji': r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]',
    'Special symbols': r'[💪✨🤗🎉]',
}

# Running total of matching rows, summed over all patterns and columns
# (a row matching two patterns is counted twice).
total_issues = 0

for col in text_columns:
    print(f"\nCột '{col}':")
    col_values = df[col].astype(str)
    col_issues = 0

    for pattern_name, pattern in patterns.items():
        count = col_values.str.contains(pattern, na=False, regex=True).sum()
        if count <= 0:
            continue

        # Report how many rows match and what share of the table that is.
        share = count / len(df) * 100
        print(f"  {pattern_name}: {count} hàng ({share:.1f}%)")
        col_issues += count

    total_issues += col_issues

    if not col_issues:
        print(f"  ✅ Không phát hiện vấn đề")

print(f"\nTổng cộng: {total_issues} vấn đề được phát hiện")



Cột 'text':
  ✅ Không phát hiện vấn đề

Tổng cộng: 0 vấn đề được phát hiện


In [52]:
df.head()  # first 5 rows


Unnamed: 0,global_chunk_id,text
0,0,Overview#\nRay is an open-source unified frame...
1,1,For data scientists and machine learning pract...
2,2,"For distributed systems engineers, Ray automat..."
3,3,Ray framework#\n\nStack of Ray libraries - uni...
4,4,Ray AI Libraries\n\nBuild distributed applicat...


In [53]:
df.describe()  # descriptive statistics of the numeric columns


Unnamed: 0,global_chunk_id
count,4612.0
mean,2321.452082
std,1350.875982
min,0.0
25%,1153.75
50%,2306.5
75%,3509.25
max,4669.0


In [59]:
# Rebuild a contiguous index in place; the old index is discarded.
df.reset_index(drop=True, inplace=True)
# Renumber the chunk ids from the new index so they are contiguous again.
df['global_chunk_id'] = df.index + 1  # ids start at 1

df

Unnamed: 0,global_chunk_id,text
0,1,Overview#\nRay is an open-source unified frame...
1,2,For data scientists and machine learning pract...
2,3,"For distributed systems engineers, Ray automat..."
3,4,Ray framework#\n\nStack of Ray libraries - uni...
4,5,Ray AI Libraries\n\nBuild distributed applicat...
...,...,...
4607,4608,Synchronous sampling#Sampling workers work in ...
4608,4609,Trainable#A Trainable is the interface that Ra...
4609,4610,Trainer#A Trainer is the top-level API to conf...
4610,4611,Training iteration#A partial training pass of ...


In [60]:
# Write the current DataFrame to a new CSV file, without the index column.
df.to_csv('chunks_data_more_cleaned.csv', index=False, encoding='utf-8')


In [55]:
# Load a single page so the raw loader output can be inspected.
document1= load_and_process_url_v1("https://docs.ray.io/en/latest/ray-overview/installation.html")

In [56]:
# Display the loaded document(s), including metadata and raw page_content.
document1

[Document(metadata={'source': 'https://docs.ray.io/en/latest/ray-overview/installation.html', 'source_type': 'ray_documentation', 'extraction_method': 'article_only'}, page_content='\n\nInstalling Ray#\n\n\n\nRay currently officially supports x86_64, aarch64 (ARM) for Linux, and Apple silicon (M1) hardware.\nRay on Windows is currently in beta.\n\nOfficial Releases#\n\nFrom Wheels#\nYou can install the latest official version of Ray from PyPI on Linux, Windows,\nand macOS by choosing the option that best matches your use case.\n\n\n\nRecommended\nFor machine learning applications\npip install -U "ray[data,train,tune,serve]"\n\n# For reinforcement learning support, install RLlib instead.\n# pip install -U "ray[rllib]"\n\n\nFor general Python applications\npip install -U "ray[default]"\n\n# If you don\'t want Ray Dashboard or Cluster Launcher, install Ray with minimal dependencies instead.\n# pip install -U "ray"\n\n\n\n\n\nAdvanced\n\n\n\n\n\n\nCommand\nInstalled components\n\n\n\npip i