<a href="https://colab.research.google.com/github/tangYang7/GAI/blob/main/exercise/week8a.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

暫時沒有任何資料，可以用[政大學術性社團列表](https://raw.githubusercontent.com/yenlung/AI-Demo/refs/heads/master/data/nccu_academic_clubs_complete.txt)來練習。

### 1. 建立資料夾並下載練習資料

In [None]:
import os
import requests

# Download a plain-text source file into the upload folder so that the later
# cells can load it and index it.
url = "https://raw.githubusercontent.com/yenlung/AI-Demo/refs/heads/master/data/nccu_academic_clubs_complete.txt"  # Replace with the link to your own txt file

upload_dir = "uploaded_docs"
save_path = "政大社團列表.txt"  # Replace with the file name you want to save as
# Full destination of the downloaded file; built once so the path that is
# written and the path that is reported can never disagree.
target_path = os.path.join(upload_dir, save_path)

os.makedirs(upload_dir, exist_ok=True)


try:
    # Without a timeout requests waits forever on a stalled server, which
    # would leave the notebook cell hanging.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    with open(target_path, 'w', encoding='utf-8') as file:
        file.write(response.text)

    # Report the real location (folder + file name), not just the file name.
    print(f"檔案已成功下載並儲存到: {target_path}")

# RequestException must stay first: it is itself a subclass of IOError, so the
# IOError handler below would otherwise also catch download failures.
except requests.exceptions.RequestException as e:
    print(f"下載時發生錯誤: {e}")
except IOError as e:
    print(f"儲存檔案時發生錯誤: {e}")

### 2. 更新必要套件並引入

In [None]:
!pip install -U langchain langchain-community pypdf python-docx sentence-transformers faiss-cpu

In [None]:
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

### 3. 依 E5 建議加入前綴

自訂支援 E5 的 embedding 模型（加上 "passage:" / "query:" 前綴）

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

class CustomE5Embedding(HuggingFaceEmbeddings):
    """HuggingFaceEmbeddings variant that adds the prefixes used with E5 models.

    Document texts are prefixed with "passage: " and query texts with
    "query: " before being handed to the parent implementation.
    """

    def embed_documents(self, texts):
        """Prefix each text with "passage: " and embed them via the parent class.

        Args:
            texts: Iterable of document texts to embed.

        Returns:
            Whatever the parent ``embed_documents`` returns for the prefixed
            texts.
        """
        prefixed_passages = ["passage: {}".format(passage) for passage in texts]
        return super().embed_documents(prefixed_passages)

    def embed_query(self, text):
        """Prefix the query with "query: " and embed it via the parent class.

        Args:
            text: The query text to embed.

        Returns:
            Whatever the parent ``embed_query`` returns for the prefixed query.
        """
        prefixed_query = "query: {}".format(text)
        return super().embed_query(prefixed_query)

### 4. 載入文件

In [None]:
# Load every supported file (.txt, .pdf, .docx) found in the upload folder
# into a single list of documents; files with other extensions are skipped.
folder_path = upload_dir
documents = []
# os.listdir() returns entries in arbitrary order; sorting makes the document
# order (and therefore the resulting index) reproducible between runs.
for file in sorted(os.listdir(folder_path)):
    path = os.path.join(folder_path, file)
    if file.endswith(".txt"):
        # The download cell writes the text file as UTF-8, so read it back as
        # UTF-8 explicitly instead of relying on the platform default
        # encoding, which is not UTF-8 everywhere (e.g. Windows locales).
        loader = TextLoader(path, encoding="utf-8")
    elif file.endswith(".pdf"):
        loader = PyPDFLoader(path)
    elif file.endswith(".docx"):
        loader = UnstructuredWordDocumentLoader(path)
    else:
        # Unsupported file type: ignore it.
        continue
    documents.extend(loader.load())

### 5. 建立向量資料庫

In [None]:
# Split the loaded documents into overlapping chunks before embedding.
# chunk_size=500 / chunk_overlap=100 are presumably measured in characters
# (the splitter's default length measure) — confirm if a custom length
# function is ever supplied.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
split_docs = splitter.split_documents(documents)

In [None]:
# Create the E5-prefixing embedding wrapper for the multilingual-e5-small
# model, then build a FAISS vector store from the split documents with it.
embedding_model = CustomE5Embedding(model_name="intfloat/multilingual-e5-small")
vectorstore = FAISS.from_documents(split_docs, embedding_model)

### 6. 儲存向量資料庫

In [None]:
# Persist the vector store to the local "faiss_db" path so it can be archived
# and reused later.
vectorstore.save_local("faiss_db")

In [None]:
!zip -r faiss_db.zip faiss_db

In [None]:
# Tell the user the zipped vector store is ready and should be downloaded as a backup.
print("✅ 壓縮好的向量資料庫已儲存為 'faiss_db.zip'，請下載此檔案備份。")