<a href="https://colab.research.google.com/github/wang1091/SmartEarningCall/blob/main/build_vecstore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install boto3
!pip install langchain-community
!pip install langchain-huggingface
import os
import boto3
import hashlib
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from get_config import load_config  # ✨ 直接引入 config

def calculate_md5(file_path):
    """Return the hexadecimal MD5 digest of a local file.

    The file is hashed in fixed-size chunks so that arbitrarily large files
    never have to be held in memory all at once.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The 32-character lowercase hex MD5 digest of the file's bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024  # 1 MiB per read keeps memory use bounded
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops as soon as read() returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

def list_local_files(local_folder):
    """Map each .txt file in a local folder to its MD5 checksum.

    Only the top level of the folder is scanned, and only regular files are
    hashed: a sub-directory whose name happens to end in ".txt" is skipped
    instead of crashing the checksum step.

    Args:
        local_folder: Directory to scan.

    Returns:
        A dict of ``{filename: md5_hex_digest}``. Empty when the folder does
        not exist yet or contains no .txt files.
    """
    local_files = {}
    if not os.path.exists(local_folder):
        return local_files  # Nothing has been downloaded yet
    for filename in os.listdir(local_folder):
        if not filename.endswith('.txt'):
            continue
        full_path = os.path.join(local_folder, filename)
        # Directories cannot be opened for hashing, so ignore them.
        if not os.path.isfile(full_path):
            continue
        local_files[filename] = calculate_md5(full_path)
    return local_files

def list_s3_files(bucket_name, prefix, aws_access_key, aws_secret_key):
    """Map each .txt object under an S3 prefix to its ETag.

    Objects are keyed by the base name of their S3 key, so two keys that share
    a base name collapse into one entry (the one listed last wins).

    NOTE(review): the ETag is used here as if it were the object's MD5.
    That may not hold for every object (e.g. multipart uploads) — confirm
    against the AWS S3 documentation.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix that restricts the listing.
        aws_access_key: AWS access key id passed to the boto3 client.
        aws_secret_key: AWS secret access key passed to the boto3 client.

    Returns:
        A dict of ``{filename: etag}`` with the surrounding double quotes
        stripped from each ETag.
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )
    pages = client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )

    etags_by_name = {}
    for page in pages:
        # 'Contents' is read with a default because a page may not carry it.
        for entry in page.get('Contents', []):
            object_key = entry['Key']
            # A key ending in '.txt' can never end in '/', so one test suffices.
            if not object_key.endswith('.txt'):
                continue
            etags_by_name[os.path.basename(object_key)] = entry['ETag'].strip('"')
    return etags_by_name

def download_s3_files(bucket_name, prefix, local_folder, aws_access_key, aws_secret_key):
    """Download every .txt object under an S3 prefix into a local folder.

    The local folder is created if needed. Each object is saved under the base
    name of its key, so any key hierarchy below the prefix is flattened and an
    existing local file of the same name is overwritten.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        prefix: Key prefix that restricts which objects are considered.
        local_folder: Destination directory for the downloaded files.
        aws_access_key: AWS access key id passed to the boto3 client.
        aws_secret_key: AWS secret access key passed to the boto3 client.
    """
    os.makedirs(local_folder, exist_ok=True)
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )
    pages = client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )

    for page in pages:
        for entry in page.get('Contents', []):
            object_key = entry['Key']
            # A key ending in '.txt' can never end in '/', so one test suffices.
            if not object_key.endswith('.txt'):
                continue
            filename = os.path.basename(object_key)
            client.download_file(
                bucket_name, object_key, os.path.join(local_folder, filename)
            )
            print(f"✅ Downloaded {filename}")

def build_vectorstore(local_folder, save_path):
    """Build a FAISS vectorstore from transcript files and save it to disk.

    Only files whose names end in ``transcript.txt`` are indexed. They are
    loaded as UTF-8, split on blank lines into chunks of up to 2000 characters
    (100 overlap), embedded with a sentence-transformers model, and saved via
    ``FAISS.save_local``.

    Files are processed in sorted name order so the build does not depend on
    the arbitrary order returned by ``os.listdir``.

    If there is nothing to index, a warning is printed and nothing is written.

    Args:
        local_folder: Directory containing the downloaded transcript files.
        save_path: Directory where the FAISS index is saved.

    Returns:
        None.
    """
    all_docs = []
    for filename in sorted(os.listdir(local_folder)):
        if filename.endswith('transcript.txt'):
            loader = TextLoader(os.path.join(local_folder, filename), encoding='utf-8')
            all_docs.extend(loader.load())

    # Bail out early: with no documents there is nothing to embed, and handing
    # an empty list to FAISS.from_documents fails with an unhelpful error.
    if not all_docs:
        print(f"⚠️ No transcript files found in {local_folder}. Vectorstore not built.")
        return

    splitter = CharacterTextSplitter(separator="\n\n", chunk_size=2000, chunk_overlap=100)
    chunks = splitter.split_documents(all_docs)
    if not chunks:
        # Transcripts existed but produced no text (e.g. all files empty).
        print(f"⚠️ Transcript files in {local_folder} contain no text. Vectorstore not built.")
        return

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(save_path)
    print(f"✅ Vectorstore built and saved to {save_path}")

if __name__ == "__main__":
    # Script entry point: sync transcripts from S3 and rebuild the FAISS index
    # when the local copy is out of date or the index is missing.

    # === Load Configuration ===
    config = load_config()

    bucket_name = config.get('bucket_name')
    prefix = config.get('prefix', 'earning_call_transcript/')
    local_folder = config.get('local_folder', 'data')
    save_path = config.get('save_path', 'faiss_index')
    aws_access_key = config.get('aws_access_key_id')
    aws_secret_key = config.get('aws_secret_access_key')

    # === Execution ===
    print("🔍 Checking for updates...")
    local_files = list_local_files(local_folder)
    s3_files = list_s3_files(bucket_name, prefix, aws_access_key, aws_secret_key)

    if local_files != s3_files:
        # Filenames or checksums differ: refresh the local copy, then re-index.
        print("Detected changes. Downloading files and rebuilding vectorstore...")
        download_s3_files(bucket_name, prefix, local_folder, aws_access_key, aws_secret_key)
        build_vectorstore(local_folder, save_path)
    elif not os.path.exists(save_path):
        # Files are in sync but no index exists (e.g. a previous run failed
        # after downloading); without this branch it would never be built.
        print("No file changes, but vectorstore is missing. Rebuilding vectorstore...")
        build_vectorstore(local_folder, save_path)
    else:
        print("No changes detected. Skipping download and build.")
