## Modules to use

In [1]:
from utils.Config import ChatConfig
from utils.aws_client import (
    AWSBedrockEembedding, 
    AWSS3Bucket
)
from src.backend import ChatBot
from src.SyncDBHandler import SyncS3DBHandler

## Experiment grid: chunk sizes, chunk overlaps, and evaluation questions


In [3]:
# Experiment grid: the later loop evaluates every (chunk_size, chunk_overlap) pair.
chunk_sizes = [100,200,300,400,500]
chunk_overlaps = [0,20,40,60,80,100]

import csv

# Index of the CSV column holding the question text (0 for the first column,
# 1 for the second, etc.).
column_index = 0  # Change this to the desired column

# newline='' hands line-ending handling to the csv module, as its documentation
# requires; without it, quoted fields containing embedded newlines can be
# split incorrectly.
with open('./RAG_eval_data_60.csv', 'r', encoding='utf-8', newline='') as file:
    csv_reader = csv.reader(file)

    # Skip the first row, which is assumed to be the header
    # (the default of None makes this a no-op on an empty file).
    next(csv_reader, None)

    # Collect the question column. Rows that are too short to contain it
    # (csv.reader yields [] for blank lines) are skipped instead of raising
    # IndexError.
    questions = [row[column_index] for row in csv_reader if len(row) > column_index]

print(questions)

['管理審查會議的召開頻率是多久一次?', '風險值的計算方式是什麼?', '資訊資產價值的評估標準有哪些?', '風險事件發生可能性的評分構面有哪些?', '超過可接受風險值的風險應該如何處理?', '資料與文件之儲存有哪些規定?', '資料傳遞時應遵循哪些規定?', '帳號申請應遵循哪些原則?', '密碼管理應遵循哪些原則?', '資料交換時需要進行哪些安全控管?', '資料交換的申請流程是什麼?', '資料交換發生安全事件時應該如何處理?', '資料交換有哪些禁止的方式?', '什麼是管制區域?', '資訊設備進出管制區域有哪些管理措施?', '什麼是弱點?', '什麼是可攜式儲存媒體?', '每季應執行什麼測試來確認備份資料的可用性?', '儲存媒體的存放環境應如何?', '報廢的儲存媒體應如何處理?', '什麼是「非軍事區網段」?', '「網路連線管理」措施是甚麼?', '資訊安全事件等級有哪兩種?', '發現資訊安全事件時,資訊技術處窗口應該先做哪些事情?', '資訊安全事件分類有幾種?分別是什麼?', '資訊安全事件發生後,資訊技術處窗口需要提供哪三項東西給事件處理單位?', '資訊安全事件處理原則中,事件處理單位需要做哪兩件事情?', '何謂營運衝擊分析?', '最大可容忍中斷時間的定義為何?', '資料復原時間目標的定義為何?', '資訊作業營運持續計畫應包含哪些內容?', '誰負責制定有效性量測指標?', '如何處理有效性量測值未達目標值的情況?', '有效性量測指標的定義是什麼?', '資訊安全自行查核小組之權責為何?', '資訊安全自行查核時機有哪些?', '查核工具之保護措施為何?', '資通系統之使用者帳號及密碼之管理措施,應依據哪個辦法進行?', '資通系統,指的是什麼?', '資通安全事件,是指什麼?', '資通系統開發完成後,在正式上線前應執行什麼動作?', '資訊安全執行小組的主要職責是什麼?', '台灣營運長室轄下單位與外包商簽訂的契約中應包含哪些內容?', '台灣營運長室轄下單位應遵循哪些資訊安全標準?', 'Tableau 帳號申請的流程是什麼?', 'Tableau 帳號密碼過期的週期是?', 'Tableau 使用者異動時,原主管或 PL 須在幾個日曆天內主動修改 Tableau 密碼?', '何謂 Oncall?', 

## Checkpoint record

In [4]:
import json
from time import strftime
import os
def update_checkpoint(checkpoint_file, vectorstore):
    """Append a "<vectorstore>:<timestamp>" entry to a JSON checkpoint file.

    The checkpoint file holds a JSON list of strings. If the file does not
    exist, or exists but is empty, the list starts out empty. The updated list
    is written to a sibling "<checkpoint_file>.tmp" file and then moved over
    the original with os.replace, so an interrupted write cannot leave a
    truncated checkpoint behind.

    Args:
        checkpoint_file: Path of the JSON checkpoint file.
        vectorstore: Identifier of the finished vector store; it is formatted
            into the entry with an f-string.

    Returns:
        None. This is best-effort: any failure is printed, not raised, so a
        long-running experiment loop is not aborted by a bookkeeping error.
    """
    timestamp = strftime("%Y-%m-%d %H:%M:%S")
    completed_task = f"{vectorstore}:{timestamp}"
    try:
        completed_tasks = []
        if os.path.exists(checkpoint_file):
            with open(checkpoint_file, 'r', encoding='utf-8') as file:
                raw = file.read()
            # An empty file (e.g. left behind by an interrupted write) simply
            # means that no task has been recorded yet.
            if raw.strip():
                completed_tasks = json.loads(raw)
            if not isinstance(completed_tasks, list):
                raise ValueError(
                    f"expected a JSON list in {checkpoint_file}, "
                    f"got {type(completed_tasks).__name__}"
                )

        completed_tasks.append(completed_task)

        # Write to a temporary file first and swap it in, so the existing
        # checkpoint stays intact until the new content is fully written.
        tmp_file = f"{checkpoint_file}.tmp"
        with open(tmp_file, 'w', encoding='utf-8') as file:
            json.dump(completed_tasks, file)
        os.replace(tmp_file, checkpoint_file)
    except Exception as e:
        print(f"Error updating checkpoint: {e}")

## Auto-generate retriever results and upload them to S3
```
Here we use the _get_retriever() method to obtain the retriever and call its get_relevant_documents()
```

In [7]:
import itertools
import gc
from time import time
import csv
import shutil
# Run a garbage-collection pass before the experiment loop starts.
gc.collect()
checkpoint_file = 'processing_checkpoint.json'
# Directory for the per-experiment CSV files; the same relative path is also
# passed as the second argument of s3_bucket.upload_file below.
output_directory = 'RAG_retriever_eval_60/'
aws_config = ChatConfig()
s3_bucket = AWSS3Bucket(aws_config).connect_to_cloud_storage()
embeddings = AWSBedrockEembedding(aws_config).get_embedding_model()
db_handler = SyncS3DBHandler(aws_s3_bucket=s3_bucket)

# Loop-invariant: create the output directory once instead of on every iteration.
os.makedirs(output_directory, exist_ok=True)

start = time()
for chunk_size, chunk_overlap in itertools.product(chunk_sizes, chunk_overlaps):
    csv_file_path = os.path.join(output_directory, f'experiment_vectorstores_{chunk_size}_{chunk_overlap}.csv')

    # One vector store per (chunk_size, chunk_overlap) combination.
    # NOTE(review): retriever_topk=15 is presumably the number of documents
    # returned per question - confirm against ChatConfig.
    config = ChatConfig(
        vector_db_path=f'./Retrieval_eval_vectorstores/experiment_vectorstores_{chunk_size}_{chunk_overlap}',
        prompt_db_path='./prompt',
        prompt_name="rag_v1.md",
        retriever_topk=15,
    )
    retriever = ChatBot(
        config=config,
        DBHandler=db_handler,
        Embeddings=embeddings,
    )._get_retriever()

    # Header row followed by one row per question. 'contexts' is the list of
    # retrieved page contents (csv.writer stores it via str()), 'context' is
    # the same texts concatenated without a separator.
    csv_data = [['question','contexts','context']]
    for question in questions:
        retrieve_result = retriever.get_relevant_documents(question)
        contexts = [doc.page_content for doc in retrieve_result]
        csv_data.append([question, contexts, ''.join(contexts)])

    with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(csv_data)

    # NOTE(review): the arguments are presumably (local file, S3 key) - confirm
    # against the object returned by connect_to_cloud_storage().
    s3_bucket.upload_file(csv_file_path, csv_file_path)
    # NOTE(review): the message says "to Local" although the step above is an
    # upload; the wording is kept unchanged so it matches the recorded output.
    print(f"------------------- Sync: {csv_file_path} to Local: {csv_file_path}  Done-------------------")

    update_checkpoint(checkpoint_file, config.vector_db_path)

    # Drop the per-experiment objects before the next vector store is loaded.
    # Only names that are bound on every iteration are deleted, so an empty
    # `questions` list no longer triggers a NameError here.
    del csv_data, retriever, config
    gc.collect()

    end = time()
    print(f"Completed processing for vector store ./experiment_vectorstores_{chunk_size}_{chunk_overlap} 。。。。。。。。。。。 time passed: {(end - start)/60} minutes")
        



INFO:botocore.credentials:Found credentials in environment variables.
INFO:botocore.credentials:Found credentials in environment variables.
INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_100_0
INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_100_20


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_100_0.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_100_0.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_100_0 。。。。。。。。。。。 time passed: 0.3370717247327169 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_100_40


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_100_20.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_100_20.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_100_20 。。。。。。。。。。。 time passed: 0.6435261567433676 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_100_60


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_100_40.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_100_40.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_100_40 。。。。。。。。。。。 time passed: 0.9672839244206747 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_100_80


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_100_60.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_100_60.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_100_60 。。。。。。。。。。。 time passed: 1.250582500298818 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_100_100


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_100_80.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_100_80.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_100_80 。。。。。。。。。。。 time passed: 1.535608959197998 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_200_0


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_100_100.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_100_100.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_100_100 。。。。。。。。。。。 time passed: 1.821083370844523 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_200_20


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_200_0.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_200_0.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_200_0 。。。。。。。。。。。 time passed: 2.1358901143074034 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_200_40


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_200_20.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_200_20.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_200_20 。。。。。。。。。。。 time passed: 2.439734240372976 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_200_60


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_200_40.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_200_40.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_200_40 。。。。。。。。。。。 time passed: 2.771740670998891 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_200_80


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_200_60.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_200_60.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_200_60 。。。。。。。。。。。 time passed: 3.0729869325955708 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_200_100


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_200_80.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_200_80.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_200_80 。。。。。。。。。。。 time passed: 3.3739933411280316 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_300_0


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_200_100.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_200_100.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_200_100 。。。。。。。。。。。 time passed: 3.6951995650927225 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_300_20


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_300_0.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_300_0.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_300_0 。。。。。。。。。。。 time passed: 4.016998648643494 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_300_40


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_300_20.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_300_20.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_300_20 。。。。。。。。。。。 time passed: 4.388322114944458 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_300_60


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_300_40.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_300_40.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_300_40 。。。。。。。。。。。 time passed: 4.745927548408508 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_300_80


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_300_60.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_300_60.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_300_60 。。。。。。。。。。。 time passed: 5.071174240112304 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_300_100


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_300_80.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_300_80.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_300_80 。。。。。。。。。。。 time passed: 5.414012845357259 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_400_0


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_300_100.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_300_100.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_300_100 。。。。。。。。。。。 time passed: 5.742021799087524 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_400_20


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_400_0.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_400_0.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_400_0 。。。。。。。。。。。 time passed: 6.073085220654805 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_400_40


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_400_20.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_400_20.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_400_20 。。。。。。。。。。。 time passed: 6.406699721018473 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_400_60


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_400_40.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_400_40.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_400_40 。。。。。。。。。。。 time passed: 6.745708262920379 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_400_80


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_400_60.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_400_60.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_400_60 。。。。。。。。。。。 time passed: 7.081131664911906 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_400_100


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_400_80.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_400_80.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_400_80 。。。。。。。。。。。 time passed: 7.408817386627197 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_500_0


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_400_100.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_400_100.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_400_100 。。。。。。。。。。。 time passed: 7.743059925238291 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_500_20


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_500_0.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_500_0.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_500_0 。。。。。。。。。。。 time passed: 8.093995209534963 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_500_40


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_500_20.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_500_20.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_500_20 。。。。。。。。。。。 time passed: 8.471037193139393 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_500_60


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_500_40.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_500_40.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_500_40 。。。。。。。。。。。 time passed: 8.821655146280925 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_500_80


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_500_60.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_500_60.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_500_60 。。。。。。。。。。。 time passed: 9.165958940982819 minutes


INFO:root:With Prompt: rag_v1.md
INFO:root:With vector Store: ./Retrieval_eval_vectorstores/experiment_vectorstores_500_100


------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_500_80.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_500_80.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_500_80 。。。。。。。。。。。 time passed: 9.50953840414683 minutes
------------------- Sync: RAG_retriever_eval_60/experiment_vectorstores_500_100.csv to Local: RAG_retriever_eval_60/experiment_vectorstores_500_100.csv  Done-------------------
Completed processing for vector store ./experiment_vectorstores_500_100 。。。。。。。。。。。 time passed: 9.863369651635487 minutes
