In [1]:
from glob import glob
from tqdm.auto import tqdm

In [2]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
from ftfy import fix_text
import os
import json
# Module-level tokenizer; hansard_to_docs calls enc.encode(...) to measure text in tokens.
# NOTE(review): presumably this resolves to the same cl100k_base encoding that the
# text splitter inside hansard_to_docs is configured with - confirm.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")




def hansard_to_docs(hansard_filepaths, max_chunk_length=800):
    """Chunk Hansard JSONL transcripts into token-bounded langchain Documents.

    Every non-blank line of every input file is parsed as a JSON object and
    its ``speaker`` and ``text`` fields are read. Speeches are processed in
    file order:

    * A speech whose formatted length exceeds ``max_chunk_length`` tokens is
      split with a ``RecursiveCharacterTextSplitter``; every piece becomes its
      own Document and pieces after the first get the ``date/speaker`` prefix
      re-attached so each piece is self-describing.
    * Shorter speeches are buffered and merged (joined by blank lines) into a
      single Document once adding another speech would exceed the limit.

    Args:
        hansard_filepaths: Iterable of paths to ``.jsonl`` files. The sitting
            date is taken from the 10 characters before the ``.jsonl`` suffix
            of the path, and the ``filename`` metadata is the last path
            component with its final 6 characters removed.
        max_chunk_length: Token budget per chunk, measured with the
            module-level ``enc`` tokenizer.

    Returns:
        list of ``Document``. Metadata keys: ``filename``, ``length`` (token
        count of the final page content), ``clause_no`` (always ``''``),
        ``split_no`` (string index for pieces of a split speech, integer ``0``
        for merged chunks) and ``source`` (``'C'`` + 5-digit running number,
        unique across the returned list).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``speaker`` or ``text``.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name='cl100k_base',
        chunk_size=max_chunk_length,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []

    def add_doc(page_content, filename, split_no):
        # Derive the key from the number of documents emitted so far rather
        # than a hand-maintained counter, so no code path can forget to
        # advance it and hand out the same key twice.
        source_key = 'C' + "{:05d}".format(len(doc_chunks) + 1)
        doc_chunks.append(
            Document(
                page_content=page_content,
                metadata={
                    "filename": filename,
                    # Actual length of the stored text; for split pieces this
                    # can exceed max_chunk_length by the re-attached prefix.
                    "length": len(enc.encode(page_content)),
                    "clause_no": '',
                    "split_no": split_no,
                    "source": source_key,
                },
            )
        )

    def flush(buffered, date, filename):
        # Merge the buffered short speeches into one Document (no-op if empty).
        if not buffered:
            return
        merged_text = '\n\n'.join(
            date + ': ' + t['speaker'] + ': ' + t['text'] for t in buffered
        )
        add_doc(merged_text, filename, 0)

    for hansard_fp in tqdm(hansard_filepaths):
        filename_truncated = hansard_fp[:-6].split(os.path.sep)[-1]
        date = hansard_fp[-16:-6]
        buffered = []
        buffered_length = 0

        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(hansard_fp, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue

                hansard_info = json.loads(line)
                prefix = date + ':\t' + hansard_info['speaker'] + ':\t'
                content = prefix + hansard_info['text']
                content_length = len(enc.encode(content))

                if content_length > max_chunk_length:
                    # Emit what came before the long speech first so chunks
                    # stay in transcript order and a merged chunk never joins
                    # speeches from either side of it.
                    flush(buffered, date, filename_truncated)
                    buffered, buffered_length = [], 0

                    for split_no, piece in enumerate(text_splitter.split_text(content)):
                        if split_no > 0:
                            piece = prefix + piece
                        add_doc(piece, filename_truncated, str(split_no))
                    continue

                if buffered_length + content_length > max_chunk_length:
                    flush(buffered, date, filename_truncated)
                    buffered, buffered_length = [], 0

                # Always keep the current speech: it either joins the existing
                # buffer or starts the next one (previously it was discarded
                # whenever it triggered a flush).
                buffered.append(hansard_info)
                buffered_length += content_length

        # Write out whatever is left for this file.
        flush(buffered, date, filename_truncated)

    return doc_chunks

In [5]:
# glob returns matches in arbitrary order; sort them so the files are always
# processed in the same order and the chunk numbering assigned by
# hansard_to_docs is reproducible between runs.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
hansard_fps_2020s = sorted(glob('/home/watsonchua/work/others/hansard_extract_parse/sessions_by_sitting/sitting_202*.jsonl'))

In [6]:
# Chunk every selected sitting with a 600-token budget per chunk.
hansard_chunks = hansard_to_docs(
    hansard_filepaths=hansard_fps_2020s,
    max_chunk_length=600,
)

  0%|          | 0/97 [00:00<?, ?it/s]

In [8]:
# Sanity check: the ten largest chunk token counts, largest first.
sorted((doc.metadata['length'] for doc in hansard_chunks), reverse=True)[:10]

[599, 599, 598, 598, 598, 597, 597, 597, 597, 597]

In [9]:
from langchain.embeddings.openai import OpenAIEmbeddings
import toml, openai
from langchain.vectorstores.faiss import FAISS

# Read credentials from a local TOML secrets file instead of embedding them in the notebook.
# NOTE(review): absolute, user-specific path - the notebook will not run on another
# machine without editing it.
secrets = toml.load('/home/watsonchua/work/im_question_answering/.streamlit/secrets.toml')

# Configure the module-level openai settings for an Azure-hosted endpoint.
openai_api_key = secrets['openai_api_key_azure']
openai.api_key = openai_api_key
openai.api_type = "azure"
openai.api_base = "https://govtext-ds-experiment.openai.azure.com/"
openai.api_version = "2023-03-15-preview"
# NOTE(review): presumably these are Azure deployment names rather than plain model names - confirm.
# azure_completion_engine = "text-davinci-003-pretrained"
azure_completion_engine = "gpt-35-turbo"
azure_embedding_engine = "text-embedding-ada-002"

# The same embedding engine is used for both queries and documents.
# NOTE(review): chunk_size=1 presumably limits each embedding request to a single
# text to satisfy an Azure-side batch restriction - confirm before raising it.
oai_embedder = OpenAIEmbeddings(query_model_name=azure_embedding_engine, document_model_name=azure_embedding_engine, openai_api_key=openai_api_key, chunk_size=1)

In [10]:
# Build a FAISS vector store from the Hansard chunks using oai_embedder, then
# persist it under ./hansard_docs_2020s (relative to the working directory).
# NOTE(review): presumably this embeds every chunk through the remote API, so
# re-running the cell may be slow and billable - confirm before Run All.
db = FAISS.from_documents(hansard_chunks, oai_embedder)
db.save_local('./hansard_docs_2020s')