In [1]:
from tqdm import tqdm


def parse_dialogue_file(file_path: str, numlines: int = None):
    """Read a dialogue file and split every line into its utterances.

    Each line of the file is treated as one dialogue whose turns are
    separated by the literal marker ``__eou__``. Turns are stripped of
    surrounding whitespace and empty turns are dropped. A line that yields
    no turns (for example a blank line) still contributes an empty list.

    Args:
        file_path: Path of the text file to read. It is decoded as UTF-8.
        numlines: If not None, stop after this many lines have been read.
            ``None`` (the default) reads the whole file.

    Returns:
        A list with one entry per line read; each entry is the list of
        non-empty, stripped turns found on that line.
    """
    parsed_dialogues = []

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            if numlines is not None and i == numlines:
                break
            turns = [turn.strip() for turn in line.split('__eou__')]
            parsed_dialogues.append([turn for turn in turns if turn])

    return parsed_dialogues

def make_chunks(dialogues, chunk_size=8, padding=2):
    """Flatten dialogues into sentences and join them into overlapping windows.

    All turns of all dialogues are concatenated into one sentence list.
    Window k starts at sentence ``k * chunk_size`` and covers ``chunk_size``
    sentences plus ``padding`` sentences of context on either side. At the
    two ends of the corpus the missing context is filled with ``' '``
    placeholders, so the first and last chunk carry extra whitespace.

    Args:
        dialogues: Iterable of dialogues, each an iterable of sentence strings.
        chunk_size: Number of new sentences each window advances by.
        padding: Number of context sentences included before and after.

    Returns:
        A list of strings, one per window, each the space-joined sentences
        of that window. Empty input yields an empty list.
    """
    sentences = [sentence for line in dialogues for sentence in line]
    filler = [' '] * padding
    all_sentences = filler + sentences + filler

    # Window starts must stay within the real sentences, which occupy indices
    # [padding, padding + len(sentences)) of the padded list. Running the range
    # up to len(all_sentences) + padding overshoots by 2 * padding and can emit
    # a trailing chunk that holds only overlap and filler (or nothing at all).
    stop = padding + len(sentences)
    return [
        ' '.join(all_sentences[i - padding:i + chunk_size + padding])
        for i in range(padding, stop, chunk_size)
    ]


In [2]:
#  encode all data
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model once for the whole notebook.
# `trust_remote_code=True` was dropped: it allows code shipped in the Hub
# repository to execute locally, and this checkpoint does not appear to need
# it (it is expected to load with the library's built-in model classes).
embed_model = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L12-cos-v5")
# Embedding width reported by the model; later cells pass it to the vector store.
embed_model_dims = embed_model.get_sentence_embedding_dimension()

def encode_chunks(chunks):
    """Embed ``chunks`` with the module-level ``embed_model``.

    The argument is forwarded unchanged to ``embed_model.encode`` and the
    result is converted with ``.tolist()`` before being returned.

    Args:
        chunks: Text to embed. Callers in this notebook pass a single string.

    Returns:
        The output of ``embed_model.encode(chunks)`` after ``.tolist()``.
    """
    return embed_model.encode(chunks).tolist()

# Corpus location, given relative to the current working directory.
file_path = 'dialogues_train.txt'
# Read the whole file (no `numlines` limit): one list of turns per line.
parsed = parse_dialogue_file(file_path)
# Window the turns with make_chunks' default chunk_size and padding.
chunks = make_chunks(parsed)


  from tqdm.autonotebook import tqdm, trange


In [3]:

# Push data to the database: set up the TiDB vector-store client.
import os
import sys
from datetime import datetime
# Put ./third_party/tidb-vector-python (resolved against the current working
# directory) at the front of sys.path so the `tidb_vector` import below can be
# resolved from there -- presumably a vendored checkout of the client library.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), 'third_party', 'tidb-vector-python')))

from tidb_vector.integrations import TiDBVectorClient
from dotenv import load_dotenv

# Load the connection string from the .env file
load_dotenv()

# NOTE(review): os.environ.get returns None when TIDB_DATABASE_URL is unset;
# confirm how TiDBVectorClient behaves in that case.
vector_store = TiDBVectorClient(
   # The table which will store the vector data.
   table_name='embedded_documents',
   # The connection string to the TiDB cluster.
   connection_string=os.environ.get('TIDB_DATABASE_URL'),
   # The dimension of the vector generated by the embedding model
   # (embed_model_dims is defined in the embedding-model cell).
   vector_dimension=embed_model_dims,
   # Determine whether to recreate the table if it already exists.
   # Left commented out, so the keyword is not passed to the client.
#    drop_existing_table=True,
)

# Embed every chunk and upload the rows in roughly ten batches.
# max(1, ...) keeps the batch size valid when there are fewer than ten chunks;
# the earlier `i % (len(chunks)//10)` divided by zero in that case.
batch_size = max(1, len(chunks) // 10)
data = []   # rows waiting to be uploaded; initialised here so a fresh kernel works
count = 0   # batches flushed so far, used only in the progress message
for chunk in tqdm(chunks, total=len(chunks)):
    data.append({"embedding": encode_chunks(chunks=chunk), "text": chunk})

    if len(data) >= batch_size:
        vector_store.insert(
            embeddings=[d["embedding"] for d in data],
            texts=[d["text"] for d in data],
        )
        print(f"chunk {count} upload to server")
        data = []
        count += 1

# Flush the leftover rows; skip the call when the last batch was already full
# so the client is never asked to insert empty lists.
if data:
    print("upload the remaining...")
    vector_store.insert(
        embeddings=[d["embedding"] for d in data],
        texts=[d["text"] for d in data],
    )
print("done")
        

 10%|█         | 1093/10897 [01:37<2:33:01,  1.07it/s]

chunk 0 upload to server


 20%|██        | 2183/10897 [03:12<1:56:44,  1.24it/s]

chunk 1 upload to server


 30%|███       | 3270/10897 [04:56<2:05:56,  1.01it/s]

chunk 2 upload to server


 40%|████      | 4360/10897 [06:41<1:27:19,  1.25it/s]

chunk 3 upload to server


 50%|█████     | 5449/10897 [08:01<1:13:15,  1.24it/s]

chunk 4 upload to server


 60%|█████▉    | 6538/10897 [09:16<1:03:00,  1.15it/s]

chunk 5 upload to server


 70%|██████▉   | 7627/10897 [10:51<44:06,  1.24it/s]  

chunk 6 upload to server


 80%|███████▉  | 8716/10897 [12:28<33:34,  1.08it/s]

chunk 7 upload to server


 90%|████████▉ | 9804/10897 [14:04<17:31,  1.04it/s]

chunk 8 upload to server


100%|█████████▉| 10895/10897 [15:45<00:01,  1.22it/s]

chunk 9 upload to server


100%|██████████| 10897/10897 [15:45<00:00, 11.52it/s]


upload the remaining...
done


In [10]:
import time

# Connect to the vector store for querying. This repeats the client setup used
# by the upload cell and targets the same table name.
import os
import sys
# Put ./third_party/tidb-vector-python (resolved against the current working
# directory) at the front of sys.path so the `tidb_vector` import below can be
# resolved from there -- presumably a vendored checkout of the client library.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), 'third_party', 'tidb-vector-python')))

from tidb_vector.integrations import TiDBVectorClient
from dotenv import load_dotenv

# Load the connection string from the .env file
load_dotenv()

# NOTE(review): os.environ.get returns None when TIDB_DATABASE_URL is unset;
# confirm how TiDBVectorClient behaves in that case.
vector_store = TiDBVectorClient(
   # The table which will store the vector data.
   table_name='embedded_documents',
   # The connection string to the TiDB cluster.
   connection_string=os.environ.get('TIDB_DATABASE_URL'),
   # The dimension of the vector generated by the embedding model
   # (embed_model_dims is defined in the embedding-model cell).
   vector_dimension=embed_model_dims,
   # Determine whether to recreate the table if it already exists.
   # Left commented out, so the keyword is not passed to the client.
#    drop_existing_table=True,
)

def print_result(query, result):
    """Print a search query followed by one line per hit.

    Args:
        query: The query text, echoed in the header line.
        result: Iterable of hits; each hit must expose ``document`` and
            ``distance`` attributes.
    """
    print(f'Search result ("{query}"):')
    for hit in result:
        text, dist = hit.document, hit.distance
        print(f'- text: "{text}", distance: {dist}')

# Embed one example query and fetch its three nearest stored chunks.
query = "Say , Jim , how about going for a few beers after dinner?"
query_embedding = encode_chunks(query)
# Time only the database round trip. perf_counter is the clock intended for
# measuring elapsed intervals; time.time() is wall-clock and can jump if the
# system clock is adjusted.
start = time.perf_counter()
search_result = vector_store.query(query_embedding, k=3)
print(f"respond in {time.perf_counter()-start}")
print_result(query, search_result)

respond in 0.20792269706726074
Search result ("Jim, do you remeber the time i ask you out for beer?"):
- text: "What's wrong ? There's a girl in my company that I really like but I always get shy when she is around . I see ! Do you want to ask her out ? Sure , but how ? You can ask her out for drinks after work . But for what reasons ? She doesn't even know who I am . Then you've got a lot of work to do . You need to get her to notice you first . Easier said than done . You can start by meeting her at the bus stop and saying ' hello ' to her . But I always get tongue-tied when I see her . That's something you need to overcome . Men should make the first move as most girls prefer being chased . I see . I'll try .", distance: 0.479329700310144
- text: "Sure , but how ? You can ask her out for drink after work . But for what reasons ? She doesn't even know who I am . Then you've got a lot of homework to do . You need to get her notice first . Easier said than done . You can start by meeti