In [77]:
def parse_dialogue_file(file_path: str, numlines: int = None):
    """Read a dialogue file and split every line into its turns.

    Each line of the file is treated as one dialogue whose turns are
    separated by the ``__eou__`` marker. The file is decoded as UTF-8.

    Args:
        file_path: Path of the text file to read.
        numlines: Number of lines to read from the start of the file.
            ``None`` (the default) reads the whole file.

    Returns:
        A list with one entry per line read. Each entry is the list of
        non-empty, whitespace-stripped turns found on that line (an empty
        list when the line contains no turns).
    """
    parsed_dialogues = []

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            if numlines is not None and i == numlines:
                break
            stripped_turns = (turn.strip() for turn in line.split('__eou__'))
            # Drop the empty pieces left by the trailing marker / newline.
            parsed_dialogues.append([turn for turn in stripped_turns if turn])

    return parsed_dialogues

def make_chunks(dialogues, chunk_size=8, padding=2):
    """Group consecutive utterances into overlapping text chunks.

    All dialogues are flattened into one sequence of utterances. Each chunk
    advances by ``chunk_size`` utterances and additionally includes
    ``padding`` utterances of context before and after its window, so
    neighbouring chunks share ``2 * padding`` utterances.

    Args:
        dialogues: Iterable of dialogues, each an iterable of utterance
            strings.
        chunk_size: Number of new utterances covered by each chunk; must be
            at least 1.
        padding: Number of context utterances added on each side of a chunk;
            must not be negative.

    Returns:
        A list of strings, one per chunk, with the utterances joined by a
        single space. The ends of the sequence are filled with single-space
        placeholders, so the first chunk starts with (and the last chunk may
        end with) extra whitespace. The list is empty when there are no
        utterances.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 or ``padding`` is
            negative.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if padding < 0:
        raise ValueError("padding must not be negative")

    placeholders = [' '] * padding
    all_sentences = (
        placeholders
        + [sentence for line in dialogues for sentence in line]
        + placeholders
    )

    # Window starts range over the real utterances only. A window starting
    # inside the trailing placeholders would hold nothing but context already
    # emitted by the previous chunk (or only whitespace).
    first_start = padding
    end_of_utterances = len(all_sentences) - padding
    return [
        ' '.join(all_sentences[start - padding:start + chunk_size + padding])
        for start in range(first_start, end_of_utterances, chunk_size)
    ]

# Parse only the first 10 lines of the training file (one dialogue per line)
# and group their utterances into overlapping chunks with the default
# chunk_size / padding.
file_path = 'dialogues_train.txt'
parsed = parse_dialogue_file(file_path, numlines=10)
chunks = make_chunks(parsed)


In [55]:
# stats: number of turns per dialogue over the whole file
import numpy as np
# No numlines limit here, so this rebinds `parsed` to every line of the file.
parsed = parse_dialogue_file(file_path)
# One count per dialogue: how many turns it contains.
samples = [len(sub) for sub in parsed]
avg = np.mean(samples)
median = np.median(samples)
print(f"avg: {avg:.2f}, median: {median:.2f}")
# NOTE(review): combine_dialogues is not defined in the cells visible here —
# confirm it still exists, otherwise this line fails on a fresh kernel.
combined = combine_dialogues(parsed)

print(len(combined))

avg: 7.84, median: 7.00
5231889


    Say , Jim , how about going for a few beers after dinner ? You know that is tempting but is really not good for our fitness . What do you mean ? It will help us to relax . Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? I guess you are right.But what shall we do ? I don't feel like sitting at home . I suggest a walk over to the gym where we can play singsong and meet some of our friends . That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . Good.Let ' s go now . All right .    
Good.Let ' s go now . All right .    


In [65]:
from sentence_transformers import SentenceTransformer

# SentenceTransformer instances already constructed, keyed by model name, so
# repeated encode_chunks calls reuse one loaded model.
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(model_name):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def encode_chunks(chunks, model=None, model_name='all-MiniLM-L6-v2'):
    """Embed ``chunks`` with a sentence-transformer model.

    Args:
        chunks: Text to embed; passed unchanged to ``model.encode`` (the
            notebook calls this with both a single string and a list of
            strings).
        model: Optional object exposing ``encode(texts, convert_to_tensor=...)``.
            When omitted, the model named ``model_name`` is used.
        model_name: Name of the SentenceTransformer to load when ``model`` is
            not given. The model is constructed once per name and reused on
            later calls.

    Returns:
        Whatever ``model.encode(chunks, convert_to_tensor=True)`` returns.
    """
    if model is None:
        # Constructing the model is the expensive step; do it once per name
        # rather than on every call.
        model = _get_sentence_model(model_name)
    return model.encode(chunks, convert_to_tensor=True)





In [79]:
# similarity
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def encode_text(text, model):
    """Embed ``text`` using an already-loaded ``model``.

    Args:
        text: Input handed unchanged to ``model.encode``.
        model: Object exposing ``encode(text, convert_to_tensor=...)``.

    Returns:
        The result of ``model.encode`` called with ``convert_to_tensor=True``.
    """
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding

def compare_embeddings(embedding1, embedding2):
    """Return the cosine similarity between two embeddings.

    Args:
        embedding1: First embedding; must support ``detach()``, ``cpu()`` and
            ``unsqueeze(0)`` (presumably a 1-D torch tensor from
            ``model.encode`` for a single text — confirm against callers).
        embedding2: Second embedding, same requirements.

    Returns:
        The ``[0][0]`` entry of the matrix returned by scikit-learn's
        ``cosine_similarity`` for the two single-row inputs.
    """
    # scikit-learn converts its inputs to NumPy arrays, which fails for
    # tensors that live on an accelerator device or require grad; detach and
    # copy to host memory first (both are no-ops for plain CPU tensors).
    row1 = embedding1.detach().cpu().unsqueeze(0)
    row2 = embedding2.detach().cpu().unsqueeze(0)
    similarity = cosine_similarity(row1, row2)
    return similarity[0][0]

# NOTE(review): `input` shadows the built-in of the same name; the name is
# kept because cells outside this view may read it.
input = "Jim, do you remeber the time i ask you out for beer?"

# Load the model once and reuse it for both texts through encode_text,
# instead of constructing a new SentenceTransformer for each encoding.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embed_out = encode_text(chunks[0], embedding_model)
embed_in = encode_text(input, embedding_model)

similarity_score = compare_embeddings(embed_in, embed_out)
print(f"Similarity score: {similarity_score}")




Similarity score: 0.3417988419532776


In [80]:
# compare with the one in the store

# Encode every chunk in one batched call so the model is constructed once
# rather than once per loop iteration; iterating the stacked result yields
# one embedding per chunk, in the same order as `chunks`.
chunk_embeddings = encode_chunks(chunks)
for i, embed_out in enumerate(chunk_embeddings):
    similarity_score = compare_embeddings(embed_in, embed_out)
    print(f"chunk {i} Similarity score: {similarity_score}")





chunk 0 Similarity score: 0.3417988419532776




chunk 1 Similarity score: 0.12307704985141754




chunk 2 Similarity score: 0.05789489299058914




chunk 3 Similarity score: 0.16222906112670898




chunk 4 Similarity score: 0.07259978353977203




chunk 5 Similarity score: 0.054255276918411255




chunk 6 Similarity score: 0.11087451130151749




chunk 7 Similarity score: 0.12614040076732635




chunk 8 Similarity score: 0.011110194958746433




chunk 9 Similarity score: 0.06175827607512474




chunk 10 Similarity score: 0.06776846200227737
