In [5]:
def parse_dialogue_file(file_path: str, numlines: int = None):
    parsed_dialogues = []

    with open(file_path, 'r') as file:
        for i, line in enumerate(file):    
            if numlines is not None and i == numlines:  break
            turns = line.strip().split('__eou__')
            turns = [turn.strip() for turn in turns if turn.strip()]
            parsed_dialogues.append(turns)

    return parsed_dialogues

def make_chunks(dialogues, chunk_size=8, padding=2):
    all_sentences = [' ']*padding + [sentence for line in dialogues for sentence in line] + [' ']*padding
    return [
        ' '.join(all_sentences[i - padding:i + chunk_size + padding])
        for i in range(padding, len(all_sentences)+padding, chunk_size)
    ]

file_path = 'dialogues_train.txt'
parsed = parse_dialogue_file(file_path, numlines=10)
chunks = make_chunks(parsed)


In [15]:
from sentence_transformers import SentenceTransformer

def encode_chunks(chunks):
    model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L12-cos-v5', trust_remote_code=True)
    embeddings = model.encode(chunks)
    return embeddings.tolist()

In [16]:
# similarity
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def encode_text(text, model):
    return model.encode(text, convert_to_tensor=True)

def compare_embeddings(embedding1, embedding2):
    similarity = cosine_similarity([embedding1], [embedding2])
    return similarity[0][0]

input = "Jim, do you remeber the time i ask you out for beer?"
embed_out = encode_chunks(chunks    =chunks[0])
embed_in = encode_chunks(input)

similarity_score = compare_embeddings(embed_in, embed_out)
print(f"Similarity score: {similarity_score}")




Similarity score: 0.30492570287079035


In [19]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(list1, list2):
    # Convert lists to 2D arrays as required by scikit-learn
    vec1 = [list1]
    vec2 = [list2]

    # Compute the cosine similarity
    similarity = cosine_similarity(vec1, vec2)

    # cosine_similarity returns a 2D array, so we need to extract the value
    return similarity[0][0]
list1 = [1, 2, 3]
list2 = [1, 2, 4]
similarity = calculate_cosine_similarity(list1, list2)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.9914601339836674


In [9]:
print(chunks[0])

    Say , Jim , how about going for a few beers after dinner ? You know that is tempting but is really not good for our fitness . What do you mean ? It will help us to relax . Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? I guess you are right.But what shall we do ? I don't feel like sitting at home . I suggest a walk over to the gym where we can play singsong and meet some of our friends . That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . Good.Let ' s go now . All right .


In [14]:
# compare with the one in the store
chunks.append("What are your responsibilities in your present work ? My work involves various routine bookkeeping and basic accounting tasks including journal entries , verifying data and reconciling discrepancies , preparing detailed reports from raw data , and checking accounting documents for completeness , mathematical accuracy and consistency . Are you familiar with the PRC Financial and Tax Regulations ? I think so . Can you tell me something about this balance sheet now ? Of course . This balance sheet contains three major sections , that is , assets , liabilities and owner's equity . So , you see , the total current liabilities of your company are $ 3,372 , 000 , and the owner's equity is $ 5,400 , 000 . That means that the total assets , which is equal to the sum of the creditor's and the owner's equities , are $ 8,772 , 000 . What's the creditor's equity ? The creditor's equity is the same as liabilities . Will you tell me the situation ? I was in my friend's room talking for an hour of so . And then ? I came back to my room and found that my suitcase was open and my camera and five hundred dollars in cash inside the wallet were gone .")
for i, chunk in enumerate(chunks):
    embed_out = encode_chunks(chunk)   
    similarity_score = compare_embeddings(embed_in, embed_out)
    print(f"chunk {i} Similarity score: {similarity_score}")





chunk 0 Similarity score: 0.30492570287079035




chunk 1 Similarity score: 0.22387367742001918




chunk 2 Similarity score: 0.19060243650162173




chunk 3 Similarity score: 0.1288242875911181




chunk 4 Similarity score: -0.028113413475613434




chunk 5 Similarity score: 0.05180320206651985




chunk 6 Similarity score: 0.1178276812252784




chunk 7 Similarity score: 0.16093966368950224




chunk 8 Similarity score: 0.29293096181321504




chunk 9 Similarity score: 0.20437515405198486




chunk 10 Similarity score: 0.18013032452969915




chunk 11 Similarity score: -0.15174401830351064




chunk 12 Similarity score: -0.14995562429107617




chunk 13 Similarity score: -0.14995562429107617
