In [None]:
import numpy as np
from scipy.spatial.distance import cosine
from fastdtw import fastdtw
from sentence_transformers import SentenceTransformer

In [None]:
# Identifier of the pretrained checkpoint, kept in one place so it is easy to swap.
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(MODEL_NAME)

In [3]:
def read_senteces(filename: str, encoding: str = 'utf-8') -> list:
    """Read a text file and split its content into sentences.

    The text is split on every run of '.', ':', '!', '?' or newline
    characters. Each fragment is stripped of surrounding whitespace, and
    fragments that are empty after stripping are dropped, so blank lines and
    the trailing remainder after the final punctuation mark do not end up in
    the result as pseudo-sentences.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        A list of non-empty sentence strings in their order of appearance.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid in `encoding`.
    """
    import re

    with open(filename, 'r', encoding=encoding) as file:
        text = file.read()

    # Sentence boundaries: one or more of . : ! ? or line breaks.
    fragments = (fragment.strip() for fragment in re.split(r'[.:!?\n]+', text))
    return [fragment for fragment in fragments if fragment]

# Split both input texts into sentence lists: students first, then the blog.
sentences_students, sentences_blog = [
    read_senteces(source_file)
    for source_file in ('text_students.txt', 'text_blog.txt')
]


In [None]:
# Dump the full list of blog sentences to inspect how the text was split.
print(sentences_blog)

In [50]:
# Encode both sentence lists with the same model: students first, blog second.
embeddings_students, embeddings_blog = map(
    model.encode, (sentences_students, sentences_blog)
)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the student embeddings (first argument)
# and the blog embeddings (second argument).
cosine_similarities = cosine_similarity(embeddings_students, embeddings_blog)

# Sanity check: shapes of both inputs and of the resulting similarity matrix.
print(*(matrix.shape for matrix in (embeddings_students, embeddings_blog, cosine_similarities)))

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Heatmap of the similarity matrix; values are clipped to [0.3, 1] before plotting.
fig, ax = plt.subplots(figsize=(5, 3))
sns.heatmap(cosine_similarities.clip(0.3, 1), annot=False, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Cosine Similarities Heatmap')
ax.set_xlabel('Blog Sentences')
ax.set_ylabel('Student Sentences')
plt.show()

In [None]:
# Align the student and blog embedding sequences with FastDTW, using the
# cosine distance as the cost between two embeddings.
distance, path = fastdtw(embeddings_students, embeddings_blog, dist=cosine)

# Report the accumulated distance and the alignment path, one per line.
print(f"DTW distance: {distance}", f"DTW path: {path}", sep="\n")

In [None]:
# Convert the alignment path to an array so its two columns can be sliced.
path_ = np.array(path)
# DTW distance divided by the number of steps in the path.
avg_cos_dist = distance / len(path_)
print(avg_cos_dist)

# Draw the (unclipped) similarity heatmap and trace the DTW path on top of it.
fig, ax = plt.subplots(figsize=(5, 3))
sns.heatmap(cosine_similarities, annot=False, fmt=".2f", cmap='coolwarm', ax=ax)
# Path column 1 goes on the x axis and column 0 on the y axis; the 0.5 offset
# presumably centres the line inside the heatmap cells.
ax.plot(path_[:, 1] + 0.5, path_[:, 0] + 0.5, color='black')
ax.set_title(f'Cosine Similarities Heatmap\nAvg DTW distance: {avg_cos_dist:.2f}')
ax.set_xlabel('Blog Sentences')
ax.set_ylabel('Student Sentences')
plt.show()

In [None]:
# Look up the similarity value at every (row, column) step of the alignment path.
similarities = [cosine_similarities[row, col] for row, col in path_]

# Distribution of the similarities along the path.
fig, ax = plt.subplots()
ax.hist(similarities, bins=20, color='blue', edgecolor='black')
ax.set_title('Histogram of Similarities')
ax.set_xlabel('Similarity')
ax.set_ylabel('Frequency')
plt.show()