In [1]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sample paragraphs: a toy corpus of five one-sentence paragraphs.
# List order matters: later cells use each paragraph's position as its
# document tag and as its row/column index in the similarity matrix.
paragraphs = [
    "Dog and cat are common house pets.",
    "Computer programming involves coding.",
    "Sports like soccer and basketball are popular.",
    "Music genres include rock and classical.",
    "Baking bread requires yeast and flour."
]

In [2]:
# Wrap every paragraph in a TaggedDocument: the words are the paragraph's
# whitespace-separated tokens and the single tag is its list position as a string.
tagged_data = [
    TaggedDocument(words=text.split(), tags=[str(position)])
    for position, text in enumerate(paragraphs)
]

In [3]:
# Fixed seed so that Restart & Run All reproduces the same vectors and scores.
RANDOM_SEED = 42

# Initialize the model.
# - vector_size=20: each document is embedded as a 20-dimensional vector.
# - window=2, min_count=1: small context window; keep every token, since the
#   toy corpus is tiny and most words occur only once.
# - epochs=100: number of training passes over the corpus.
# - seed + workers=1: gensim only trains deterministically when it is seeded
#   AND restricted to a single worker thread (multi-threaded training
#   introduces scheduling jitter). Full reproducibility across interpreter
#   launches may additionally require setting PYTHONHASHSEED.
model = Doc2Vec(
    vector_size=20,
    window=2,
    min_count=1,
    workers=1,
    epochs=100,
    seed=RANDOM_SEED,
)

# Build the vocabulary from the tagged documents (this also records the
# corpus size used by train() below).
model.build_vocab(tagged_data)

# Train the model for the configured number of epochs.
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [4]:
# Infer one embedding per paragraph from the trained model, tokenizing with the
# same whitespace split that was used to build the training documents.
paragraph_vectors = [
    model.infer_vector(tokens) for tokens in map(str.split, paragraphs)
]

In [5]:
# Stack the per-paragraph vectors into a single 2-D numpy array
# (one row per paragraph) before handing them to scikit-learn.
paragraph_vectors = np.array(paragraph_vectors)

# Pairwise cosine similarity between all rows: entry [i][j] is the score
# between paragraph i and paragraph j.
similarity_matrix = cosine_similarity(X=paragraph_vectors)

In [None]:
# Report the similarity score for every ordered pair of distinct paragraphs,
# showing both paragraph texts next to the score.
print("Similarity Matrix:")
for row, first_text in enumerate(paragraphs):
    for col, second_text in enumerate(paragraphs):
        if row == col:
            # Skip the diagonal: a paragraph compared with itself.
            continue
        print("Similarity between:")
        print(f"  Paragraph {row + 1}: {first_text}")
        print(f"  Paragraph {col + 1}: {second_text}")
        print(f"  Score: {similarity_matrix[row][col]:.4f}")
        print("----")