In [22]:
import json
import os

# Map each corpus name to the JSON file holding it and the type label
# attached to every document loaded from that file.
file_details = {
    "articles": {"path": "documents/articles.json", "type": "article"},
    "hackernews": {"path": "documents/hackernews.json", "type": "hacker-news"},
    "papers": {"path": "documents/papers.json", "type": "paper"},
    "reddit": {"path": "documents/reddit.json", "type": "reddit"}
}

# Load the json data with type tracking: data[key] is a list of
# (title, text, doc_type) tuples, one per entry in the file.
data = {}
for key, details in file_details.items():
    path = details["path"]
    doc_type = details["type"]
    # Name the encoding explicitly: without it open() falls back to the
    # platform's locale encoding, which can fail or mis-decode UTF-8 JSON
    # (e.g. on Windows).
    with open(path, 'r', encoding='utf-8') as file:
        entries = json.load(file)
    # Raises KeyError if an entry lacks 'title' or 'text' (same as before).
    data[key] = [(entry['title'], entry['text'], doc_type) for entry in entries]

# Combine titles and texts into one string per document; `types` is the
# parallel list of type labels, in the same order as `documents`.
documents = []
types = []

for key in data:
    for title, text, doc_type in data[key]:
        documents.append(title + " " + text)
        types.append(doc_type)

# Preprocess the text
def preprocess_text(text):
    """Reduce a document to its lowercase, alphanumeric, non-stop-word tokens.

    NOTE(review): relies on `word_tokenize` and `stopwords` already being in
    scope; they are not imported in the visible cells (presumably NLTK,
    imported in an earlier cell) -- confirm.

    Args:
        text: Raw document string.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    tokens = word_tokenize(text.lower())
    # Build the stop-word collection once per call and use a set for O(1)
    # membership; previously stopwords.words('english') was re-evaluated
    # for every token and scanned as a list.
    stop_words = set(stopwords.words('english'))
    return ' '.join(
        word for word in tokens
        if word.isalnum() and word not in stop_words
    )

# Clean every combined document; both embedding methods below consume this list.
processed_documents = list(map(preprocess_text, documents))

# TF-IDF Vectorizer: fit on the cleaned corpus, then convert the result to a
# dense array.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_documents)
tfidf_embeddings = tfidf_matrix.toarray()

# Create tagged documents for Doc2Vec: whitespace-split tokens, tagged with
# the document's position (as a string).
tagged_documents = []
for position, cleaned in enumerate(processed_documents):
    tagged_documents.append(
        TaggedDocument(words=cleaned.split(), tags=[str(position)])
    )

# Initialize and train Doc2Vec model
doc2vec_settings = dict(vector_size=100, window=5, min_count=1, workers=4, epochs=40)
doc2vec_model = Doc2Vec(**doc2vec_settings)
doc2vec_model.build_vocab(tagged_documents)
doc2vec_model.train(
    tagged_documents,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

# Infer embeddings: one infer_vector call per cleaned document, in corpus order.
doc2vec_embeddings = []
for cleaned in processed_documents:
    doc2vec_embeddings.append(doc2vec_model.infer_vector(cleaned.split()))

In [23]:
import umap
import plotly.express as px

def _scatter_projection(coords, doc_types, hover, plot_title):
    """Build an interactive scatter plot of a 2-D UMAP projection.

    Args:
        coords: 2-D array indexed as coords[:, 0] / coords[:, 1], one row per document.
        doc_types: Per-document type labels; used to colour the points.
        hover: Per-document strings shown in the hover tooltip.
        plot_title: Title of the figure.

    Returns:
        The plotly figure (not yet displayed).
    """
    fig = px.scatter(
        x=coords[:, 0], y=coords[:, 1],
        color=doc_types, hover_data={'text': hover},
        labels={'x': 'UMAP 1', 'y': 'UMAP 2'},
        title=plot_title
    )
    # Shrink the markers. No selector: the previous
    # selector=dict(mode='markers+text') matched no trace (px.scatter without
    # `text=` produces mode='markers'), so the size was never applied.
    fig.update_traces(marker=dict(size=5))
    return fig


# UMAP is stochastic; a fixed random_state makes the layouts repeatable
# across runs (per the UMAP docs this may reduce parallelism).

# Apply UMAP to TF-IDF embeddings
umap_tfidf = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='cosine', random_state=42)
tfidf_umap_embeddings = umap_tfidf.fit_transform(tfidf_embeddings)

# Apply UMAP to Doc2Vec embeddings
umap_doc2vec = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='cosine', random_state=42)
doc2vec_umap_embeddings = umap_doc2vec.fit_transform(doc2vec_embeddings)

# Per-document hover titles and type labels, in the same order as the embeddings
hover_texts = [f"Title: {entry[0]}" for key in data for entry in data[key]]
document_types = [entry[2] for key in data for entry in data[key]]

# Create interactive plot for TF-IDF embeddings
fig_tfidf = _scatter_projection(
    tfidf_umap_embeddings, document_types, hover_texts,
    'UMAP Projection of TF-IDF Embeddings'
)
fig_tfidf.show()

# Create interactive plot for Doc2Vec embeddings
fig_doc2vec = _scatter_projection(
    doc2vec_umap_embeddings, document_types, hover_texts,
    'UMAP Projection of Doc2Vec Embeddings'
)
fig_doc2vec.show()


In [32]:
from sentence_transformers import SentenceTransformer


# Generate SentenceTransformers embeddings
# NOTE(review): this encodes the lowercased, stop-word-stripped text; the raw
# `documents` may suit a sentence-embedding model better -- confirm intent.
model = SentenceTransformer('all-mpnet-base-v2')
sentence_embeddings = model.encode(processed_documents)

# Apply UMAP to SentenceTransformers embeddings.
# UMAP is stochastic; a fixed random_state makes the layout repeatable
# across runs (per the UMAP docs this may reduce parallelism).
umap_sentence = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='cosine', random_state=42)
sentence_umap_embeddings = umap_sentence.fit_transform(sentence_embeddings)

# Per-document hover titles and type labels, in the same order as the embeddings
hover_texts = [f"Title: {entry[0]}" for key in data for entry in data[key]]
document_types = [entry[2] for key in data for entry in data[key]]

# Create interactive plot for SentenceTransformers embeddings
fig_sentence = px.scatter(
    x=sentence_umap_embeddings[:, 0], y=sentence_umap_embeddings[:, 1],
    color=document_types, hover_data={'text': hover_texts},
    labels={'x': 'UMAP 1', 'y': 'UMAP 2'},
    title='UMAP Projection of SentenceTransformers Embeddings'
)

# Shrink the markers. No selector: the previous
# selector=dict(mode='markers+text') matched no trace (px.scatter without
# `text=` produces mode='markers'), so the size was never applied.
fig_sentence.update_traces(marker=dict(size=5))

fig_sentence.show()

In [None]:
# TODO: interaction notes for the plots (to be fleshed out):
# - click
# - zoom in
#
# - wants another query