In [1]:
import json
import os
from sentence_transformers import SentenceTransformer
import umap
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Define the file paths
file_details = {
    "articles": {"path": "documents/articles.json", "type": "article"},
    "hackernews": {"path": "documents/hackernews.json", "type": "hacker-news"},
    "papers": {"path": "documents/papers.json", "type": "paper"},
    "reddit": {"path": "documents/reddit.json", "type": "reddit"}
}

# Load the json data with type tracking
data = {}
for key, details in file_details.items():
    path = details["path"]
    doc_type = details["type"]
    with open(path, 'r') as file:
        entries = json.load(file)
        data[key] = [(entry['title'], entry['text'], doc_type) for entry in entries]

# Combine titles and texts
documents = []
types = []

for key in data:
    for entry in data[key]:
        title, text, doc_type = entry
        documents.append(title + " " + text)
        types.append(doc_type)

# Preprocess the text
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

processed_documents = [preprocess_text(doc) for doc in documents]

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_embeddings = tfidf_vectorizer.fit_transform(processed_documents).toarray()

# Create tagged documents for Doc2Vec
tagged_documents = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(processed_documents)]

# Initialize and train Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=40)
doc2vec_model.build_vocab(tagged_documents)
doc2vec_model.train(tagged_documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Infer embeddings
doc2vec_embeddings = [doc2vec_model.infer_vector(doc.split()) for doc in processed_documents]

In [23]:
import umap
import plotly.express as px

# Apply UMAP to TF-IDF embeddings
umap_tfidf = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='cosine')
tfidf_umap_embeddings = umap_tfidf.fit_transform(tfidf_embeddings)

# Apply UMAP to Doc2Vec embeddings
umap_doc2vec = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='cosine')
doc2vec_umap_embeddings = umap_doc2vec.fit_transform(doc2vec_embeddings)

# Combine titles, texts, and types for hover information
hover_texts = [f"Title: {entry[0]}" for key in data for entry in data[key]]
document_types = [entry[2] for key in data for entry in data[key]]

# Create interactive plot for TF-IDF embeddings
fig_tfidf = px.scatter(
    x=tfidf_umap_embeddings[:, 0], y=tfidf_umap_embeddings[:, 1],
    color=document_types, hover_data={'text': hover_texts},
    labels={'x': 'UMAP 1', 'y': 'UMAP 2'},
    title='UMAP Projection of TF-IDF Embeddings'
)

# Update hover data
fig_tfidf.update_traces(marker=dict(size=5),
                        selector=dict(mode='markers+text'))

fig_tfidf.show()

# Create interactive plot for Doc2Vec embeddings
fig_doc2vec = px.scatter(
    x=doc2vec_umap_embeddings[:, 0], y=doc2vec_umap_embeddings[:, 1],
    color=document_types, hover_data={'text': hover_texts},
    labels={'x': 'UMAP 1', 'y': 'UMAP 2'},
    title='UMAP Projection of Doc2Vec Embeddings'
)

# Update hover data
fig_doc2vec.update_traces(marker=dict(size=5),
                          selector=dict(mode='markers+text'))

fig_doc2vec.show()


In [2]:
from sentence_transformers import SentenceTransformer


# Generate SentenceTransformers embeddings
model = SentenceTransformer('all-mpnet-base-v2')
sentence_embeddings = model.encode(processed_documents)

# Apply UMAP to SentenceTransformers embeddings
umap_sentence = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='cosine')
sentence_umap_embeddings = umap_sentence.fit_transform(sentence_embeddings)

# Combine titles, texts, and types for hover information
hover_texts = [f"Title: {entry[0]}" for key in data for entry in data[key]]
document_types = [entry[2] for key in data for entry in data[key]]

# Create interactive plot for SentenceTransformers embeddings
fig_sentence = px.scatter(
    x=sentence_umap_embeddings[:, 0], y=sentence_umap_embeddings[:, 1],
    color=document_types, hover_data={'text': hover_texts},
    labels={'x': 'UMAP 1', 'y': 'UMAP 2'},
    title='UMAP Projection of SentenceTransformers Embeddings'
)

# Update hover data
fig_sentence.update_traces(marker=dict(size=5),
                           selector=dict(mode='markers+text'))

fig_sentence.show()

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [23]:
sentence_embeddings

array([[-0.01981317,  0.0517722 , -0.0261653 , ..., -0.00884733,
        -0.0388605 , -0.03333718],
       [ 0.04656151,  0.02548499,  0.04058413, ...,  0.03091259,
        -0.0838467 ,  0.01230207],
       [ 0.0959978 ,  0.02622402, -0.01823433, ..., -0.00055361,
        -0.00695292, -0.01642639],
       ...,
       [ 0.00591025,  0.0455336 , -0.00619785, ...,  0.02778248,
        -0.03626199, -0.01703815],
       [-0.0053145 , -0.03429687, -0.01151506, ..., -0.02154231,
         0.00887097,  0.01017942],
       [ 0.0277106 , -0.03337168, -0.03470176, ...,  0.0040071 ,
        -0.00153114, -0.0290713 ]], dtype=float32)

In [None]:
# click 
# zoom in

# wants another query

In [3]:
from openai import OpenAI



In [63]:
import os


zsh:1: bad assignment


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [42]:
client = OpenAI()

# Generate OpenAI embeddings
def get_openai_embeddings(texts):
    responses = []
    for text in texts:
        if len(text) > 8192:
            text = text[:8192]
        response = client.embeddings.create(input = [text], model="text-embedding-3-small").data[0].embedding
        print(response)
        responses.append(response)
    return responses

openai_embeddings = get_openai_embeddings(processed_documents)

# Apply UMAP to OpenAI embeddings
umap_openai = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='cosine')
openai_umap_embeddings = umap_openai.fit_transform(openai_embeddings)

# Combine titles, texts, and types for hover information
hover_texts = [f"Title: {entry[0]}" for key in data for entry in data[key]]
document_types = [entry[2] for key in data for entry in data[key]]

# Create interactive plot for OpenAI embeddings
fig_openai = px.scatter(
    x=openai_umap_embeddings[:, 0], y=openai_umap_embeddings[:, 1],
    color=document_types, hover_data={'text': hover_texts},
    labels={'x': 'UMAP 1', 'y': 'UMAP 2'},
    title='UMAP Projection of OpenAI Embeddings'
)

# Update hover data
fig_openai.update_traces(marker=dict(size=5),
                         selector=dict(mode='markers+text'))

fig_openai.show()

[0.00904062669724226, -0.04696352034807205, 0.01869291253387928, -0.009890982881188393, 0.005590717773884535, -0.033268313854932785, 0.0162313561886549, 0.05824192613363266, -0.013456510379910469, 0.010912901721894741, 0.04517329856753349, 0.0037352261133491993, -0.006758092436939478, -0.032999780029058456, 0.007164622191339731, 0.03100069798529148, -0.03774387016892433, -0.025630028918385506, -0.005635473411530256, 0.032910268753767014, -0.03189580887556076, 0.06188204512000084, 0.04502411186695099, 0.03365619480609894, -0.011091924272477627, 0.023735376074910164, 0.020184766501188278, 0.015395918861031532, 0.050394780933856964, 0.003214942291378975, 0.0784117728471756, -0.02946408838033676, -0.020184766501188278, -0.03884784132242203, -0.04773928225040436, 0.020766587927937508, -0.027748458087444305, -0.021557269617915154, -0.007429426070302725, 0.0162313561886549, 0.005575799383223057, -0.01742483861744404, -0.0011571182403713465, 0.03461097925901413, -0.014187518507242203, -0.05114

In [36]:
len(openai_embeddings[50])

1536

In [33]:
len(sentence_embeddings[1])

768

In [37]:
# Apply UMAP to SentenceTransformers embeddings
umap_sentence = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='cosine')
openai_umap_embeddings = umap_sentence.fit_transform(openai_embeddings)

# Combine titles, texts, and types for hover information
hover_texts = [f"Title: {entry[0]}" for key in data for entry in data[key]]
document_types = [entry[2] for key in data for entry in data[key]]

# Create interactive plot for SentenceTransformers embeddings
fig_sentence = px.scatter(
    x=openai_umap_embeddings[:, 0], y=openai_umap_embeddings [:, 1],
    color=document_types, hover_data={'text': hover_texts},
    labels={'x': 'UMAP 1', 'y': 'UMAP 2'},
    title='UMAP Projection of SentenceTransformers Embeddings'
)

# Update hover data
fig_sentence.update_traces(marker=dict(size=5),
                           selector=dict(mode='markers+text'))

fig_sentence.show()

ValueError: All arguments should have the same length. The length of hover_data key `text` is 85, whereas the length of previously-processed arguments ['x', 'y'] is 75