In [1]:
import json

# Load the JSON file.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform's default text encoding.
with open('embeddings.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract the embeddings list (an empty list when the key is absent)
embeddings = data.get('embeddings', [])

# Display the first few items to verify the data
for i, item in enumerate(embeddings[:5]):
    print(f"Item {i+1}:")
    print(f"  Title: {item['title']}")
    print(f"  Type: {item['type']}")
    print(f"  Link: {item['link']}")
    print(f"  Embedding: {item['embedding'][:10]}...")  # Show only the first 10 values of the embedding for brevity
    print()


Item 1:
  Title: Generative Camera Dolly: Extreme Monocular Dynamic Novel View Synthesis
  Type: paper
  Link: http://arxiv.org/abs/2405.14868v1
  Embedding: [-0.608298659324646, -0.010770933702588081, 1.0775866508483887, -0.22171369194984436, -0.19240504503250122, -0.40922755002975464, -0.12438789010047913, -0.8952761888504028, 0.7575365304946899, -0.3842252492904663]...

Item 2:
  Title: Privileged Sensing Scaffolds Reinforcement Learning
  Type: paper
  Link: http://arxiv.org/abs/2405.14853v1
  Embedding: [-0.9499078989028931, -0.05496533587574959, 0.8581719398498535, -0.3367142081260681, -0.24041947722434998, -0.4854498505592346, 0.005895907990634441, -0.9775391817092896, 0.9539982080459595, -0.7237332463264465]...

Item 3:
  Title: Learning to Detect and Segment Mobile Objects from Unlabeled Videos
  Type: paper
  Link: http://arxiv.org/abs/2405.14841v1
  Embedding: [-1.0691289901733398, 0.10139864683151245, 0.4131864309310913, -0.3225151598453522, -0.06406674534082413, -0.1533077

In [2]:
import os
from sentence_transformers import SentenceTransformer
import umap
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [14]:
# Flatten every record found under each top-level key of `data` into four
# parallel lists: titles, types, links and embedding vectors.
documents, types, links, embeddings = [], [], [], []

for records in data.values():
    for record in records:
        # Read all four fields before appending anything, so the lists stay
        # the same length even if a record is missing a key.
        title, doc_type, link, embedding = (
            record[field] for field in ('title', 'type', 'link', 'embedding')
        )
        documents.append(title)
        types.append(doc_type)
        links.append(link)
        embeddings.append(embedding)



In [16]:
import umap
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Category10
import numpy as np

# Make sure Bokeh plots render inline in the notebook
output_notebook()

# Perform UMAP dimensionality reduction down to two plot coordinates.
# random_state is fixed so the layout is repeatable between runs.
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embedding_2d = umap_model.fit_transform(embeddings)

# Prepare data for Bokeh: one row per document
source = ColumnDataSource(data=dict(
    x=embedding_2d[:, 0],
    y=embedding_2d[:, 1],
    title=documents,
    doc_type=types,
    link=links
))

# Create a color map for document types.
# The distinct types are sorted (by their string form) so that each type gets
# the same color on every run; iterating a bare set of strings has no stable
# order across interpreter sessions.
doc_type_list = sorted(set(types), key=str)
color_map = {doc_type: Category10[10][i % 10] for i, doc_type in enumerate(doc_type_list)}
colors = [color_map[doc_type] for doc_type in types]

# Add colors to the data source
source.data['color'] = colors

# Create a Bokeh plot; the hover tool shows title, type and link per point
p = figure(title="UMAP projection of document embeddings", tools="pan,wheel_zoom,reset,hover,save",
           tooltips=[("Title", "@title"), ("Type", "@doc_type"), ("Link", "@link")])

p.scatter('x', 'y', color='color', legend_field='doc_type', source=source, alpha=0.6)

# Customize plot appearance
p.legend.title = 'Document Type'
p.legend.location = 'top_left'

# Show the plot
show(p)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [35]:
# Define the file paths and the document-type label attached to each source
file_details = {
    "articles": {"path": "streamlit-app/documents/fow/articles.json", "type": "article"},
    "hackernews": {"path": "streamlit-app/documents/fow/hackernews.json", "type": "hacker-news"},
    "papers": {"path": "streamlit-app/documents/fow/papers.json", "type": "paper"},
    "reddit": {"path": "streamlit-app/documents/fow/reddit.json", "type": "reddit"}
}

# Load the json data with type tracking.
# Each source becomes a list of (title, text, doc_type) tuples.
data = {}
for key, details in file_details.items():
    path = details["path"]
    doc_type = details["type"]
    # JSON text is UTF-8 by specification; state the encoding explicitly so
    # the read does not depend on the platform's default text encoding.
    with open(path, 'r', encoding='utf-8') as file:
        entries = json.load(file)
    data[key] = [(entry['title'], entry['text'], doc_type) for entry in entries]

# Combine titles and texts
documents = []
types = []

for key in data:
    for entry in data[key]:
        title, text, doc_type = entry
        # Only the first 175 characters of the body text are appended to the title.
        documents.append(title + " " + text[:175])
        types.append(doc_type)

# Preprocess the text
def preprocess_text(text):
    """Lowercase and tokenize `text`, then drop noise tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token is
    kept only when it is purely alphanumeric and is not an English stop word.

    Args:
        text: The raw document string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Build the stop-word set once per call. Calling stopwords.words() inside
    # the filter would re-evaluate the whole list for every token, and a set
    # gives constant-time membership checks.
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    # An alphanumeric token can never be a piece of string.punctuation, so the
    # isalnum() check also covers the punctuation filter.
    tokens = [word for word in tokens if word.isalnum() and word not in english_stopwords]
    return ' '.join(tokens)

processed_documents = [preprocess_text(doc) for doc in documents]

In [26]:
from sentence_transformers import SentenceTransformer


# Generate SentenceTransformers embeddings for the preprocessed documents
model = SentenceTransformer('all-mpnet-base-v2')
sentence_embeddings = model.encode(processed_documents)


# Perform UMAP dimensionality reduction down to two plot coordinates.
# random_state is fixed so the layout is repeatable between runs.
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embedding_2d = umap_model.fit_transform(sentence_embeddings)

# Prepare data for Bokeh: one row per document.
# No link column is included; this plot's tooltips only show title and type.
source = ColumnDataSource(data=dict(
    x=embedding_2d[:, 0],
    y=embedding_2d[:, 1],
    title=documents,
    doc_type=types,
))

# Create a color map for document types.
# The distinct types are sorted (by their string form) so that each type gets
# the same color on every run; iterating a bare set of strings has no stable
# order across interpreter sessions.
doc_type_list = sorted(set(types), key=str)
color_map = {doc_type: Category10[10][i % 10] for i, doc_type in enumerate(doc_type_list)}
colors = [color_map[doc_type] for doc_type in types]

# Add colors to the data source
source.data['color'] = colors

# Create a Bokeh plot
p = figure(title="UMAP projection of document embeddings", tools="pan,wheel_zoom,reset,hover,save",
           tooltips=[("Title", "@title"), ("Type", "@doc_type")])

p.scatter('x', 'y', color='color', legend_field='doc_type', source=source, alpha=0.6)

# Customize plot appearance
p.legend.title = 'Document Type'
p.legend.location = 'top_left'

# Show the plot
show(p)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [27]:
# TF-IDF Vectorizer fitted on the preprocessed documents
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_documents)

# Generate SentenceTransformers embeddings
model = SentenceTransformer('all-mpnet-base-v2')
sentence_embeddings = model.encode(processed_documents)

# Concatenate TF-IDF features with sentence embeddings.
# The sparse TF-IDF matrix is densified first so both parts can be joined
# column-wise into one feature row per document.
tfidf_features = tfidf_matrix.toarray()
combined_features = np.concatenate([tfidf_features, sentence_embeddings], axis=1)

# Perform UMAP dimensionality reduction down to two plot coordinates.
# random_state is fixed so the layout is repeatable between runs.
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embedding_2d = umap_model.fit_transform(combined_features)

# Prepare data for Bokeh: one row per document.
# No link column is included; this plot's tooltips only show title and type.
source = ColumnDataSource(data=dict(
    x=embedding_2d[:, 0],
    y=embedding_2d[:, 1],
    title=documents,
    doc_type=types,
))

# Create a color map for document types.
# The distinct types are sorted (by their string form) so that each type gets
# the same color on every run; iterating a bare set of strings has no stable
# order across interpreter sessions.
doc_type_list = sorted(set(types), key=str)
color_map = {doc_type: Category10[10][i % 10] for i, doc_type in enumerate(doc_type_list)}
colors = [color_map[doc_type] for doc_type in types]

# Add colors to the data source
source.data['color'] = colors

# Create a Bokeh plot
p = figure(title="UMAP projection of document embeddings", tools="pan,wheel_zoom,reset,hover,save",
           tooltips=[("Title", "@title"), ("Type", "@doc_type")])

p.scatter('x', 'y', color='color', legend_field='doc_type', source=source, alpha=0.6)

# Customize plot appearance
p.legend.title = 'Document Type'
p.legend.location = 'top_left'

# Show the plot
show(p)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [40]:
# Preprocess the text
def preprocess_text(text):
    """Lowercase and tokenize `text`, then drop noise tokens.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token is
    kept only when it is purely alphanumeric and is not an English stop word.

    Args:
        text: The raw document string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Build the stop-word set once per call. Calling stopwords.words() inside
    # the filter would re-evaluate the whole list for every token, and a set
    # gives constant-time membership checks.
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    # An alphanumeric token can never be a piece of string.punctuation, so the
    # isalnum() check also covers the punctuation filter.
    tokens = [word for word in tokens if word.isalnum() and word not in english_stopwords]
    return ' '.join(tokens)

processed_documents = [preprocess_text(doc) for doc in documents]

# Fit TF-IDF on the preprocessed documents
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_documents)

# For every vocabulary term, take its highest TF-IDF weight over all documents
tfidf_scores = tfidf_matrix.max(axis=0).toarray().ravel()

# Terms whose best weight stays below this cut-off are treated as
# uninformative. You can adjust this threshold.
low_tfidf_threshold = 0.1
feature_names = tfidf_vectorizer.get_feature_names_out()
low_tfidf_words = [
    feature_names[col]
    for col in range(min(len(feature_names), len(tfidf_scores)))
    if tfidf_scores[col] < low_tfidf_threshold
]

# Extend the English stop-word list with the low tf-idf words
custom_stopwords = set(stopwords.words('english')) | set(low_tfidf_words)

# Updated preprocessing function
def preprocess_text_with_custom_stopwords(text):
    """Tokenize lowercased `text` and filter it against `custom_stopwords`.

    A token survives when it is not found in ``string.punctuation``, is purely
    alphanumeric, and is absent from the module-level ``custom_stopwords``
    set. The survivors are returned joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token in string.punctuation:
            continue
        if not token.isalnum():
            continue
        if token in custom_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

# Re-process documents with the updated stopwords list
processed_documents = list(map(preprocess_text_with_custom_stopwords, documents))

# Generate SentenceTransformers embeddings for the re-processed documents
model = SentenceTransformer('all-MiniLM-L6-v2')
sentence_embeddings = model.encode(processed_documents)

# Perform UMAP dimensionality reduction.
# Only two coordinates are plotted below, so the projection is fitted with
# n_components=2. Fitting four components and plotting just the first two
# would discard half of the layout and not show a true 2-D embedding.
umap_model = umap.UMAP(n_neighbors=4, min_dist=0.1, n_components=2, random_state=42)
embedding_2d = umap_model.fit_transform(sentence_embeddings)

# Prepare data for Bokeh: one row per document.
# No link column is included; this plot's tooltips only show title and type.
source = ColumnDataSource(data=dict(
    x=embedding_2d[:, 0],
    y=embedding_2d[:, 1],
    title=documents,
    doc_type=types,
))

# Create a color map for document types.
# The distinct types are sorted (by their string form) so that each type gets
# the same color on every run; iterating a bare set of strings has no stable
# order across interpreter sessions.
doc_type_list = sorted(set(types), key=str)
color_map = {doc_type: Category10[10][i % 10] for i, doc_type in enumerate(doc_type_list)}
colors = [color_map[doc_type] for doc_type in types]

# Add colors to the data source
source.data['color'] = colors

# Create a Bokeh plot
p = figure(title="UMAP projection of document embeddings", tools="pan,wheel_zoom,reset,hover,save",
           tooltips=[("Title", "@title"), ("Type", "@doc_type")])

p.scatter('x', 'y', color='color', legend_field='doc_type', source=source, alpha=0.6)

# Customize plot appearance
p.legend.title = 'Document Type'
p.legend.location = 'top_left'

# Show the plot
show(p)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

