# For Cluster: copying the arXiv metadata snapshot from the cluster

In [None]:
scp -J yaron.otmazgin@bava.cs.huji.ac.il \
    yaron.otmazgin@moriah-gw.cs.huji.ac.il:/sci/labs/orzuk/orzuk/teaching/big_data_project_52017/2024_25/arxiv_data/arxiv-metadata-oai-snapshot.json.zip \
    ~/Downloads/

SyntaxError: invalid syntax (ipython-input-1-4052150592.py, line 1)

# Imports

In [None]:
# relevant libraries for reading the JSON and converting to pandas

from google.colab import drive
drive.mount('/content/drive')
import warnings
warnings.filterwarnings('ignore')
import json
import pandas as pd

# relevant libraries for the filter model

import numpy as np
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords


MessageError: Error: credential propagation was unsuccessful

# Reading the JSON and converting to DF

In [None]:

# Read the JSON lines file (one JSON object per line).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale, and blank lines are skipped because
# json.loads('') raises JSONDecodeError.
with open('/content/drive/My Drive/arxiv_sample.json', 'r', encoding='utf-8') as f:
    articles = [json.loads(line) for line in f if line.strip()]

# convert from JSON to dataframe: one row per parsed record
df = pd.DataFrame(articles)


In [None]:
# Show the first parsed record as indented JSON to inspect its raw fields.
# articles[0] raises IndexError if no records were loaded.
print(json.dumps(articles[0], indent=2))  # Pretty print first article

In [None]:
# Preview the first rows of the DataFrame built from the parsed records
df.head()

# Filter Phase

## Clustering

In [None]:


# 1. Document Embedding
def create_embeddings(abstracts):
    """Encode the abstracts with the pre-trained 'all-MiniLM-L6-v2' model.

    A new SentenceTransformer is instantiated on every call.

    Args:
        abstracts: the texts to embed (the pipeline passes a list of strings).

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``abstracts``.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder.encode(abstracts)

# 2. Dimension Reduction
def reduce_dimensions(embeddings, n_components=5):
    """Project the embeddings to ``n_components`` dimensions with UMAP.

    Args:
        embeddings: the document embeddings to reduce.
        n_components: target dimensionality (default 5).

    Returns:
        The result of ``UMAP.fit_transform`` on ``embeddings``.
    """
    # Fixed neighbourhood settings; random_state is pinned to 42.
    umap_settings = dict(
        n_components=n_components,
        n_neighbors=15,
        min_dist=0.1,
        random_state=42,
    )
    reducer = UMAP(**umap_settings)
    return reducer.fit_transform(embeddings)

# 3. Document Clustering
def cluster_documents(reduced_embeddings):
    """Cluster the reduced embeddings with HDBSCAN.

    Uses ``min_cluster_size=10``, ``min_samples=2`` and the euclidean metric.

    Args:
        reduced_embeddings: the points to cluster.

    Returns:
        The labels produced by ``HDBSCAN.fit_predict``.
    """
    hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=2, metric='euclidean')
    labels = hdbscan_model.fit_predict(reduced_embeddings)
    return labels

# 4. Simple function to get clusters and their documents
def get_cluster_documents(df, clusters):
    """Map every non-noise cluster ID to the index labels of its documents.

    Side effect: the labels are written to ``df['cluster']`` in place.

    Args:
        df: DataFrame of documents; receives a 'cluster' column.
        clusters: one cluster label per row of ``df``; -1 marks noise.

    Returns:
        dict of cluster ID -> list of ``df`` index labels in that cluster.
        The noise label -1 is never a key.
    """
    df['cluster'] = clusters
    return {
        label: df.index[(df['cluster'] == label).to_numpy()].tolist()
        for label in np.unique(clusters)
        if label != -1  # HDBSCAN assigns -1 to noise points
    }

# Main function to run the pipeline
def semantic_prefilter(df):
    """Run embed -> reduce -> cluster -> group on the 'abstract' column.

    Args:
        df: DataFrame with an 'abstract' column. It is passed on to
            get_cluster_documents, which adds a 'cluster' column in place.

    Returns:
        Tuple ``(cluster_docs, embeddings, clusters)``: the cluster-ID ->
        document-index mapping, the embeddings from create_embeddings (not
        the reduced ones), and the labels from cluster_documents.
    """
    # Steps 1-3: embed the abstracts, reduce dimensions, then cluster.
    embeddings = create_embeddings(df['abstract'].tolist())
    clusters = cluster_documents(reduce_dimensions(embeddings))

    # Step 4: group document indices by cluster.
    cluster_docs = get_cluster_documents(df, clusters)
    return cluster_docs, embeddings, clusters

# Run the pipeline on the full DataFrame. The three results are kept as
# notebook-level variables because the later visualization cells read them.
cluster_docs, embeddings, clusters = semantic_prefilter(df)

# Print some basic statistics: how many cluster IDs are in cluster_docs and
# how many document indices are stored under each of them
print(f"Number of clusters found: {len(cluster_docs)}")
print(f"Distribution of documents across clusters: {[len(docs) for docs in cluster_docs.values()]}")

NameError: name 'df' is not defined

## Visualizing clusters and common words

In [None]:
def visualize_clusters(embeddings, clusters):
    """Scatter-plot the documents in 2D, coloured by cluster label.

    The embeddings are projected to two dimensions with a fresh UMAP fit,
    one scatter series is drawn per label, and noise (label -1) is drawn in
    gray with lower opacity.

    Args:
        embeddings: document embeddings to project and plot.
        clusters: one cluster label per embedding; -1 marks noise.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    # First reduce to 2D for visualization regardless of previous reduction
    umap_2d = UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
    vis_embeddings = umap_2d.fit_transform(embeddings)

    # Create a DataFrame for easier plotting
    vis_df = pd.DataFrame({
        'x': vis_embeddings[:, 0],
        'y': vis_embeddings[:, 1],
        'cluster': clusters
    })

    # Set up plot size
    plt.figure(figsize=(20, 16))

    # Create a color palette that handles noise points (-1) separately.
    # Colours are keyed by the actual label rather than by position, so the
    # lookup below also works when the labels are not exactly 0..k-1.
    unique_clusters = np.unique(clusters)
    real_clusters = [c for c in unique_clusters if c != -1]
    palette = sns.color_palette("hsv", len(real_clusters))
    colors = dict(zip(real_clusters, palette))
    colors[-1] = (0.5, 0.5, 0.5)  # Gray for noise points

    # Plot each cluster
    for cluster_id in unique_clusters:
        is_noise = cluster_id == -1
        cluster_data = vis_df[vis_df['cluster'] == cluster_id]
        plt.scatter(
            cluster_data['x'],
            cluster_data['y'],
            c=[colors[cluster_id]] * len(cluster_data),
            label='Noise' if is_noise else f'Cluster {cluster_id}',
            alpha=0.3 if is_noise else 0.7,
            s=50
        )

    # Add labels and title
    plt.title('Document Clusters Visualization', fontsize=16)
    plt.xlabel('UMAP Dimension 1', fontsize=12)
    plt.ylabel('UMAP Dimension 2', fontsize=12)

    # Place the legend at the bottom of the plot
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1),
               ncol=min(13, len(unique_clusters)),  # Adjust number of columns based on cluster count
               frameon=True, fancybox=True, shadow=True)

    # Adjust layout to make room for the legend at the bottom
    plt.tight_layout(rect=[0, 0.1, 1, 0.95])  # [left, bottom, right, top]

    return plt

def visualize_cluster_sample(df, embeddings, clusters, n_samples=3):
    """Visualize clusters and display sample titles from each cluster.

    Rows are selected with the ``clusters`` argument itself (positionally),
    so the function does not require a pre-existing ``df['cluster']`` column
    and the printed samples always match the labels used for the plot.

    Args:
        df: DataFrame with a 'title' column, one row per entry of ``clusters``.
        embeddings: document embeddings, forwarded to visualize_clusters.
        clusters: one cluster label per row of ``df``; -1 marks noise.
        n_samples: maximum number of titles printed per cluster.
    """
    plot = visualize_clusters(embeddings, clusters)

    # An array is needed so that `labels == cluster_id` yields a boolean mask.
    labels = np.asarray(clusters)

    # Display sample documents from each cluster
    print("Sample documents from each cluster:")
    for cluster_id in np.unique(labels):
        if cluster_id == -1:
            continue  # Skip noise points

        cluster_docs = df[labels == cluster_id]
        # The sample is random (no seed), capped at the cluster size.
        sample_docs = cluster_docs.sample(min(n_samples, len(cluster_docs)))

        print(f"\nCluster {cluster_id} ({len(cluster_docs)} documents):")
        for title in sample_docs['title']:
            print(f"  - {title}")

    plot.show()

# Plot the clusters and print sample titles per cluster, using the embeddings
# and labels produced by the pipeline cell
visualize_cluster_sample(df, embeddings, clusters)


In [None]:

# Download the NLTK resources without progress output.
# 'stopwords' backs stopwords.words('english') in get_top_words_per_cluster.
# NOTE(review): 'punkt' does not appear to be used by the code in this
# notebook (words are split with str.split) -- confirm before removing.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

def get_top_words_per_cluster(df, clusters, n_words=3):
    """Return the most frequent abstract words of every non-noise cluster.

    For each cluster the abstracts are concatenated, lower-cased, stripped
    of every character that is not a letter or whitespace, and split on
    whitespace. English stopwords, a few generic academic terms and words
    of three characters or fewer are not counted.

    Args:
        df: DataFrame with an 'abstract' column (missing values count as '').
        clusters: cluster labels used as a boolean row mask on ``df``;
            -1 marks noise and is skipped.
        n_words: number of top words to keep per cluster.

    Returns:
        dict of cluster ID -> list of up to ``n_words`` words, most common first.
    """
    # English stopwords plus generic terms common in academic abstracts.
    ignored = set(stopwords.words('english'))
    ignored |= {'using', 'paper', 'approach', 'method', 'model', 'propose', 'based', 'results', 'data', 'proposed'}

    top_by_cluster = {}
    for label in np.unique(clusters):
        if label == -1:  # noise points get no entry
            continue

        members = df[clusters == label]

        # One lower-cased blob per cluster, letters and whitespace only.
        blob = ' '.join(members['abstract'].fillna('')).lower()
        blob = re.sub(r'[^a-zA-Z\s]', '', blob)

        tally = Counter(
            token for token in blob.split()
            if len(token) > 3 and token not in ignored
        )
        top_by_cluster[label] = [token for token, _ in tally.most_common(n_words)]

    return top_by_cluster

def visualize_cluster_words(df, embeddings, clusters, n_words=3):
    """Plot each cluster's most frequent abstract words at the cluster centre.

    The embeddings are projected to 2D with a fresh UMAP fit. Every non-noise
    cluster is drawn as a faint scatter, and its top ``n_words`` words (from
    get_top_words_per_cluster) are written inside a circle at the mean
    position of its points. Noise points (label -1) are not drawn.

    Args:
        df: DataFrame with an 'abstract' column, one row per label.
        embeddings: document embeddings to project.
        clusters: one cluster label per document; -1 marks noise.
        n_words: number of words shown per cluster.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    # The boolean masks below need an array; comparing a plain list to a
    # scalar would produce a single bool instead of a per-document mask.
    labels = np.asarray(clusters)

    # First reduce to 2D for visualization
    umap_2d = UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
    vis_embeddings = umap_2d.fit_transform(embeddings)

    # Get top words for each cluster
    top_words = get_top_words_per_cluster(df, labels, n_words)

    # Non-noise labels in sorted order (np.unique sorts its output)
    real_clusters = [c for c in np.unique(labels) if c != -1]

    # Calculate cluster centers as the mean 2D position of each cluster
    cluster_centers = {}
    for cluster_id in real_clusters:
        cluster_points = vis_embeddings[labels == cluster_id]
        cluster_centers[cluster_id] = (
            np.mean(cluster_points[:, 0]),
            np.mean(cluster_points[:, 1]),
        )

    # Set up plot size - make it larger
    plt.figure(figsize=(20, 16))

    # Create a color palette keyed by the actual label rather than by
    # position, so labels that are not exactly 0..k-1 do not raise KeyError.
    palette = sns.color_palette("hsv", len(real_clusters))
    colors = dict(zip(real_clusters, palette))

    # First plot a faint scatter of all points to show distribution
    for cluster_id in real_clusters:
        cluster_points = vis_embeddings[labels == cluster_id]
        plt.scatter(
            cluster_points[:, 0],
            cluster_points[:, 1],
            c=[colors[cluster_id]],
            alpha=0.1,  # Very faint
            s=10
        )

    # Now plot the words in circles
    for cluster_id, center in cluster_centers.items():
        if cluster_id not in top_words:
            continue

        cx, cy = center

        # Draw a circle behind the cluster's words. facecolor/edgecolor are
        # given separately: passing `color=` together with `edgecolor=` makes
        # matplotlib warn and override the edge colour.
        circle = plt.Circle((cx, cy), 0.35, fill=True, alpha=0.2,
                            facecolor=colors[cluster_id],
                            edgecolor=colors[cluster_id], linewidth=2)
        plt.gca().add_patch(circle)

        # Join the top words and display them in the center
        word_text = '\n'.join(top_words[cluster_id])

        # Add the words
        plt.text(cx, cy, word_text,
                 fontsize=12,
                 ha='center',
                 va='center',
                 color=colors[cluster_id],
                 weight='bold',
                 bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.3'))

    # Add labels and title
    plt.title('Cluster Topic Words Visualization', fontsize=20)
    plt.xlabel('UMAP Dimension 1', fontsize=14)
    plt.ylabel('UMAP Dimension 2', fontsize=14)

    # Add a legend for cluster colors
    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='w',
                   markerfacecolor=colors[cluster_id], markersize=10,
                   label=f'Cluster {cluster_id}')
        for cluster_id in sorted(cluster_centers.keys())
    ]

    plt.legend(handles=legend_elements, loc='upper center',
               bbox_to_anchor=(0.5, -0.05), ncol=min(13, len(cluster_centers)))

    # Remove axis ticks for cleaner look
    plt.xticks([])
    plt.yticks([])

    # Tight layout
    plt.tight_layout()

    return plt

# Build the topic-word figure. visualize_cluster_words returns the pyplot
# module, so plot.show() displays the figure it just drew.
plot = visualize_cluster_words(df, embeddings, clusters)
plot.show()

In [None]:
# Echo the cluster labels returned by the pipeline (notebook displays the
# value of the cell's last expression)
clusters

In [None]:
# prompt: how do i know the number of samples in the json

# Each parsed line of the JSON-lines file is one entry of `articles`, so its
# length is the number of samples that were loaded.
print(f"Number of samples in the JSON: {len(articles)}")