In [None]:
#!pip install bertopic datasets -q
#!pip install umap-learn altair annoy   -q

import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import umap
import altair as alt

def load_and_preprocess_data(dataset_name="Intel/orca_dpo_pairs"):
    """
    Load the "train" split of a dataset and build the clustering frame.

    Args:
        dataset_name: Identifier passed to ``load_dataset``.

    Returns:
        A DataFrame with two columns, or ``None`` if loading failed (the
        error is printed rather than raised):
          * ``title`` - the question, cut to its first 4000 characters
          * ``text``  - the question and the chosen answer joined by a space
    """
    try:
        train_split = load_dataset(dataset_name)["train"]
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

    questions = train_split["question"]
    chosen_answers = train_split["chosen"]

    # Full document text: question followed by its chosen answer.
    combined_texts = []
    for question, answer in zip(questions, chosen_answers):
        combined_texts.append(question + " " + answer)

    # Only the titles are shortened (character-based cut); the combined
    # text above is kept at full length.
    titles = [question[:4000] for question in questions]

    columns = {'title': titles, 'text': combined_texts}
    return pd.DataFrame(columns)

def compute_embeddings(docs):
    """
    Compute sentence embeddings for the given documents.

    Args:
        docs: A single string, or an iterable of strings (list, tuple,
            pandas Series, ...).

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the input, using
        the "BAAI/bge-small-en" model.
    """
    # encode() documents str / list-of-str input. Other iterables are
    # materialised into a plain list first: a pandas Series, for instance,
    # resolves integer subscripts by label, so a filtered or re-indexed
    # Series is not safely positionally indexable.
    if not isinstance(docs, str):
        docs = list(docs)

    embedding_model = SentenceTransformer("BAAI/bge-small-en")
    embeddings = embedding_model.encode(docs, show_progress_bar=True)
    return embeddings

def create_umap_embeddings(embeddings, n_neighbors=15):
    """
    Project embeddings into a lower-dimensional space with UMAP.

    Args:
        embeddings: Input vectors handed to ``UMAP.fit_transform``.
        n_neighbors: Neighbourhood size forwarded to UMAP.

    Returns:
        The transformed embeddings. The seed is fixed (``random_state=42``)
        so repeated runs use the same random state.
    """
    projector = umap.UMAP(random_state=42, n_neighbors=n_neighbors)
    return projector.fit_transform(embeddings)

def cluster_data(embeddings, n_clusters=30):
    """
    Partition the embeddings with KMeans and return one label per row.

    Args:
        embeddings: Vectors to cluster.
        n_clusters: Number of clusters to form.

    Returns:
        The cluster labels produced by ``KMeans.fit_predict``; the model is
        seeded with ``random_state=42``.
    """
    return KMeans(random_state=42, n_clusters=n_clusters).fit_predict(embeddings)

def extract_keywords(df, top_n_words=10):
    """
    Extract and assign keywords for each cluster using TF-IDF.

    All texts belonging to a cluster are joined into one "cluster document";
    TF-IDF is then computed across those cluster documents, so a word scores
    highly when it is frequent inside a cluster and rare in the others.

    Args:
        df: DataFrame with a ``text`` column (strings) and a ``cluster``
            column (cluster label per row).
        top_n_words: Maximum number of keywords to keep per cluster.

    Returns:
        The same DataFrame (modified in place) with an added ``keywords``
        column: a comma-separated string, highest-scoring word first, that
        is identical for every row of a given cluster.
    """
    # One aggregated document per cluster. Scoring individual rows instead
    # would key the result by row position rather than by cluster label,
    # and would require a dense documents x vocabulary matrix.
    cluster_docs = df.groupby('cluster')['text'].agg(" ".join)

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(cluster_docs)
    words = tfidf_vectorizer.get_feature_names_out()

    keywords_per_cluster = {}
    for row, cluster_id in enumerate(cluster_docs.index):
        # Densify one cluster row at a time to keep memory bounded.
        scores = tfidf[row].toarray().ravel()
        ranked = scores.argsort()[::-1][:top_n_words]
        # Skip zero-score entries so small clusters are not padded with
        # words that never occur in them.
        keywords_per_cluster[cluster_id] = ", ".join(
            words[i] for i in ranked if scores[i] > 0
        )

    df['keywords'] = df['cluster'].map(keywords_per_cluster)

    return df

def visualize_clusters(df):
    """
    Build an interactive Altair scatter plot of the clustered documents.

    Args:
        df: DataFrame providing the fields referenced by the encodings
            below: ``x``, ``y``, ``keywords``, ``title`` and ``cluster``.

    Returns:
        The interactive chart; points are coloured by their keyword string
        and show title, keywords and cluster in the tooltip.
    """
    point_style = dict(size=60, stroke='#666', strokeWidth=1, opacity=0.3)
    encodings = dict(
        x='x:Q',
        y='y:Q',
        color='keywords:N',
        tooltip=['title', 'keywords', 'cluster'],
    )
    layout = dict(
        width=800,
        height=500,
        title='Intel Orca DPO Dataset Clusters',
    )

    scatter = alt.Chart(df).mark_circle(**point_style)
    scatter = scatter.encode(**encodings)
    scatter = scatter.properties(**layout)
    return scatter.interactive()



In [None]:
# Pipeline entry point: load -> embed -> project -> cluster -> label -> plot
if __name__ == "__main__":
    df = load_and_preprocess_data()
    if df is None:
        # The loader already printed the underlying error.
        print("Failed to load or preprocess data.")
    else:
        embeddings = compute_embeddings(df['text'])

        # Projected coordinates, used only for plotting.
        umap_embeddings = create_umap_embeddings(embeddings)
        df['x'] = umap_embeddings[:, 0]
        df['y'] = umap_embeddings[:, 1]

        # Clustering runs on the original embeddings, not the projection.
        clusters = cluster_data(embeddings)
        df['cluster'] = clusters

        df = extract_keywords(df)
        chart = visualize_clusters(df)
        chart.display()