# clustering the dataset

## loading the dataset

In [41]:
%pip install orjson

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [42]:
import orjson

dataset = []
with open("dataset.jsonl", "r") as f:
  for line in f:
    dataset.append(orjson.loads(line))

queries = [data['query'] for data in dataset]
codes = [data['code'] for data in dataset]

### loading models

In [23]:
%pip install bertopic sentence-transformers transformers umap-learn hdbscan

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

query_model = SentenceTransformer("all-MiniLM-L6-v2")
code_model  = SentenceTransformer("microsoft/codebert-base")

query_embeddings = query_model.encode(queries, show_progress_bar=True)
code_embeddings  = code_model.encode(codes, show_progress_bar=True)

embeddings = np.hstack((query_embeddings, code_embeddings))

No sentence-transformers model found with name microsoft/codebert-base. Creating a new one with mean pooling.


Batches:   0%|          | 0/36 [00:00<?, ?it/s]

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

In [70]:
import pandas as pd
from bertopic import BERTopic
texts = [f"{q} {c}" for q, c in zip(queries, codes)]
from umap import UMAP
umap_model = UMAP(n_neighbors=15, n_components=5, metric='cosine', random_state=42, verbose=True)
from hdbscan import HDBSCAN
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', prediction_data=True)
variable_topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, calculate_probabilities=True, verbose=True)
variable_topics, variable_probs = variable_topic_model.fit_transform(texts, embeddings)

2025-05-26 13:49:12,637 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


UMAP(angular_rp_forest=True, metric='cosine', n_components=5, n_jobs=1, random_state=42, verbose=True)
Mon May 26 13:49:12 2025 Construct fuzzy simplicial set
Mon May 26 13:49:13 2025 Finding Nearest Neighbors
Mon May 26 13:49:13 2025 Finished Nearest Neighbor Search
Mon May 26 13:49:13 2025 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Mon May 26 13:49:15 2025 Finished embedding


2025-05-26 13:49:15,444 - BERTopic - Dimensionality - Completed ✓
2025-05-26 13:49:15,445 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-26 13:49:15,520 - BERTopic - Cluster - Completed ✓
2025-05-26 13:49:15,524 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-26 13:49:15,576 - BERTopic - Representation - Completed ✓


In [71]:
import pandas as pd
variable_info = pd.DataFrame(variable_topic_model.get_topic_info())
print(variable_info['Topic'].unique())
variable_info.head()

[-1  0  1  2  3  4  5  6  7  8  9 10 11]


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,34,-1_celsius_fahrenheit_simplehttprequesthandler_32,"[celsius, fahrenheit, simplehttprequesthandler...",[This function converts Celsius to Fahrenheit....
1,0,553,0_return_def_if_this,"[return, def, if, this, code, import, in, to, ...",[Write a function that checks if a number is p...
2,1,91,1_test_assert_function_that,"[test, assert, function, that, unit, returns, ...",[Test a function that checks if a number is ev...
3,2,89,2_numbers_list_import_of,"[numbers, list, import, of, numpy, this, code,...",[This code calculates the mean of a list of nu...
4,3,73,3_os_hostname_the_import,"[os, hostname, the, import, newnametxt, direct...",[Get the current working directory. import os\...


In [79]:
variable_topic_model.visualize_topics()

In [80]:
from typing import List

variable_all_topics: List[int] = list(map(int, list(variable_topic_model.get_topics().keys())))
if -1 in variable_all_topics:
  variable_all_topics.remove(-1)

In [125]:
variable_topic_model.visualize_barchart(topics=variable_all_topics)

In [84]:
from sklearn.cluster import KMeans
from bertopic import BERTopic
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
embeddings = pca.fit_transform(embeddings)
kmeans_model = KMeans(n_clusters=12, random_state=42)
labels = kmeans_model.fit_predict(embeddings)
constant_topic_model = BERTopic(calculate_probabilities=True, verbose=True, nr_topics=13, min_topic_size=15)
constant_topic_model.fit(texts, embeddings, y=labels)

2025-05-26 13:50:53,812 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


2025-05-26 13:50:55,710 - BERTopic - Dimensionality - Completed ✓
2025-05-26 13:50:55,712 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-26 13:50:55,816 - BERTopic - Cluster - Completed ✓
2025-05-26 13:50:55,817 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-26 13:50:55,889 - BERTopic - Representation - Completed ✓
2025-05-26 13:50:55,890 - BERTopic - Topic reduction - Reducing number of topics
2025-05-26 13:50:55,898 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-26 13:50:55,945 - BERTopic - Representation - Completed ✓
2025-05-26 13:50:55,947 - BERTopic - Topic reduction - Reduced number of topics from 22 to 13


<bertopic._bertopic.BERTopic at 0x786c0260a9e0>

In [85]:
import pandas as pd
info = pd.DataFrame(constant_topic_model.get_topic_info())
print(info['Topic'].unique())
info.head()

[-1  0  1  2  3  4  5  6  7  8  9 10 11]


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,48,-1_celsius_fahrenheit_httpserver_to,"[celsius, fahrenheit, httpserver, to, 32, usin...",[This function converts Celsius to Fahrenheit....
1,0,201,0_class_def_httpserver_selfendheaders,"[class, def, httpserver, selfendheaders, selfs...",[This Python code creates a simple web server ...
2,1,149,1_image_the_of_code,"[image, the, of, code, return, this, an, def, ...",[Calculate the standard deviation of a list of...
3,2,114,2_assert_test_the_customers,"[assert, test, the, customers, function, selec...",[Test a function that checks if a number is ev...
4,3,101,3_return_if_number_def,"[return, if, number, def, function, else, clas...",[Write a function that checks if a number is p...


In [119]:
constant_topic_model.visualize_topics()

In [120]:
constant_all_topics = list(map(int,list(constant_topic_model.get_topics().keys())))
if -1 in constant_all_topics:
  constant_all_topics.remove(-1)

In [122]:
constant_topic_model.visualize_barchart(top_n_topics=None, topics=constant_all_topics)