In [55]:
from sweepai.config.client import SweepConfig
from sweepai.core.lexical_search import prepare_lexical_search_index
from sweepai.utils.github_utils import ClonedRepo, MockClonedRepo


# Arguments handed to the mock repository wrapper.
# NOTE(review): presumably the local checkout directory and the GitHub
# "owner/name" identifier - confirm against MockClonedRepo's signature.
REPO_DIR = "/tmp/sweep"
REPO_NAME = "sweepai/sweep"

cloned_repo = MockClonedRepo(REPO_DIR, REPO_NAME)

# Build the lexical search index for the checkout. The call returns a triple;
# only the snippets and the index are kept, the first element is discarded.
_, snippets, lexical_index = prepare_lexical_search_index(cloned_repo.cached_dir, SweepConfig())

In [56]:
from sweepai.core.lexical_search import SNIPPET_FORMAT
from sweepai.core.vector_db import embed_text_array

def _render_snippet(snippet):
    """Render one snippet through SNIPPET_FORMAT (file path + contents).

    The contents are requested with ``add_ellipsis`` and ``add_lines`` disabled.
    """
    return SNIPPET_FORMAT.format(
        file_path=snippet.file_path,
        contents=snippet.get_snippet(add_ellipsis=False, add_lines=False),
    )


# One formatted string per snippet, in the same order as `snippets`.
snippet_formats = list(map(_render_snippet, snippets))

embeddings = embed_text_array(snippet_formats)

In [57]:
import numpy as np

# Stack the sequence of arrays returned by embed_text_array along axis 0 so
# that `embeddings` becomes a single array.
# The guard keeps this cell idempotent: re-running it on the already-stacked
# 2-D array would otherwise concatenate its rows into one flat 1-D vector.
if not (isinstance(embeddings, np.ndarray) and embeddings.ndim == 2):
    embeddings = np.concatenate(embeddings, axis=0)

In [58]:
# Cell output: the embeddings array together with its shape (the bare tuple
# expression on the last line is what the notebook renders).
embeddings, embeddings.shape

In [59]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Clustering: assign every embedding to one of 10 k-means clusters.
kmeans = KMeans(n_clusters=10, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# Dimensionality reduction: project the embeddings to 2-D for plotting.
tsne = TSNE(n_components=2, random_state=42)
reduced_embeddings = tsne.fit_transform(embeddings)

# Visualization: one point per embedding, coloured by its cluster label.
fig, ax = plt.subplots(figsize=(10, 8))
x_coords, y_coords = reduced_embeddings[:, 0], reduced_embeddings[:, 1]
scatter = ax.scatter(x_coords, y_coords, c=clusters, cmap='viridis', alpha=0.6)
fig.colorbar(scatter, ax=ax)
ax.set_title('t-SNE visualization of Embeddings Clustered by K-Means')
plt.show()

In [60]:
# Map each cluster label to the positions of the embeddings assigned to it.
cluster_indices = {
    label: np.where(clusters == label)[0] for label in range(kmeans.n_clusters)
}

# For every cluster, list up to 20 distinct file paths (relative to the repo).
for cluster, indices in cluster_indices.items():
    print(f"Cluster {cluster}")
    shown_paths = set()
    for index in indices:
        if len(shown_paths) == 20:
            break
        path = snippets[index].file_path
        if path in shown_paths:
            # Several snippets can come from the same file; print it once.
            continue
        shown_paths.add(path)
        print(path.removeprefix(cloned_repo._repo_dir + "/"))
    print("\n")
    

In [71]:
import os

def count_files(directory):
    """Return the number of files found anywhere below ``directory``.

    The whole tree is traversed with ``os.walk``; a path that cannot be
    walked (e.g. it does not exist) yields no entries and therefore 0.
    """
    return sum(len(filenames) for _root, _subdirs, filenames in os.walk(directory))

def find_large_subdirs(directory, file_threshold):
    """Return the sub-directories of ``directory`` holding many files.

    A sub-directory qualifies when the number of files anywhere beneath it
    (recursively) is strictly greater than ``file_threshold``.

    Args:
        directory: Root of the tree to inspect. The root itself is never
            reported.
        file_threshold: Minimum (exclusive) recursive file count.

    Returns:
        List of matching directory paths in top-down ``os.walk`` order.
        Directories with a ``.git`` path component are not reported. An
        empty list is returned when nothing qualifies; the root being below
        the threshold is no longer an error.
    """
    # One top-down walk: remember each directory's children and own file count.
    walked = [(root, list(dirs), len(files)) for root, dirs, files in os.walk(directory)]

    # Parents precede their children in a top-down walk, so iterating in
    # reverse guarantees each child's total exists before its parent is summed.
    totals = {}
    for root, dirs, own_files in reversed(walked):
        totals[root] = own_files + sum(
            # Children that os.walk did not visit (e.g. symlinked directories)
            # contribute nothing.
            totals.get(os.path.join(root, child), 0)
            for child in dirs
        )

    # walked[0] is `directory` itself; skip it instead of removing it afterwards.
    return [
        root
        for root, _dirs, _own_files in walked[1:]
        if ".git" not in root.split(os.sep) and totals[root] > file_threshold
    ]


# Usage: specify the directory path and the file threshold.
directory_path = cloned_repo._repo_dir
file_threshold = 10
large_subdirectories = find_large_subdirs(directory_path, file_threshold)
# Report the threshold that was actually used (the message used to say 20).
print(f"Subdirectories with more than {file_threshold} files:", large_subdirectories)

In [74]:
large_subdirectory = large_subdirectories[0]
# Match on a whole path component: a bare startswith(large_subdirectory) would
# also accept sibling directories that merely share the prefix
# (e.g. ".../sweep" would match ".../sweepai/...").
subdir_prefix = large_subdirectory.rstrip("/") + "/"

snippets_in_subdir = []
embeddings_in_subdir = []

# Keep snippets and their embeddings aligned by position.
for i, snippet in enumerate(snippets):
    if snippet.file_path.startswith(subdir_prefix):
        snippets_in_subdir.append(snippet)
        embeddings_in_subdir.append(embeddings[i])

# Summarise instead of dumping every snippet and raw embedding vector.
print(f"{len(snippets_in_subdir)} snippets found under {large_subdirectory}")


In [82]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

def select_diverse_vectors(vectors, num_vectors=5):
    """Greedily pick mutually distant vectors using cosine distance.

    Starting from a randomly chosen vector, each step adds the vector whose
    smallest cosine distance to the already selected ones is largest
    (farthest-point selection). Ties go to the lowest index.

    Args:
        vectors: Sequence of equal-length numeric vectors (list of arrays or
            a 2-D array).
        num_vectors: Number of indices to return. Capped at ``len(vectors)``.

    Returns:
        List of distinct integer indices into ``vectors``. Empty when
        ``vectors`` is empty or ``num_vectors`` is not positive.
    """
    matrix = np.asarray(vectors, dtype=float)
    total = len(matrix)
    wanted = min(num_vectors, total)
    if wanted <= 0:
        return []

    # Normalise rows so a dot product equals the cosine similarity. All-zero
    # rows keep a divisor of 1 to avoid dividing by zero.
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = matrix / norms

    # Start with a random vector
    first = int(np.random.randint(total))
    selected_indices = [first]

    # nearest[i]: cosine distance from vector i to its closest selected vector.
    # Selected entries are set to -inf so argmax can never pick them again,
    # even when every remaining candidate is at distance 0 (duplicates).
    nearest = np.clip(1.0 - unit @ unit[first], 0.0, 2.0)
    nearest[first] = -np.inf

    for _ in range(1, wanted):
        # Select the vector that maximizes the minimum distance
        next_index = int(np.argmax(nearest))
        selected_indices.append(next_index)

        to_new = np.clip(1.0 - unit @ unit[next_index], 0.0, 2.0)
        nearest = np.minimum(nearest, to_new)
        nearest[next_index] = -np.inf

    return selected_indices

# Example usage: pick diverse snippets from the selected sub-directory.
# Seed NumPy's global RNG so the random starting vector (and therefore the
# selection) is reproducible, matching random_state=42 used for KMeans/TSNE.
np.random.seed(42)
indices = select_diverse_vectors(embeddings_in_subdir)
# The indices refer to positions in the *_in_subdir lists.
diverse_snippets = [snippets_in_subdir[i] for i in indices]
diverse_vectors = [embeddings_in_subdir[i] for i in indices]
print(diverse_snippets)

In [89]:
from sweepai.core.vector_db import cosine_similarity


def find_central_vectors(vectors, num_vectors=5):
    """Return indices of the vectors most similar to the mean vector.

    The mean of ``vectors`` along axis 0 is taken as the centroid; every
    vector is scored against it with ``cosine_similarity`` and the indices of
    the ``num_vectors`` highest scores are returned, best first.
    """
    mean_vector = np.mean(vectors, axis=0)

    # One similarity score per input vector, flattened to 1-D.
    scores = cosine_similarity(vectors, [mean_vector]).flatten()

    # Negating the scores makes argsort order them from highest to lowest.
    ranking = np.argsort(-scores)
    return ranking[:num_vectors]

# Example usage: rank the snippets of the selected sub-directory by centrality.
# The returned indices are positions within embeddings_in_subdir, so they must
# be resolved against the matching *_in_subdir lists, not the repo-wide
# `snippets` / `embeddings` (which would print unrelated snippets).
central_indices = find_central_vectors(np.array(embeddings_in_subdir))
central_snippets = [snippets_in_subdir[i] for i in central_indices]
central_vectors = [embeddings_in_subdir[i] for i in central_indices]
for snippet in central_snippets:
    print(snippet.denotation + "\n\n")
    print(snippet.get_snippet(False, False))

In [118]:
def count_descendants(directory):
    """Count the files beneath every sub-directory of ``directory``.

    Each file is counted exactly once for every ancestor directory that
    contains it. Directories named ``.git`` are neither descended into nor
    reported, and their files do not contribute to any total.

    Args:
        directory: Root of the tree to inspect (a trailing separator is fine).

    Returns:
        Dict mapping each sub-directory path, relative to ``directory``, to
        its recursive file count. The root itself is not included. Keys are
        inserted in top-down walk order.
    """
    walked = []
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into version-control data.
        dirs[:] = [d for d in dirs if d != ".git"]
        walked.append((root, list(dirs), len(files)))

    # Parents precede their children in a top-down walk, so iterating in
    # reverse guarantees each child's total exists before its parent is summed.
    totals = {}
    for root, dirs, own_files in reversed(walked):
        totals[root] = own_files + sum(
            # Children that os.walk did not visit (e.g. symlinked directories)
            # contribute nothing.
            totals.get(os.path.join(root, child), 0)
            for child in dirs
        )

    # walked[0] is `directory` itself, which is not reported.
    return {
        os.path.relpath(root, directory): totals[root]
        for root, _dirs, _own_files in walked[1:]
    }

# Compute the per-directory file counts for the cloned repository and print
# the resulting mapping as-is.
directory_path = cloned_repo._repo_dir
descendants = count_descendants(directory_path)
print(descendants)

In [119]:
def print_sorted_descendants(directory):
    """Print every sub-directory of ``directory`` with its file count.

    Counts come from ``count_descendants``; lines are printed as
    ``<path>: <count>``, ordered from the highest count to the lowest.
    """
    descendants = count_descendants(directory)
    # Rank the directory names by their count, largest first.
    ranked_dirs = sorted(descendants, key=descendants.get, reverse=True)
    for dir_path in ranked_dirs:
        print(f"{dir_path}: {descendants[dir_path]}")
# Print a heading, then the per-directory file counts for directory_path
# (assigned in an earlier cell), largest first.
print("Sorted descendants:")
print_sorted_descendants(directory_path)

In [135]:
def plot_histogram(directory):
    """Plot how many sub-directories hold a given number of files.

    File counts come from ``count_descendants(directory)``. Counts from 0 to
    100 are binned with unit width; anything larger goes into one overflow
    bin. The x-axis is limited to 0..100.

    Args:
        directory: Root of the tree whose sub-directories are analysed.

    Returns:
        None. The figure is shown with ``plt.show()``; when there are no
        sub-directories a short message is printed instead.
    """
    file_counts = count_descendants(directory)
    values = list(file_counts.values())
    if not values:
        # max() below would raise on an empty sequence; nothing to draw anyway.
        print("No sub-directories found; nothing to plot.")
        return

    bins = list(range(0, 101))  # Bin edges from 0 to 100 (unit-width bins)
    largest = max(values)
    if largest >= 100:
        # Additional bin for all values of 100 and above. Only added when
        # needed so the bin edges stay monotonically increasing.
        bins.append(largest + 1)

    plt.figure(figsize=(10, 6))
    # Pass the explicit edges; a bare bins=100 would spread 100 equal-width
    # bins over the whole value range instead.
    plt.hist(values, bins=bins, alpha=0.7, color='blue', edgecolor='black')
    plt.title('Histogram of File Counts in Directories')
    plt.xlabel('Number of Files')
    plt.ylabel('Number of Directories')
    plt.xlim(0, 100)

    plt.grid(True)
    plt.show()

# Example usage: draw the histogram of per-directory file counts for the
# cloned repository.
directory_path = cloned_repo._repo_dir
plot_histogram(directory_path)