In [3]:
# Install the CUDA implementation of t-SNE (tsnecuda): about 10x faster, highly recommended.
# !conda install tsnecuda -c conda-forge -y

In [4]:
import time
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tsnecuda import TSNE


In [5]:

# Load the per-file metrics table.
data = pd.read_csv('../data/py150k_metrics.csv')

# Small working sample for quick iteration. `.loc` slicing is label-based and
# inclusive, so `:10` keeps the rows labelled 0..10 (11 rows).
# `.copy()` detaches the sample from `data`, so adding columns to it later
# neither triggers SettingWithCopyWarning nor risks writing through to `data`.
sample = data.loc[:10].copy()

In [6]:
def get_filtered_columns(data, rules):
    """Return the column names of ``data`` that contain every substring in ``rules``.

    Args:
        data: Object exposing a ``columns`` iterable of names (e.g. a DataFrame).
        rules: Iterable of substrings. A column is kept only when each of them
            occurs in its name. An empty iterable keeps every column.

    Returns:
        list: The matching column names, in their original order.
    """
    # Materialise the rules once: a one-shot iterator would otherwise be
    # exhausted after the first column, making every later column match.
    required = tuple(rules)
    return [
        col
        for col in data.columns
        if all(rule in col for rule in required)
    ]

# Column groups used to select feature subsets.
# `id_subset` is the union of the "case" columns and the "id" columns. A column
# whose name contains both substrings is returned by both lookups, so the
# concatenation is de-duplicated (dict.fromkeys keeps first-seen order) to
# avoid selecting the same column twice.
id_subset = list(dict.fromkeys(
    get_filtered_columns(data, ["case"]) + get_filtered_columns(data, ["id"])
))
# Columns whose name contains both "case" and "ratio".
case_ratio_subset = get_filtered_columns(data, ["case", "ratio"])
# Every column whose name contains "ratio".
normalized_subset = get_filtered_columns(data, ["ratio"])
# Columns whose name contains "ratio", "case" and "method" at once.
sample_subset = get_filtered_columns(data, ["ratio", "case", "method"])

In [None]:
# Project the selected feature columns of the sample to 2-D with t-SNE.
subset = case_ratio_subset
X = sample[subset]
print(X.shape)

# Time only the embedding itself; perf_counter is the monotonic clock meant
# for measuring intervals, and plotting is kept outside the timed region.
start_time = time.perf_counter()
projection = TSNE(verbose=1).fit_transform(X)
end_time = time.perf_counter()

# `projection.T` unpacks into the x and y coordinate arrays.
plt.scatter(*projection.T)
print(end_time - start_time)

(11, 24)


In [None]:
import time
import numpy as np
import hdbscan

In [None]:
# Cluster the same feature matrix that was projected with t-SNE and time the fit.
start_time = time.perf_counter()
clusterer = hdbscan.HDBSCAN(
    min_samples=10,
    min_cluster_size=10,
    cluster_selection_epsilon=0,
    prediction_data=True,
).fit(X.to_numpy())
print(len(clusterer.labels_))
end_time = time.perf_counter()

# Size the palette from the clusters actually found instead of a hard-coded
# 110000 entries. Labels run from 0 to max; negative labels mark noise points.
# At least one colour is requested so the call stays valid when every point is noise.
n_clusters = max(int(clusterer.labels_.max()) + 1, 1)
color_palette = sns.color_palette('Paired', n_clusters)
print(len(color_palette))

# Noise points are drawn in neutral grey.
NOISE_COLOR = (0.5, 0.5, 0.5)
cluster_colors = [
    color_palette[label] if label >= 0 else NOISE_COLOR
    for label in clusterer.labels_
]
# Fade each point's colour by its membership probability: weak members look washed out.
cluster_member_colors = [
    sns.desaturate(color, probability)
    for color, probability in zip(cluster_colors, clusterer.probabilities_)
]
plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
# Last expression of the cell: elapsed clustering time in seconds.
end_time - start_time

In [None]:
import numpy as np

# Shape of the array of distinct labels assigned by the clusterer, i.e. a
# 1-tuple holding how many different label values occur.
np.shape(np.unique(clusterer.labels_))

In [None]:
import re
from tqdm import tqdm
import pandas as pd
from utils.helper import read_py150k_code
from config import PY150K_TRAIN_CODE

# Derive the owning user and repository of every source file from its path,
# which is expected to look like "data/<user>/<repo>/...".
code_filenames = read_py150k_code(PY150K_TRAIN_CODE)
script_file_name_regex = re.compile(r"data/([^/]+)/([^/]+)/.+")

# Placeholder for paths that do not follow the expected layout; it keeps
# `users`/`repos` the same length as `code_filenames`.
UNKNOWN_OWNER = "<unknown>"

users = []
repos = []
for code_filename in tqdm(code_filenames):
    match = script_file_name_regex.search(code_filename)
    if match is None:
        # Report the offending path and record placeholders instead of
        # falling through to `match.group(...)` on None.
        print(code_filename)
        users.append(UNKNOWN_OWNER)
        repos.append(UNKNOWN_OWNER)
        continue
    users.append(match.group(1))
    repos.append(match.group(2))

# Attach the user to each sampled row. `assign` aligns the Series on the index,
# so a sample smaller than the full file list receives only its own rows, and
# a new frame is produced rather than writing into a slice of `data`.
# NOTE(review): assumes row i of the metrics CSV describes code_filenames[i]
# (default positional index) - confirm against how the CSV was produced.
sample = (
    sample
    .assign(user=pd.Series(users, dtype="object"))
    .astype({"user": "category"})
)

# One palette entry per distinct user in the sample instead of a hard-coded
# 110000 entries; at least one colour keeps the call valid for an empty sample.
user_codes = sample["user"].cat.codes
color_palette = sns.color_palette('Paired', max(len(sample["user"].cat.categories), 1))
print(len(color_palette))

# Rows without a user (category code -1) are drawn in neutral grey.
MISSING_COLOR = (0.5, 0.5, 0.5)
cluster_colors = [
    color_palette[code] if code >= 0 else MISSING_COLOR
    for code in user_codes
]
# Fade each point by its cluster-membership probability.
cluster_member_colors = [
    sns.desaturate(color, probability)
    for color, probability in zip(cluster_colors, clusterer.probabilities_)
]
plt.scatter(*projection.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)