In [1]:
import time
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

import numpy as np
import hdbscan

import requests
import json
import time
from tqdm.auto import tqdm

# from tsnecuda import TSNE
# install T-SNE with cuda => 10x faster, highly recommended if running in CUDA machine
# !conda install tsnecuda -c conda-forge -y

# !pip install hdbscan

In [2]:
# Lookup from user name to account type. It is indexed by user name further down
# and its values are compared against "User" and "Organization".
# The path is relative to the notebook's working directory.
user_type_map = {}
with open('user_type_map.json') as json_file:
    user_type_map = json.load(json_file)

In [3]:
def get_user_repo_feat(data_df):
    """Build the ``"<user_name>/<repo_name>"`` identifier for every row.

    Args:
        data_df: DataFrame with ``user_name`` and ``repo_name`` columns.

    Returns:
        list[str]: one ``user/repo`` string per row, in row order.
    """
    # Pair the two columns directly rather than using a row-wise
    # ``apply(axis=1)``, which materialises a Series object for every row
    # just to format two of its values.
    return [
        f"{user}/{repo}"
        for user, repo in zip(data_df["user_name"], data_df["repo_name"])
    ]


def get_watch_count_feat(data_df, watch_count_path="../data/watch_count_public_data.csv"):
    """Look up the watch count of every row's repository.

    Args:
        data_df: DataFrame with a ``user/repo`` column.
        watch_count_path: CSV file with ``repo_name``, ``watch_count`` and
            ``Record Count`` columns. Defaults to the path the notebook has
            always used, so existing calls are unaffected.

    Returns:
        list: the ``watch_count`` value for each row of ``data_df``; NaN for
        repositories that have no entry in the CSV (left join).
    """
    watch_count = (
        pd.read_csv(watch_count_path)
        .rename(columns={"repo_name": "user/repo"})
        .drop(columns="Record Count")
    )
    # DataFrame.join defaults to a left join, so every row of data_df is kept.
    joined = data_df.join(watch_count.set_index("user/repo"), on="user/repo")
    return joined["watch_count"].tolist()



def get_user_repo_by_file_name(file_name):
    """Extract the user name and repository name from a script file path.

    The path is expected to contain ``data/<user>/<repo>/<rest...>``.

    Args:
        file_name: path of a script file.

    Returns:
        tuple[str, str]: ``(username, repo_name)``.

    Raises:
        ValueError: if ``file_name`` does not contain the expected
            ``data/<user>/<repo>/...`` layout.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having already imported ``re``.
    import re

    # get the repo and username from script file name
    match = re.search(r"data/([^/]+)/([^/]+)/.+", file_name)
    if match is None:
        raise ValueError(
            f"cannot extract user/repo from file name: {file_name!r}"
        )
    return match.group(1), match.group(2)

def read_py150k_code(filename, limit=None):
    """Read a text file and return its lines with surrounding whitespace stripped.

    Args:
        filename: path of the text file to read.
        limit: stop once this many lines have been collected. A falsy value
            (``None`` or ``0``) means the whole file is read.

    Returns:
        list[str]: the stripped lines, in file order.
    """
    collected = []
    with open(filename, "r") as listing:
        for raw_line in listing:
            collected.append(raw_line.strip())
            # The limit is checked after appending, so at least one line is
            # always kept when the file is non-empty.
            if limit and len(collected) >= limit:
                break
    return collected

# Location of the Py150k train file listing; it is read line by line with
# read_py150k_code later in the notebook.
PY150K_TRAIN_CODE = "../data/py150/py150_files/python100k_train.txt"

In [4]:
def get_high_freq_users(user_freq_dist, freq_thresh=1000):
    """Return the users whose frequency is not below ``freq_thresh``.

    Args:
        user_freq_dist: iterable of ``(user, frequency)`` pairs.
        freq_thresh: pairs with a frequency below this value are dropped.

    Returns:
        list: the retained users, in their original order.
    """
    return [
        user
        for user, freq in user_freq_dist
        if not freq < freq_thresh
    ]

In [5]:
import re

# Load the Py150k file listing, then derive the owning user and repository of
# every file from its path. u_list and r_list stay aligned with code_filenames.
code_filenames = read_py150k_code(PY150K_TRAIN_CODE)
u_list, r_list = [], []
for name in code_filenames:
    username, repo_name = get_user_repo_by_file_name(name)
    u_list.append(username)
    r_list.append(repo_name)

In [6]:
# Metrics table. The next cell attaches u_list / r_list to it positionally,
# so its rows are assumed to line up one-to-one, in order, with the file
# listing read above — TODO confirm.
data = pd.read_csv('../data/py150k_metrics.csv')

In [7]:
# Work on a copy so the raw metrics frame `data` is left untouched.
train_data_df = data.copy()
# Positional assignment: u_list / r_list must have exactly one entry per row
# of `data`, in the same order (pandas raises on a length mismatch).
train_data_df["user_name"] = u_list
train_data_df["repo_name"] = r_list
train_data_df["user/repo"] = get_user_repo_feat(train_data_df)
# Plain dict indexing: a user missing from user_type_map raises KeyError.
train_data_df["user_type"] = [user_type_map[u] for u in u_list]
# NaN for repositories that have no entry in the watch-count table.
train_data_df["watch_count"] = get_watch_count_feat(train_data_df)
# Only the rows with a known watch count.
train_plot_df = train_data_df.dropna(subset=["watch_count"])
# Split by the account type recorded in user_type_map.
train_data_df_user_only = train_data_df[train_data_df["user_type"] == "User"]
train_data_df_org_only = train_data_df[train_data_df["user_type"] == "Organization"]

In [8]:
def random_sample_clusters(sample_df, labels):
    """Draw a random cluster assignment that follows the label distribution.

    Serves as a chance baseline: each of the ``len(sample_df)`` samples is
    assigned one of the distinct values in ``labels``, drawn independently
    with probability equal to that value's relative frequency in ``labels``.

    Args:
        sample_df: sized collection; only its length is used.
        labels: array-like of class labels defining the distribution.

    Returns:
        numpy.ndarray: ``len(sample_df)`` randomly drawn labels.

    Raises:
        ValueError: if ``labels`` is empty.
    """
    sample_size = len(sample_df)
    # np.unique returns the categories and their counts in one pass, so no
    # Counter (imported only in a later cell) is needed.
    labels_cat, label_counts = np.unique(labels, return_counts=True)
    if labels_cat.size == 0:
        raise ValueError("labels must not be empty")
    # Normalise by the number of labels, not by len(sample_df): the
    # probabilities then always sum to 1, even when the two lengths differ.
    label_probs = label_counts / label_counts.sum()
    return np.random.choice(labels_cat, sample_size, p=label_probs)


In [9]:
# Metric-name fragments for the full feature set. Each entry is matched as a
# substring against the column names of `data` when full_subset is built.
full_rules = [
 'snake_case_var_ratio',
 'snake_case_class_ratio',
 'snake_case_method_ratio',
 'upper_camel_case_var_ratio',
 'upper_camel_case_class_ratio',
 'upper_camel_case_method_ratio',
 'lower_camel_case_var_ratio',
 'lower_camel_case_class_ratio',
 'lower_camel_case_method_ratio',
 'func_decorators_avg',
 'class_decorators_avg',
 'class_parents_avg',
 'comprehensions_avg',
 'generators_avg',
 'lambda_avg',
 'comment_density',
 'ds_density',
]
len(full_rules)

17

In [10]:
def get_filtered_columns(data, rules):
    """Select the columns of ``data`` whose names contain every rule.

    Args:
        data: object exposing a ``columns`` iterable of column names.
        rules: substrings that must all occur in a column name. With no
            rules, every column is selected.

    Returns:
        set: the matching column names.
    """
    return {
        column
        for column in data.columns
        # A list (not a generator) so that every rule is checked against
        # every column, with no short-circuiting.
        if all([fragment in column for fragment in rules])
    }

# Named feature subsets, each a set of column names selected by substring.
id_subset = get_filtered_columns(data, ["case"]) | get_filtered_columns(data, ["id"])
case_ratio_subset = get_filtered_columns(data, ["case", "ratio"])
normalized_subset = get_filtered_columns(data, ["ratio"])
density_subset = get_filtered_columns(data, ["density"])

# Every column whose name contains at least one entry of full_rules.
full_subset = set()
for rule in full_rules:
    full_subset |= get_filtered_columns(data, [rule])

In [11]:
from collections import Counter

def purity(cluster_labels, gold_labels):
    """Compute the purity of a clustering against gold-standard labels.

    Every cluster is credited with the size of its most frequent gold label;
    purity is the sum of those credits divided by the number of samples.

    Args:
        cluster_labels: sized iterable of cluster assignments, one per sample.
        gold_labels: sized iterable of gold labels, aligned by position with
            ``cluster_labels``.

    Returns:
        float: purity in ``(0, 1]``.

    Raises:
        ValueError: if the inputs differ in length or are empty.
    """
    n_samples = len(cluster_labels)
    if n_samples != len(gold_labels):
        raise ValueError(
            "cluster_labels and gold_labels must have the same length "
            f"({n_samples} != {len(gold_labels)})"
        )
    if n_samples == 0:
        raise ValueError("purity is undefined for an empty labelling")

    # zip pairs the inputs by position, so a pandas Series with a non-default
    # index is handled correctly (``gold_labels[idx]`` would look up by label).
    gold_counts_by_cluster = {}
    for cluster_label, gold_label in zip(cluster_labels, gold_labels):
        if cluster_label not in gold_counts_by_cluster:
            gold_counts_by_cluster[cluster_label] = Counter()
        gold_counts_by_cluster[cluster_label][gold_label] += 1

    # Size of the majority gold label within each cluster.
    majority_total = sum(
        max(gold_counts.values())
        for gold_counts in gold_counts_by_cluster.values()
    )
    return majority_total / n_samples


In [12]:
from collections import Counter
# Count files per owner among rows whose user_type is "User" and keep the
# 20 owners with the most files.
user_counter = Counter(train_data_df_user_only["user_name"])
user_freq_dist = user_counter.most_common(20)
# A threshold of 0 keeps every owner in the top-20 list.
selected_users = get_high_freq_users(user_freq_dist, freq_thresh=0)
# Integer class id per selected owner, following the frequency ranking.
user_to_idx = dict(zip(selected_users, range(len(selected_users))))
# Row mask over the full frame: True where the file belongs to a selected owner.
user_bool = train_data_df["user_name"].apply(lambda name: name in selected_users)
user_freq_dist

[('tav', 852),
 ('anandology', 435),
 ('CollabQ', 397),
 ('kuri65536', 391),
 ('azoft-dev-team', 329),
 ('jmcnamara', 309),
 ('RoseOu', 234),
 ('cool-RR', 229),
 ('dcramer', 221),
 ('kayhayen', 211),
 ('babble', 200),
 ('rwl', 199),
 ('daviddrysdale', 197),
 ('nlloyd', 194),
 ('powdahound', 181),
 ('Akagi201', 157),
 ('benoitc', 150),
 ('lsaffre', 149),
 ('amrdraz', 147),
 ('spulec', 145)]

In [13]:
# Keep only the files owned by the selected users and encode each owner as
# its integer class id.
sample = train_data_df[user_bool].loc[:]
author_class = sample["user_name"].apply(lambda user: user_to_idx[user]).to_numpy()

# A set is not a valid column indexer in pandas (deprecated in 1.5, TypeError
# from 2.0). Indexing with a sorted list also keeps the feature-column order
# stable from run to run.
subset = full_subset
X = sample[sorted(subset)]

len(sample), len(sample.columns)

  X = sample[subset]


(5327, 87)

In [14]:
# Chance baseline: purity of a random assignment drawn from the author label
# distribution. Seed NumPy's global RNG (used by random_sample_clusters) so
# the reported baseline is reproducible on Restart & Run All.
np.random.seed(0)
baseline_clusters = random_sample_clusters(sample, author_class)
purity(baseline_clusters, author_class)

0.160503097428196

In [15]:
# External Evaluation of Purity on Author
# Exhaustive grid search: one HDBSCAN fit per combination of the three
# hyper-parameter lists below (4 x 4 x 4 = 64 fits). Each clustering is
# scored against author_class with purity() and the results go to a CSV.

min_samples_sizes = [50, 100, 500, 1000]
min_cluster_sizes = [50, 100, 500, 1000]
epslons = [0.01, 0.05, 0.1, 0.5]#np.linspace(0, 1, 3, endpoint=True)
# Parallel result lists: index i across all five lists describes the i-th fit.
min_samples_params = []
min_cluster_size_params = []
epsilon_params = []
cluster_nums = []
purities = []
for e in epslons:
    for m in min_samples_sizes:
        for n in min_cluster_sizes:
            ep = round(float(e),2)
            # X is the feature frame left in scope by the preceding sample cell.
            clusterer = hdbscan.HDBSCAN(min_samples=int(m), min_cluster_size=int(n), cluster_selection_epsilon = ep, prediction_data=True).fit(X.to_numpy())
            # Number of distinct values in labels_.
            # NOTE(review): if HDBSCAN marks noise points with label -1 (per its
            # documentation), that label is counted here as a cluster and is
            # scored by purity() like any other — confirm this is intended.
            cluster_num = np.unique(clusterer.labels_).shape[0]
            p = purity(clusterer.labels_, author_class)
            
            print("min_samples", int(m))
            print("min_cluster_size", int(n))
            print("epsilon", ep)
            print("cluster num", cluster_num)
            print("purity: ", p)
            print("========================")
            
            min_samples_params += [int(m)]
            min_cluster_size_params += [int(n)]
            epsilon_params += [ep]
            cluster_nums += [cluster_num]
            purities += [p]
            
# One CSV row per fitted configuration.
pd.DataFrame({
    "min_sample": min_samples_params,
    "min_cluster_size": min_cluster_size_params,
    "epsilon": epsilon_params,
    "cluster_num": cluster_nums,
    "purity": purities,
}).to_csv("purity_author_only.csv", index=False)

min_samples 50
min_cluster_size 50
epsilon 0.01
cluster num 10
purity:  0.25098554533508544
min_samples 50
min_cluster_size 100
epsilon 0.01
cluster num 6
purity:  0.2348413741317815
min_samples 50
min_cluster_size 500
epsilon 0.01
cluster num 3
purity:  0.15993992866529003
min_samples 50
min_cluster_size 1000
epsilon 0.01
cluster num 3
purity:  0.15993992866529003
min_samples 100
min_cluster_size 50
epsilon 0.01
cluster num 6
purity:  0.2370940491834053
min_samples 100
min_cluster_size 100
epsilon 0.01
cluster num 6
purity:  0.2370940491834053
min_samples 100
min_cluster_size 500
epsilon 0.01
cluster num 3
purity:  0.15993992866529003
min_samples 100
min_cluster_size 1000
epsilon 0.01
cluster num 3
purity:  0.15993992866529003
min_samples 500
min_cluster_size 50
epsilon 0.01
cluster num 3
purity:  0.15993992866529003
min_samples 500
min_cluster_size 100
epsilon 0.01
cluster num 3
purity:  0.15993992866529003
min_samples 500
min_cluster_size 500
epsilon 0.01
cluster num 3
purity:  0.15

In [16]:
# Count files per owner among rows whose user_type is "Organization" and keep
# the 20 owners with the most files.
org_counter = Counter(train_data_df_org_only["user_name"])
org_freq_dist = org_counter.most_common(20)
# A threshold of 0 keeps every owner in the top-20 list.
selected_orgs = get_high_freq_users(org_freq_dist, freq_thresh=0)
# Integer class id per selected owner, following the frequency ranking.
org_to_idx = dict(zip(selected_orgs, range(len(selected_orgs))))
# Row mask over the full frame: True where the file belongs to a selected owner.
org_bool = train_data_df["user_name"].apply(lambda name: name in selected_orgs)
org_freq_dist

[('anhstudios', 5069),
 ('openstack', 3111),
 ('AppScale', 1898),
 ('cloudera', 1033),
 ('dropbox', 805),
 ('dimagi', 785),
 ('mozilla', 753),
 ('GoogleCloudPlatform', 712),
 ('enthought', 700),
 ('google', 621),
 ('django', 606),
 ('saltstack', 593),
 ('BU-NU-CLOUD-SP16', 501),
 ('getsentry', 439),
 ('Azure', 438),
 ('CenterForOpenScience', 433),
 ('StackStorm', 417),
 ('fp7-ofelia', 343),
 ('freenas', 325),
 ('sympy', 315)]

In [17]:
# Keep only the files owned by the selected organizations and encode each
# owner as its integer class id.
sample = train_data_df[org_bool].loc[:]
org_class = sample["user_name"].apply(lambda org: org_to_idx[org]).to_numpy()

# A set is not a valid column indexer in pandas (deprecated in 1.5, TypeError
# from 2.0). Indexing with a sorted list also keeps the feature-column order
# stable from run to run.
subset = full_subset
X = sample[sorted(subset)]

len(sample), len(sample.columns)

  X = sample[subset]


(19897, 87)

In [18]:
# Chance baseline: purity of a random assignment drawn from the organization
# label distribution. Seed NumPy's global RNG (used by random_sample_clusters)
# so the reported baseline is reproducible on Restart & Run All.
np.random.seed(0)
baseline_clusters = random_sample_clusters(sample, org_class)
purity(baseline_clusters, org_class)

0.2547620244257928

In [19]:
# External Evaluation of Purity on Organization
# Exhaustive grid search: one HDBSCAN fit per combination of the three
# hyper-parameter lists below (4 x 4 x 4 = 64 fits). Each clustering is
# scored against org_class with purity() and the results go to a CSV.

min_samples_sizes = [50, 100, 500, 1000]
min_cluster_sizes = [50, 100, 500, 1000]
epslons = [0.01, 0.05, 0.1, 0.5]#np.linspace(0, 1, 3, endpoint=True)
# Parallel result lists: index i across all five lists describes the i-th fit.
min_samples_params = []
min_cluster_size_params = []
epsilon_params = []
cluster_nums = []
purities = []
for e in epslons:
    for m in min_samples_sizes:
        for n in min_cluster_sizes:
            ep = round(float(e),2)
            # X is the feature frame left in scope by the preceding sample cell.
            clusterer = hdbscan.HDBSCAN(min_samples=int(m), min_cluster_size=int(n), cluster_selection_epsilon = ep, prediction_data=True).fit(X.to_numpy())
            # Number of distinct values in labels_.
            # NOTE(review): if HDBSCAN marks noise points with label -1 (per its
            # documentation), that label is counted here as a cluster and is
            # scored by purity() like any other — confirm this is intended.
            cluster_num = np.unique(clusterer.labels_).shape[0]
            p = purity(clusterer.labels_, org_class)
            
            print("min_samples", int(m))
            print("min_cluster_size", int(n))
            print("epsilon", ep)
            print("cluster num", cluster_num)
            print("purity: ", p)
            print("========================")
            
            min_samples_params += [int(m)]
            min_cluster_size_params += [int(n)]
            epsilon_params += [ep]
            cluster_nums += [cluster_num]
            purities += [p]
            
# One CSV row per fitted configuration.
pd.DataFrame({
    "min_sample": min_samples_params,
    "min_cluster_size": min_cluster_size_params,
    "epsilon": epsilon_params,
    "cluster_num": cluster_nums,
    "purity": purities,
}).to_csv("purity_org_only.csv", index=False)

min_samples 50
min_cluster_size 50
epsilon 0.01
cluster num 18
purity:  0.42820525707393076
min_samples 50
min_cluster_size 100
epsilon 0.01
cluster num 9
purity:  0.4213197969543147
min_samples 50
min_cluster_size 500
epsilon 0.01
cluster num 5
purity:  0.41061466552746645
min_samples 50
min_cluster_size 1000
epsilon 0.01
cluster num 5
purity:  0.41061466552746645
min_samples 100
min_cluster_size 50
epsilon 0.01
cluster num 9
purity:  0.42116902045534504
min_samples 100
min_cluster_size 100
epsilon 0.01
cluster num 9
purity:  0.42116902045534504
min_samples 100
min_cluster_size 500
epsilon 0.01
cluster num 5
purity:  0.41006181836457756
min_samples 100
min_cluster_size 1000
epsilon 0.01
cluster num 5
purity:  0.41006181836457756
min_samples 500
min_cluster_size 50
epsilon 0.01
cluster num 5
purity:  0.4127757953460321
min_samples 500
min_cluster_size 100
epsilon 0.01
cluster num 5
purity:  0.4127757953460321
min_samples 500
min_cluster_size 500
epsilon 0.01
cluster num 5
purity:  0.41

In [35]:
# user_type_map = {}
# with open('user_type_map.json') as json_file:
#     user_type_map = json.load(json_file)

In [76]:
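# # SECURITY: GitHub personal access tokens were hardcoded on these lines and have
# # been redacted. Revoke the exposed tokens and read credentials from the
# # environment instead of committing them to the notebook.
# auth = (os.environ["GITHUB_USER"], os.environ["GITHUB_TOKEN"])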

In [77]:
# for idx in tqdm(range(len(all_users))):
#     if idx % 500 == 0:
#         with open("user_type_map.json", "w") as outfile:
#             json.dump(user_type_map, outfile, indent=4)
            
#     user = all_users[idx]
#     if user in user_type_map.keys(): continue
#     while True:
#         response = requests.get(f"https://api.github.com/users/{all_users[idx]}", auth=auth)
#         if response.status_code != 403:
#             data_dict = json.loads(response.text)
#             if "type" in data_dict.keys():
#                 user_type_map[all_users[idx]] = data_dict["type"]
#             else:
#                 user_type_map[all_users[idx]] = None
#             break
#         print(response.text)
#         time.sleep(30)
#     time.sleep(0.01)
    
# with open("user_type_map.json", "w") as outfile:
#     json.dump(user_type_map, outfile, indent=4)


  0%|          | 0/5632 [00:00<?, ?it/s]