In [47]:
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import classification_report
from collections import Counter

In [48]:
%pip install imbalanced-learn


[0mNote: you may need to restart the kernel to use updated packages.


In [49]:
#internal metrics are generated now and everything is stored in dataframe should be ready for clustering
target_features = [
     'snake_case_var_ratio',
     'snake_case_class_ratio',
     'snake_case_method_ratio',
     'upper_camel_case_var_ratio',
     'upper_camel_case_class_ratio',
     'upper_camel_case_method_ratio',
     'lower_camel_case_var_ratio',
     'lower_camel_case_class_ratio',
     'lower_camel_case_method_ratio',
     'func_decorators_avg',
     'class_decorators_avg',
     'class_parents_avg',
     'comprehensions_avg',
     'generators_avg',
     'lambda_avg',
     'comment_density',
     'ds_density',
]

In [50]:
py150k_df = pd.read_csv("data/py150k_metric_20220527.csv")
bq_df = pd.read_csv("data/bigquery_metric_20220526.csv")

In [51]:
combined_df = pd.concat([py150k_df[target_features], bq_df[target_features]])

In [52]:

with open("data/combined_dataset/clusters/feature_set_1/full_feature_clusterer.pickle", "rb") as file:
    cluster_pred = pickle.load(file)
    labels = cluster_pred.labels_
    cluster_num = len(np.unique(labels))
    X = combined_df[target_features].to_numpy()

In [96]:
from collections import OrderedDict, Counter
OrderedDict(sorted(Counter(labels).items()))

OrderedDict([(-1, 162725),
             (0, 560),
             (1, 862),
             (2, 549),
             (3, 520),
             (4, 929),
             (5, 1243),
             (6, 3620),
             (7, 4953),
             (8, 1338),
             (9, 1070),
             (10, 5791),
             (11, 2724),
             (12, 573),
             (13, 796),
             (14, 798),
             (15, 879),
             (16, 2733),
             (17, 1139),
             (18, 722),
             (19, 11951),
             (20, 536),
             (21, 620),
             (22, 1093),
             (23, 3241),
             (24, 1360),
             (25, 1725)])

In [53]:
from sklearn.linear_model import SGDClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline
# X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
# Y = np.array([1, 1, 2, 2])
# # Always scale the input. The most convenient way is to use a pipeline.
# clf = make_pipeline(StandardScaler(),
#                     SGDClassifier(max_iter=1000, tol=1e-3))
clf = SGDClassifier(max_iter=1000, tol=1e-3)


In [54]:
non_outliers_bool = labels != -1

In [55]:
non_outliers_bool[108479]

False

In [56]:
train_df = combined_df[non_outliers_bool].sample(frac=0.8,random_state=1234)
test_df = combined_df[non_outliers_bool].drop(train_df.index)

In [57]:
train_df.index

Int64Index([108479,  66829,  32656,  47535,  68602,  81263,  80319,  44052,
             20869,  26188,
            ...
             79755,  64659,   6510,  35485,  79206,  40687,   5746,  24823,
             68562,  15191],
           dtype='int64', length=41860)

In [58]:
train_X = train_df[target_features].to_numpy()
test_X = test_df[target_features].to_numpy()


In [59]:
#issue here with the indexes since they are not reset after concatting
train_y = [labels[idx] for idx in train_df.index]
test_y = [labels[idx] for idx in test_df.index]
display(combined_df)

Unnamed: 0,snake_case_var_ratio,snake_case_class_ratio,snake_case_method_ratio,upper_camel_case_var_ratio,upper_camel_case_class_ratio,upper_camel_case_method_ratio,lower_camel_case_var_ratio,lower_camel_case_class_ratio,lower_camel_case_method_ratio,func_decorators_avg,class_decorators_avg,class_parents_avg,comprehensions_avg,generators_avg,lambda_avg,comment_density,ds_density
0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.017241,0.000000
1,0.321321,0.0,0.547619,0.042042,1.0,0.0,0.003003,0.0,0.0,0.285714,0.0,1.0,0.000000,0.002755,0.0,0.066116,0.000000
2,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.076923,0.000000
3,0.307692,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000
4,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.090909,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115045,0.053463,0.0,0.500000,0.010936,0.0,0.0,0.004253,0.0,0.0,0.000000,0.0,0.0,0.008514,0.000000,0.0,0.155573,0.137771
115046,0.087719,0.0,0.750000,0.052632,0.5,0.0,0.000000,0.0,0.0,0.500000,0.0,1.0,0.000000,0.000000,0.0,0.087629,0.000000
115047,0.191919,0.0,1.000000,0.040404,1.0,0.0,0.000000,0.0,0.0,1.181818,0.0,1.0,0.000000,0.000000,0.0,0.006897,0.131034
115048,0.021277,0.0,0.800000,0.042553,1.0,0.0,0.000000,0.0,0.2,0.000000,0.0,1.0,0.000000,0.000000,0.0,0.000000,0.000000


In [78]:
# corrected label setup
combined_df['labels'] = labels
# filtered_df = combined_df[combined_df['labels'] != -1]
filtered_df = combined_df

train_df = filtered_df.sample(frac=0.8,random_state=1234)
test_df = filtered_df.drop(train_df.index)

train_X = train_df[target_features].to_numpy()
test_X = test_df[target_features].to_numpy()

# train_y = train_df['labels'].to_list()
# test_y = test_df['labels'].to_list()

train_y = (train_df['labels'] > -1).to_list()
test_y = (test_df['labels'] > -1).to_list()

train_df.index


Int64Index([ 90959,  69971,   5122,  10347,  99287,  73021, 110785,  33661,
             76301,  90186,
            ...
            105777,   5208,  29416,  93129,  23481,  34493,  45220, 107343,
             45036,  51050],
           dtype='int64', length=172040)

In [79]:
result = {}

In [80]:

clf = SGDClassifier("hinge",max_iter=1000000, tol=1e-3, verbose=0, early_stopping=True, class_weight='balanced').fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["svm_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["svm_test"] = report.copy()

In [84]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(train_X, train_y)
print(sorted(Counter(y_resampled).items()))


[(False, 41859), (True, 41859)]


In [85]:
# SVM with Undersampling
clf = SGDClassifier("hinge",max_iter=1000000, tol=1e-3, verbose=0, early_stopping=True).fit(X_resampled, y_resampled)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["svm_undersample_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["svm_undersample_test"] = report.copy()

In [86]:
from sklearn.linear_model import LogisticRegression
#clf = SGDClassifier("log_loss",max_iter=1000000, tol=0.0001, verbose=0, early_stopping=True, class_weight = 'balanced').fit(train_X, train_y)
clf = LogisticRegression(max_iter=100000, tol = 0.0001, class_weight = 'balanced', C = 1).fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["lr_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["lr_test"] = report.copy()

In [87]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=17)
clf = clf.fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["tree_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["tree_test"] = report.copy()

In [88]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf = clf.fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["r_forest_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["r_forest_test"] = report.copy()

In [89]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["nb_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["nb_test"] = report.copy()


In [90]:
for key in result.keys():
    print(f"{key}:{result[key]['macro avg']['f1-score']}")

svm_train:0.7221629008151754
svm_test:0.7189312239520728
svm_undersample_train:0.7181532241063929
svm_undersample_test:0.7146439246585422
lr_train:0.7000547220064317
lr_test:0.695155369256041
tree_train:0.9896754433015509
tree_test:0.9732098250198287
r_forest_train:0.9999842145870721
r_forest_test:0.9844678364584223
nb_train:0.43062147117033034
nb_test:0.43017223490350703


In [91]:
result

{'svm_train': {'False': {'precision': 0.8862400107447451,
   'recall': 0.8109939238444934,
   'f1-score': 0.8469489669446348,
   'support': 130181},
  'True': {'precision': 0.5349826126398548,
   'recall': 0.6762464464034019,
   'f1-score': 0.597376834685716,
   'support': 41859},
  'accuracy': 0.7782085561497326,
  'macro avg': {'precision': 0.7106113116923,
   'recall': 0.7436201851239477,
   'f1-score': 0.7221629008151754,
   'support': 172040},
  'weighted avg': {'precision': 0.8007756801979385,
   'recall': 0.7782085561497326,
   'f1-score': 0.7862256474594798,
   'support': 172040}},
 'svm_test': {'False': {'precision': 0.8803952158086323,
   'recall': 0.8161002651241263,
   'f1-score': 0.8470293933708567,
   'support': 8298},
  'True': {'precision': 0.5364520048602673,
   'recall': 0.6574832464631423,
   'f1-score': 0.5908330545332888,
   'support': 2686},
  'accuracy': 0.7773124544792426,
  'macro avg': {'precision': 0.7084236103344499,
   'recall': 0.7367917557936343,
   'f1-s