In [15]:
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import classification_report
from collections import Counter

In [2]:
!pip install imbalanced-learn


[0m

In [3]:
#internal metrics are generated now and everything is stored in dataframe should be ready for clustering
target_features = [
     'snake_case_var_ratio',
     'snake_case_class_ratio',
     'snake_case_method_ratio',
     'upper_camel_case_var_ratio',
     'upper_camel_case_class_ratio',
     'upper_camel_case_method_ratio',
     'lower_camel_case_var_ratio',
     'lower_camel_case_class_ratio',
     'lower_camel_case_method_ratio',
     'func_decorators_avg',
     'class_decorators_avg',
     'class_parents_avg',
     'comprehensions_avg',
     'generators_avg',
     'lambda_avg',
     'comment_density',
     'ds_density',
]

In [4]:
py150k_df = pd.read_csv("data/py150k_metric_20220527.csv")
bq_df = pd.read_csv("data/bigquery_metric_20220526.csv")

In [5]:
combined_df = pd.concat([py150k_df[target_features], bq_df[target_features]])

In [6]:

with open("data/combined_dataset/clusters/feature_set_1/full_feature_clusterer.pickle", "rb") as file:
    cluster_pred = pickle.load(file)
    labels = cluster_pred.labels_
    cluster_num = len(np.unique(labels))
    X = combined_df[target_features].to_numpy()

In [7]:
from sklearn.linear_model import SGDClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline
# X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
# Y = np.array([1, 1, 2, 2])
# # Always scale the input. The most convenient way is to use a pipeline.
# clf = make_pipeline(StandardScaler(),
#                     SGDClassifier(max_iter=1000, tol=1e-3))
clf = SGDClassifier(max_iter=1000, tol=1e-3)


In [101]:
non_outliers_bool = labels != -1

In [123]:
non_outliers_bool[108479]

False

In [127]:
train_df = combined_df[non_outliers_bool].sample(frac=0.8,random_state=1234)
test_df = combined_df[non_outliers_bool].drop(train_df.index)

In [128]:
train_df.index

Int64Index([108479,  66829,  32656,  47535,  68602,  81263,  80319,  44052,
             20869,  26188,
            ...
             79755,  64659,   6510,  35485,  79206,  40687,   5746,  24823,
             68562,  15191],
           dtype='int64', length=41860)

In [107]:
train_X = train_df[target_features].to_numpy()
test_X = test_df[target_features].to_numpy()

In [108]:
train_y = [labels[idx] for idx in train_df.index]
test_y = [labels[idx] for idx in test_df.index]

array([19, 19, 16, ..., 19,  6,  6])

In [110]:
result = {}

In [37]:

clf = SGDClassifier("hinge",max_iter=1000000, tol=1e-3, verbose=0, early_stopping=True).fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["svm_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["svm_test"] = report.copy()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(train_X, train_y)
print(sorted(Counter(y_resampled).items()))


[(-1, 92), (0, 92), (1, 92), (2, 92), (3, 92), (4, 92), (5, 92), (6, 92), (7, 92), (8, 92), (9, 92), (10, 92), (11, 92), (12, 92), (13, 92), (14, 92), (15, 92), (16, 92), (17, 92), (18, 92), (19, 92), (20, 92), (21, 92), (22, 92), (23, 92), (24, 92), (25, 92)]


In [42]:
# SVM with Undersampling
clf = SGDClassifier("hinge",max_iter=1000000, tol=1e-3, verbose=0, early_stopping=True).fit(X_resampled, y_resampled)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["svm_undersample_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["svm_undersample_test"] = report.copy()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:

clf = SGDClassifier("log_loss",max_iter=1000000, tol=1e-3, verbose=0, early_stopping=True).fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["lr_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["lr_test"] = report.copy()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=17)
clf = clf.fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["tree_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["tree_test"] = report.copy()

In [44]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf = clf.fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["r_forest_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["r_forest_test"] = report.copy()

In [46]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["nb_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["nb_test"] = report.copy()


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [47]:
for key in result.keys():
    print(f"{key}:{result[key]['macro avg']['f1-score']}")

svm_train:0.5697227150967134
svm_test:0.3698073019952317
svm_undersample_train:0.5796953254580165
svm_undersample_test:0.45146796185934657
tree_train:0.9963443180407583
tree_test:0.5596028946218314
r_forest_train:1.0
r_forest_test:0.562873645655774
nb_train:0.1270767850882281
nb_test:0.09803099303011674
