In [24]:
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import classification_report
from collections import Counter

In [25]:
%pip install imbalanced-learn


Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\km201\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [26]:
#internal metrics are generated now and everything is stored in dataframe should be ready for clustering
target_features = [
     'snake_case_var_ratio',
     'snake_case_class_ratio',
     'snake_case_method_ratio',
     'upper_camel_case_var_ratio',
     'upper_camel_case_class_ratio',
     'upper_camel_case_method_ratio',
     'lower_camel_case_var_ratio',
     'lower_camel_case_class_ratio',
     'lower_camel_case_method_ratio',
     'func_decorators_avg',
     'class_decorators_avg',
     'class_parents_avg',
     'comprehensions_avg',
     'generators_avg',
     'lambda_avg',
     'comment_density',
     'ds_density',
]

In [27]:
py150k_df = pd.read_csv("data/py150k_metric_20220527.csv")
bq_df = pd.read_csv("data/bigquery_metric_20220526.csv")

In [28]:
combined_df = pd.concat([py150k_df[target_features], bq_df[target_features]])

In [29]:

with open("data/combined_dataset/clusters/feature_set_1/full_feature_clusterer.pickle", "rb") as file:
    cluster_pred = pickle.load(file)
    labels = cluster_pred.labels_
    cluster_num = len(np.unique(labels))
    X = combined_df[target_features].to_numpy()

In [30]:
from sklearn.linear_model import SGDClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline
# X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
# Y = np.array([1, 1, 2, 2])
# # Always scale the input. The most convenient way is to use a pipeline.
# clf = make_pipeline(StandardScaler(),
#                     SGDClassifier(max_iter=1000, tol=1e-3))
clf = SGDClassifier(max_iter=1000, tol=1e-3)


In [31]:
non_outliers_bool = labels != -1

In [32]:
non_outliers_bool[108479]

False

In [33]:
train_df = combined_df[non_outliers_bool].sample(frac=0.8,random_state=1234)
test_df = combined_df[non_outliers_bool].drop(train_df.index)

In [34]:
train_df.index

Int64Index([108479,  66829,  32656,  47535,  68602,  81263,  80319,  44052,
             20869,  26188,
            ...
             79755,  64659,   6510,  35485,  79206,  40687,   5746,  24823,
             68562,  15191],
           dtype='int64', length=41860)

In [35]:
train_X = train_df[target_features].to_numpy()
test_X = test_df[target_features].to_numpy()


In [36]:
#issue here with the indexes since they are not reset after concatting
train_y = [labels[idx] for idx in train_df.index]
test_y = [labels[idx] for idx in test_df.index]
display(combined_df)

Unnamed: 0,snake_case_var_ratio,snake_case_class_ratio,snake_case_method_ratio,upper_camel_case_var_ratio,upper_camel_case_class_ratio,upper_camel_case_method_ratio,lower_camel_case_var_ratio,lower_camel_case_class_ratio,lower_camel_case_method_ratio,func_decorators_avg,class_decorators_avg,class_parents_avg,comprehensions_avg,generators_avg,lambda_avg,comment_density,ds_density
0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.017241,0.000000
1,0.321321,0.0,0.547619,0.042042,1.0,0.0,0.003003,0.0,0.0,0.285714,0.0,1.0,0.000000,0.002755,0.0,0.066116,0.000000
2,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.076923,0.000000
3,0.307692,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000
4,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.090909,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115045,0.053463,0.0,0.500000,0.010936,0.0,0.0,0.004253,0.0,0.0,0.000000,0.0,0.0,0.008514,0.000000,0.0,0.155573,0.137771
115046,0.087719,0.0,0.750000,0.052632,0.5,0.0,0.000000,0.0,0.0,0.500000,0.0,1.0,0.000000,0.000000,0.0,0.087629,0.000000
115047,0.191919,0.0,1.000000,0.040404,1.0,0.0,0.000000,0.0,0.0,1.181818,0.0,1.0,0.000000,0.000000,0.0,0.006897,0.131034
115048,0.021277,0.0,0.800000,0.042553,1.0,0.0,0.000000,0.0,0.2,0.000000,0.0,1.0,0.000000,0.000000,0.0,0.000000,0.000000


In [37]:
# corrected label setup
combined_df['labels'] = labels
filtered_df = combined_df[combined_df['labels'] != -1]
#filtered_df = combined_df

train_df = filtered_df.sample(frac=0.8,random_state=1234)
test_df = filtered_df.drop(train_df.index)

train_X = train_df[target_features].to_numpy()
test_X = test_df[target_features].to_numpy()

train_y = train_df['labels'].to_list()
test_y = test_df['labels'].to_list()
train_df.index


Int64Index([108479,  66829,  32656,  47535,  68602,  81263,  80319,  44052,
             20869,  26188,
            ...
             79755,  64659,   6510,  35485,  79206,  40687,   5746,  24823,
             68562,  15191],
           dtype='int64', length=41860)

In [38]:
result = {}

In [39]:

clf = SGDClassifier("hinge",max_iter=1000000, tol=1e-3, verbose=0, early_stopping=True, class_weight='balanced').fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["svm_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["svm_test"] = report.copy()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(train_X, train_y)
print(sorted(Counter(y_resampled).items()))


[(0, 424), (1, 424), (2, 424), (3, 424), (4, 424), (5, 424), (6, 424), (7, 424), (8, 424), (9, 424), (10, 424), (11, 424), (12, 424), (13, 424), (14, 424), (15, 424), (16, 424), (17, 424), (18, 424), (19, 424), (20, 424), (21, 424), (22, 424), (23, 424), (24, 424), (25, 424)]


In [41]:
# SVM with Undersampling
clf = SGDClassifier("hinge",max_iter=1000000, tol=1e-3, verbose=0, early_stopping=True).fit(X_resampled, y_resampled)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["svm_undersample_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["svm_undersample_test"] = report.copy()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
from sklearn.linear_model import LogisticRegression
#clf = SGDClassifier("log_loss",max_iter=1000000, tol=0.0001, verbose=0, early_stopping=True, class_weight = 'balanced').fit(train_X, train_y)
clf = LogisticRegression(max_iter=100000, tol = 0.0001, class_weight = 'balanced', C = 1).fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["lr_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["lr_test"] = report.copy()

In [43]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=17)
clf = clf.fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["tree_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["tree_test"] = report.copy()

In [44]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf = clf.fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["r_forest_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["r_forest_test"] = report.copy()

In [45]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(train_X, train_y)

preds = clf.predict(train_X)
report = classification_report(train_y, preds, output_dict=True)
result["nb_train"] = report.copy()

preds = clf.predict(test_X)
report = classification_report(test_y, preds, output_dict=True)
result["nb_test"] = report.copy()


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
for key in result.keys():
    print(f"{key}:{result[key]['macro avg']['f1-score']}")

svm_train:0.8596022096988769
svm_test:0.8601328672265823
svm_undersample_train:0.8771153321395241
svm_undersample_test:0.8816456921909667
lr_train:0.973900283649032
lr_test:0.9725762281494122
tree_train:1.0
tree_test:0.9989483358539175
r_forest_train:1.0
r_forest_test:0.9997291138685115
nb_train:0.4861314983625776
nb_test:0.48337867055427275
