In [1]:
import math, re, string, os, glob
import pandas as pd
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate
# import matplotlib.pyplot as plt
from scipy.sparse import hstack, vstack, csr_matrix

In [2]:
all_files = []
# projects = ['abinit', 'libmesh', 'lammps', 'mdanalysis']
projects = ['mdanalysis']
for sp in projects:
    s = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/human/{}/".format(sp)
    path = s
    files = glob.glob(os.path.join(path, "*.csv"))
    all_files.extend(files)

In [3]:
# print(all_files)

In [4]:
# fname = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/auto/abinit_concat.csv"
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [5]:
# raw_df = pd.read_csv(fname, index_col=0)

In [6]:
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)
raw_df = concatenated_df.drop_duplicates().reset_index()

In [7]:
buggies = raw_df.groupby("buggy")["hash"].count()
print(buggies)

buggy
0.0    3761
1.0     514
Name: hash, dtype: int64


In [8]:
# remove_n = 1000
# drop_indices = np.random.choice(raw_df.index, remove_n, replace=False)
# new_df = raw_df.drop(drop_indices)
# b = new_df.groupby("buggy")["hash"].count()
# print(b)

In [9]:
print(raw_df.dtypes)

index        int64
hash        object
time        object
message     object
buggy      float64
dtype: object


In [10]:
# inft = []
# for el in y:
#     if not np.isfinite(el):
#         inft.append(el)
# print(el)
print(raw_df.shape)
raw_df = raw_df.dropna().reset_index()
print(raw_df.shape)

(4275, 5)
(4275, 6)


In [11]:
raw_df.dtypes

level_0      int64
index        int64
hash        object
time        object
message     object
buggy      float64
dtype: object

In [12]:
y_raw = raw_df['buggy']

In [13]:
rt = RegexpTokenizer(r'[^\W_]+|[^\W_\s]+')
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer(language='english')
stopset = set(stopwords.words('english'))

In [14]:
raw_df['tknz_msg'] = raw_df['message'].apply(rt.tokenize).apply(lambda tkns: [lemmatizer.lemmatize(w.lower()) for w in tkns])
print(raw_df['tknz_msg'].head(5))

0    [new, builder, module, helper, for, creating, ...
1    [writer, do, not, modify, coordinate, in, plac...
2    [changed, generic, exception, to, specific, on...
3      [updated, pypi, summary, with, mdanalysistests]
4                               [merge, branch, build]
Name: tknz_msg, dtype: object


In [15]:
raw_df['msg'] = raw_df['tknz_msg']\
    .apply(lambda tkns: \
           list(filter(\
                       lambda word: word not in stopset \
                       and word not in string.punctuation\
                       and re.match(r'[^\W\d]*$', word) \
                       and len(word) > 2\
                       , tkns)))
print(raw_df['msg'].head(5))

0    [new, builder, module, helper, creating, charm...
1         [writer, modify, coordinate, place, anymore]
2    [changed, generic, exception, specific, one, d...
3            [updated, pypi, summary, mdanalysistests]
4                               [merge, branch, build]
Name: msg, dtype: object


In [16]:
# words = raw_df['msg'].apply(pd.Series).stack().drop_duplicates().tolist()

In [17]:
# print(len(words))
# print(words[:50])

In [18]:
raw_df['msg_str'] = raw_df['msg'].apply(lambda tkns: ' '.join(tkns))

In [26]:
# no_features = min(1000, len(words))
no_features = 100

# tfidf_vectorizer = TfidfVectorizer(max_features=no_features)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=2, max_features=no_features)
X_tfidf = tfidf_vectorizer.fit_transform(raw_df['msg_str'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

tf_vectorizer = CountVectorizer(max_features=no_features)
# tf_vectorizer = CountVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=2, max_features=no_features)
X_tf = tf_vectorizer.fit_transform(raw_df['msg_str'])
tf_feature_names = tf_vectorizer.get_feature_names()

In [27]:
print("tf-idf vectorized: ", X_tfidf.shape)
print("tf vectorized: ", X_tf.shape)

tf-idf vectorized:  (4275, 100)
tf vectorized:  (4275, 100)


In [28]:
X_tf_dense = X_tf.todense()

In [29]:
num = 2786
print(raw_df.loc[num]['msg_str'])
val = []
for idx, d in enumerate(X_tf[num][0].toarray()[0]):
#     print(d)
    if d > 0:
        val.append((tf_feature_names[idx], d))
print(val)

version bumped
[('bumped', 1), ('version', 1)]


In [30]:
no_topics = 2
num_iter = 10

In [31]:
# nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(X_tfidf)

In [32]:
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=num_iter, learning_method='online', learning_offset=50.,random_state=9, evaluate_every=100).fit(X_tf)

In [33]:
lda_x = lda.transform(X_tf)
print(lda_x.shape)
# print(type(lda_x))

(4275, 2)


In [34]:
key_words = ["bug", "fix", "wrong", "error", "fail", "problem", "patch"]

In [35]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
#         print("Topic %d:" % (topic_idx))
        top_features = []
        if topic_idx not in topic_word_prob:
            topic_word_prob[topic_idx] = []
        top_features = [(feature_names[i], topic[i]) for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_word_prob[topic_idx] = top_features
#         print(" ".join(top_features))
        feature_set = set([val[0] for val in top_features])
        feature_names_set.update(feature_set)
        print("Topic %d:" % (topic_idx))
        print(", ".join([val[0] for val in top_features]))
        word_list_for_topic = [val[0] for val in top_features]
        word_probs_for_topic = [val[1] for val in top_features]
        bug_prob_sum = 0
        for bug_word in key_words:
            if bug_word in word_list_for_topic:
                bug_prob_sum += word_probs_for_topic[word_list_for_topic.index(bug_word)]
        topic_indices.append(bug_prob_sum)
#         if any(bug_word in [val[0] for val in top_features] for bug_word in key_words):
#             topic_indices.append(topic_idx)

In [36]:
no_top_words = 10
topic_indices = []
topic_word_prob = {}
feature_names_set = set()

display_topics(lda, tf_feature_names, no_top_words)
feature_names_list = list(feature_names_set)
print(topic_indices)
print(topic_indices.index(max(topic_indices)))
print(topic_word_prob)
print(feature_names_list)

Topic 0:
merge, mdanalysis, doc, develop, pull, request, branch, code, updated, changelog
Topic 1:
test, added, fixed, fix, issue, file, analysis, new, atom, atomgroup
[0, 448.1877849993976]
1
{0: [('merge', 555.9102897967449), ('mdanalysis', 534.6776021360896), ('doc', 533.6061360404185), ('develop', 353.60122974392783), ('pull', 282.9717952530658), ('request', 282.0983983958021), ('branch', 270.0217924624037), ('code', 258.2136106845878), ('updated', 252.9732589308878), ('changelog', 182.21308174985535)], 1: [('test', 859.0895389482779), ('added', 642.8987811692965), ('fixed', 507.5383016851973), ('fix', 448.1877849993976), ('issue', 379.33976704421116), ('file', 278.14987811648797), ('analysis', 252.7459699491908), ('new', 207.9865462067616), ('atom', 205.32336385808003), ('atomgroup', 180.7606210990039)]}
['request', 'changelog', 'fix', 'analysis', 'issue', 'code', 'doc', 'develop', 'mdanalysis', 'merge', 'added', 'atom', 'atomgroup', 'fixed', 'pull', 'file', 'test', 'updated', 'br

In [37]:
model_top_map_cache = {}
def get_topic_top_words(model, feature_names):
    if str(model) in model_top_map_cache:
        return model_top_map_cache[str(model)]
    topic_top_words = []
    for topic_idx, topic in enumerate(model.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
            topic_top_words.append(top_words)
    model_top_map_cache[str(model)] = topic_top_words
    return topic_top_words

In [38]:
def get_top_topics(words, no_top_words, model, feature_names):
    topic_ranks = []
    topic_top_words = get_topic_top_words(model, feature_names)
    for top_words in topic_top_words:
        topic_freq = 0
        for w in words:
            if w in top_words:
                topic_freq += 1
        topic_ranks.append(topic_freq)
    buggy_topic = 0
    max_val = max(topic_ranks)
    idx = topic_ranks.index(max_val)
    if idx in topic_indices:
        buggy_topic = 1
    return max_val, idx, buggy_topic

In [39]:
raw_df['topic_freq'], raw_df['topic_id'], raw_df['buggy_topic'] = zip(*raw_df['msg'].apply(lambda tkns: get_top_topics(tkns, 20, lda, tf_feature_names)))

In [40]:
# print(raw_df.head(3))

In [41]:
tops_labels = raw_df.groupby(['topic_id','buggy']).size()
print(tops_labels)
for i, v in tops_labels.items():
    if i[0] in topic_indices:
        print('index: ', i, 'value: ', v)

topic_id  buggy
0         0.0      2213
          1.0        93
1         0.0      1548
          1.0       421
dtype: int64
index:  (0, 0.0) value:  2213
index:  (0, 1.0) value:  93


In [42]:
# print(lda_x[0][0])

In [43]:
for i in range(no_topics):
    topic_name = "Topic_{}".format(str(i))
    raw_df[topic_name] = pd.Series(lda_x[:, i])
print(raw_df.shape)

(4275, 14)


In [44]:
raw_df.dtypes

level_0          int64
index            int64
hash            object
time            object
message         object
buggy          float64
tknz_msg        object
msg             object
msg_str         object
topic_freq       int64
topic_id         int64
buggy_topic      int64
Topic_0        float64
Topic_1        float64
dtype: object

In [45]:
topic0_word_prob_map = {}
topic1_word_prob_map = {}
topic_index1 = 1
topic_index0 = 0
for d in topic_word_prob[topic_index0]:
        topic0_word_prob_map[d[0]] = d[1]
for d in topic_word_prob[topic_index1]:
        topic1_word_prob_map[d[0]] = d[1]
def doc_word_mapping(words, topic0, topic1, feature_names):
    is_max_0 = True
    topic_prob0 = topic0
    topic_prob1 = topic1
    if topic_prob1 > topic_prob0:
        is_max_0 = False
    weighted_words = [0]*(len(feature_names_list)+3)
    weighted_words[-2] = topic_prob1
    weighted_words[-3] = topic_prob0
    uniq_words = Counter(words)
    for idx, w in enumerate(feature_names_list):
        count = 0
        if w in uniq_words:
            count = uniq_words[w]
        if w in key_words:
            weighted_words[-1] = count * 1
        prob = 0
        if is_max_0 and  w in topic0_word_prob_map:
            prob += count * topic0_word_prob_map[w]
        if not is_max_0 and w in topic1_word_prob_map:
            prob += count * topic1_word_prob_map[w]
        weighted_words[idx] = prob
    return weighted_words

In [46]:
raw_df['word_prob'] = raw_df.apply(lambda x: doc_word_mapping(x['msg'], x['Topic_0'], x['Topic_1'], tf_feature_names), axis=1)

In [47]:
print(raw_df['word_prob'][1278])

[0, 0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 0.0, 0, 0, 0.0, 0.1669209386469128, 0.8330790613530872, 0]


In [48]:
test_df = pd.DataFrame()
final_df = raw_df[raw_df['buggy'] == 1]
if len(topic_indices) > 100:
    for idx in topic_indices:
        final_df = final_df.append(raw_df[raw_df['topic_id'] == idx])
    test_df = raw_df[(raw_df['hash'].apply(lambda x: x not in final_df['hash'].values))]
else:
    final_df = raw_df
# final_df = raw_df
use_all_as_test = False
if not test_df.empty and test_df.size > 0:
    use_all_as_test = True

In [49]:
y_true = raw_df['buggy']
# y_lda = raw_df['topic_id']
y_lda = raw_df['buggy_topic']

In [50]:
# print(y_true[:10])
# print(y_lda[:10])
tn, fp, fn, tp = confusion_matrix(y_true, y_lda).ravel()

acc = (tp + tn) / (tn + fp + fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * prec * rec / (prec + rec)

beta = 3
f2 = (1+np.power(beta, 2))*prec*rec/(np.power(beta,2)*prec + rec)

print("accuracy", acc)
print("precision", prec)
print("recall", rec)
print("f1", f1)
print("f2", f2)
print(tn, fp, fn, tp)

accuracy 0.383859649122807
precision 0.040329575021682565
recall 0.18093385214007782
f1 0.06595744680851064
f2 0.1341604154645124
1548 2213 421 93


In [51]:
print(raw_df.shape)
print(final_df.shape)
if use_all_as_test:
    print(test_df.shape)

(4275, 15)
(4275, 15)


In [52]:
# buggies = final_df.groupby("buggy")["hash"].count()
# print(buggies)
# ratio = buggies[1]/buggies[0]
# min_ratio = 0.1
# if ratio < min_ratio:
#     buggy_indices = set(final_df.index[final_df['buggy'] == 1].tolist())
#     rem = buggies[0] - int(buggies[1]/min_ratio)
#     drop_indices_rand = set(np.random.choice(final_df.index, rem, replace=False))
#     drop_indices = drop_indices_rand - buggy_indices
#     df_subset = final_df.drop(drop_indices)
#     buggies_new = df_subset.groupby("buggy")["hash"].count()
#     final_df = df_subset
#     print(buggies_new)

In [53]:
# final_X_tf = tf_vectorizer.transform(final_df['msg_str'])
# X = lda.transform(final_X_tf)
# X = tfidf_vectorizer.transform(final_df['msg_str'])
X = final_df['word_prob'].values.tolist()
y = final_df['buggy']
if use_all_as_test:
#     test_X_tf = tf_vectorizer.transform(test_df['msg_str'])
#     test_X = lda.transform(test_X_tf)
#     test_X = tfidf_vectorizer.transform(test_df['msg_str'])
    test_X = test_df['word_prob'].values.tolist()
    test_y = test_df['buggy']

In [54]:
# print(X[1])
# print(y.shape)
# # X = hstack((X,np.array(final_df['topic_id'])[:,None]))
# # X = hstack((X,np.array(final_df['topic_freq'])[:,None]))
# # X = hstack((X,np.array(final_df['buggy_topic'])[:,None]))
# print(X.shape)
# if use_all_as_test:
# #     test_X = hstack((test_X,np.array(test_df['topic_id'])[:,None]))
# #     test_X = hstack((test_X,np.array(test_df['topic_freq'])[:,None]))
# #     test_X = hstack((test_X,np.array(test_df['buggy_topic'])[:,None]))
#     print(test_X.shape)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=9, stratify=y)

In [4898]:
print("# positive in train set: {}".format(len(y_train[y_train == 1])),
      "\n# negative in train set: {}".format(len(y_train[y_train == 0])))
print("# positive in test set: {}".format(len(y_test[y_test == 1])),
      "\n# negative in test set: {}".format(len(y_test[y_test == 0])))

# positive in train set: 411 
# negative in train set: 3009
# positive in test set: 103 
# negative in test set: 752


In [4899]:
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [4900]:
# X_train[9]

In [4901]:
# clf = MultinomialNB()
# clf = LinearSVC(C=100, loss='hinge', random_state=9, max_iter=500000)
clf = SVC(C=100, kernel='linear', random_state=9)
# clf = DecisionTreeClassifier(random_state=0)
# clf = RandomForestClassifier(n_estimators=100, random_state=9)

clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=9,
  shrinking=True, tol=0.001, verbose=False)

In [4902]:
clf.n_support_

array([429, 390], dtype=int32)

In [4903]:
# scoring = ['precision_macro', 'recall_macro']
# estimators = []
# clf1 = MultinomialNB()
# estimators.append(clf1)
# clf2 = SVC(C=100, kernel='linear')
# estimators.append(clf2)
# clf3 = SVC(C=100, kernel='rbf', gamma=0.01)
# estimators.append(clf3)
# eclf = VotingClassifier(estimators=[('nb', clf1), ('svml', clf2), ('svmr', clf3)], voting='hard')
# for clf, label in zip([clf1, clf2, clf3, eclf], ['Naive Bayes', 'SVM Linear', 'SVM RBF', 'Ensemble']):
#     scores = cross_val_score(clf, X, y, cv=5, scoring='f1')
#     print("F1 : %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [4904]:
# clf = eclf.fit(X_train, y_train)

In [4905]:
# if use_all_as_test:
#     mod_X_test = vstack((test_X, X_test)).todense()
#     print(mod_X_test.shape)
#     pred_y = clf.predict(mod_X_test)
# #     pred_y = clf.predict(X_test)
# else:
#     pred_y = clf.predict(X_test)

In [4906]:
p_pred_y = clf.predict(X_test)
print(p_pred_y.shape)
if use_all_as_test:
    p_pred_y = np.append(p_pred_y, clf.predict(test_X))
print(p_pred_y.shape)

(855,)
(855,)


In [4907]:
# tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
if use_all_as_test:
    mod_y_test = np.append(test_y, y_test)
    print(mod_y_test.shape)
#     mod_y_test = y_test
else:
    mod_y_test = y_test
tn, fp, fn, tp = confusion_matrix(mod_y_test, p_pred_y).ravel()

acc = (tp + tn) / (tn + fp + fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * prec * rec / (prec + rec)

beta = 3
f2 = (1+np.power(beta, 2))*prec*rec/(np.power(beta,2)*prec + rec)

In [4908]:
print("accuracy", acc)
print("precision", prec)
print("recall", rec)
print("f1", f1)
print("f2", f2)
print(tn, fp, fn, tp)
print(projects)

accuracy 0.8900584795321638
precision 0.5652173913043478
recall 0.3786407766990291
f1 0.4534883720930233
f2 0.39156626506024095
722 30 64 39
['mdanalysis']


In [4909]:
# libmesh_fastread_fname = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/fastread/abinit_fast_labeled.csv"
# libmesh_df = pd.read_csv(libmesh_fastread_fname)

In [4910]:
# libmesh_df.columns = ["hash", "abstract", 'year', "lnk", "label", "code", "time"]
# codes = libmesh_df.groupby("code")["hash"].count()
# print(codes)

In [4911]:
# raw_df['hash'] = raw_df['hash'].astype(str)
# libmesh_df['hash'] = libmesh_df['hash'].astype(str)
# print(libmesh_df.shape)
# print(raw_df.shape)
# print(libmesh_df.dtypes)
# print(raw_df.dtypes)

In [4912]:
# libmesh_df_merged = pd.merge(raw_df, libmesh_df, how='inner', on=['hash'], suffixes=("_raw", "_libmesh"))
# print(libmesh_df_merged[['hash', 'msg_str', 'buggy_topic', 'buggy', 'code']].head(10))
# print(libmesh_df_merged.shape)
# libmesh_df_merged_d = libmesh_df_merged[['hash', 'msg_str', 'buggy_topic', 'buggy', 'code']]
# print(libmesh_df_merged_d.head(10))
# print(libmesh_df_merged_d.shape)

In [4913]:
# X_libmesh = tf_vectorizer.transform(libmesh_df_merged_d['msg_str'])
# X_libmesh = hstack((X_libmesh,np.array(libmesh_df_merged_d['buggy_topic'])[:,None]))
# print(X_libmesh.shape)
# libmesh_df_merged_d['pred_code'] = clf.predict(X_libmesh)
# print(libmesh_df_merged_d.shape)

In [4914]:
# y_df = libmesh_df_merged_d[['buggy', 'code', 'pred_code']]
# print(y_df.shape)

In [4915]:
# y_df = y_df[y_df['code'] != 'undetermined']
# print(y_df.shape)

In [4916]:
# model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
# model_knn.fit(X)

In [4917]:
# k = 10
# km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=5, verbose=1)
# km.fit(X)
# %matplotlib inline

# plt.hist(km.labels_, bins=k)
# plt.show()
# plt.close()

# cluster_assignments_dict = {}
# # print(np.where(km.labels_ == i))
# # print(raw_df.iloc[79]['msg'])
# for i in set(km.labels_):
# #     print(i)
#     current_cluster_vals = [(raw_df.iloc[x]['msg'], raw_df.iloc[x]['buggy']) for x in np.where(km.labels_ == i)[0]]
#     cluster_assignments_dict[i] = current_cluster_vals

# cluster_pick = np.random.choice(len(set(km.labels_)))
# print('Cluster {0}'.format(cluster_pick))
# cluster_assignments_dict[cluster_pick]