In [824]:
import math, re
import pandas as pd
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import string
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate
# import matplotlib.pyplot as plt
import os
import glob
from scipy.sparse import hstack, vstack, csr_matrix

In [825]:
all_files = []
# projects = ['abinit', 'libmesh', 'lammps', 'mdanalysis']
projects = ['lammps']
for sp in projects:
    s = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/{}/".format(sp)
    path = s
    files = glob.glob(os.path.join(path, "*.csv"))
    all_files.extend(files)

In [826]:
# print(all_files)

In [827]:
# fname = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/auto/abinit_concat.csv"
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [828]:
# raw_df = pd.read_csv(fname, index_col=0)

In [829]:
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)
raw_df = concatenated_df.drop_duplicates().reset_index()

In [830]:
buggies = raw_df.groupby("buggy")["hash"].count()
print(buggies)

buggy
0.0    10221
1.0      289
Name: hash, dtype: int64


In [831]:
# remove_n = 1000
# drop_indices = np.random.choice(raw_df.index, remove_n, replace=False)
# new_df = raw_df.drop(drop_indices)
# b = new_df.groupby("buggy")["hash"].count()
# print(b)

In [832]:
print(raw_df.dtypes)

index        int64
hash        object
time        object
message     object
buggy      float64
dtype: object


In [833]:
# inft = []
# for el in y:
#     if not np.isfinite(el):
#         inft.append(el)
# print(el)
print(raw_df.shape)
raw_df = raw_df.dropna().reset_index()
print(raw_df.shape)

(10510, 5)
(10510, 6)


In [834]:
raw_df.dtypes

level_0      int64
index        int64
hash        object
time        object
message     object
buggy      float64
dtype: object

In [835]:
y_raw = raw_df['buggy']

In [836]:
rt = RegexpTokenizer(r'[^\W_]+|[^\W_\s]+')
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer(language='english')
stopset = set(stopwords.words('english'))

In [837]:
raw_df['tknz_msg'] = raw_df['message'].apply(rt.tokenize).apply(lambda tkns: [lemmatizer.lemmatize(w.lower()) for w in tkns])
print(raw_df['tknz_msg'].head(5))

0    [allow, extended, lagrangian, on, non, scalar,...
1                                       [first, draft]
2    [draft, the, parallel, construct, based, on, l...
3            [got, fix, momentum, kokkos, to, compile]
4              [preemptive, change, for, kokkos, cuda]
Name: tknz_msg, dtype: object


In [838]:
raw_df['msg'] = raw_df['tknz_msg']\
    .apply(lambda tkns: \
           list(filter(\
                       lambda word: word not in stopset \
                       and word not in string.punctuation\
                       and re.match(r'[^\W\d]*$', word) \
                       and len(word) > 2\
                       , tkns)))
print(raw_df['msg'].head(5))

0    [allow, extended, lagrangian, non, scalar, col...
1                                       [first, draft]
2          [draft, parallel, construct, based, lambda]
3                [got, fix, momentum, kokkos, compile]
4                   [preemptive, change, kokkos, cuda]
Name: msg, dtype: object


In [839]:
# words = raw_df['msg'].apply(pd.Series).stack().drop_duplicates().tolist()

In [840]:
# print(len(words))
# print(words[:50])

In [841]:
raw_df['msg_str'] = raw_df['msg'].apply(lambda tkns: ' '.join(tkns))

In [842]:
# no_features = min(1000, len(words))
no_features = 100

tfidf_vectorizer = TfidfVectorizer(max_features=no_features)
# tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=2, max_features=no_features)
X_tfidf = tfidf_vectorizer.fit_transform(raw_df['msg_str'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

tf_vectorizer = CountVectorizer(max_features=no_features)
X_tf = tf_vectorizer.fit_transform(raw_df['msg_str'])
tf_feature_names = tf_vectorizer.get_feature_names()

In [843]:
print("tf-idf vectorized: ", X_tfidf.shape)
print("tf vectorized: ", X_tf.shape)

tf-idf vectorized:  (10510, 100)
tf vectorized:  (10510, 100)


In [844]:
X_tf_dense = X_tf.todense()

In [845]:
num = 2786
print(raw_df.loc[num]['msg_str'])
val = []
for idx, d in enumerate(X_tf[num][0].toarray()[0]):
#     print(d)
    if d > 0:
        val.append((tf_feature_names[idx], d))
print(val)

git svn svn svn icms temple edu lammps trunk
[('edu', 1), ('git', 1), ('icms', 1), ('lammps', 1), ('svn', 3), ('temple', 1), ('trunk', 1)]


In [846]:
no_topics = 2
num_iter = 10

In [847]:
# nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(X_tfidf)

In [848]:
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=num_iter, learning_method='online', learning_offset=50.,random_state=9, evaluate_every=100).fit(X_tfidf)

In [849]:
lda_x = lda.transform(X_tfidf)
print(lda_x.shape)
# print(type(lda_x))

(10510, 2)


In [850]:
key_words = ["bug", "fix", "wrong", "error", "fail", "problem", "patch"]

In [851]:
topic_indices = []
topic_word_prob = {}
feature_names_set = set()
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
#         print("Topic %d:" % (topic_idx))
        top_features = []
        if topic_idx not in topic_word_prob:
            topic_word_prob[topic_idx] = []
        top_features = [(feature_names[i], topic[i]) for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_word_prob[topic_idx] = top_features
#         print(" ".join(top_features))
        feature_set = set([val[0] for val in top_features])
        feature_names_set.update(feature_set)
        print("Topic %d:" % (topic_idx))
        print(" ".join([val[0] for val in top_features]))
        if any(bug_word in [val[0] for val in top_features] for bug_word in key_words):
            topic_indices.append(topic_idx)

In [852]:
no_top_words = 10
# display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)
feature_names_list = list(feature_names_set)
print(topic_indices)
print(topic_word_prob)
print(feature_names_list)

Topic 0:
svn lammps git icms trunk temple edu merge request pull
Topic 1:
fix kokkos user update add pair added file dpd doc
[1]
{0: [('svn', 2637.3589543962353), ('lammps', 960.2742421224063), ('git', 884.1371609470139), ('icms', 881.325512862348), ('trunk', 879.5968399429112), ('temple', 879.5968396867047), ('edu', 879.5968339604526), ('merge', 521.470964211534), ('request', 462.91064595475694), ('pull', 462.07000328709836)], 1: [('fix', 625.7623708142486), ('kokkos', 375.73730305902194), ('user', 328.5282969051153), ('update', 278.39028798684984), ('add', 257.76324701827133), ('pair', 226.62864165093544), ('added', 217.17801184331202), ('file', 212.16177371815255), ('dpd', 208.17446667019837), ('doc', 195.85601346555936)]}
['lammps', 'svn', 'kokkos', 'pair', 'icms', 'add', 'edu', 'dpd', 'temple', 'fix', 'git', 'update', 'request', 'added', 'user', 'file', 'trunk', 'merge', 'pull', 'doc']


In [853]:
model_top_map_cache = {}
def get_topic_top_words(model, feature_names):
    if str(model) in model_top_map_cache:
        return model_top_map_cache[str(model)]
    topic_top_words = []
    for topic_idx, topic in enumerate(model.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
            topic_top_words.append(top_words)
    model_top_map_cache[str(model)] = topic_top_words
    return topic_top_words

In [854]:
def get_top_topics(words, no_top_words, model, feature_names):
    topic_ranks = []
    topic_top_words = get_topic_top_words(model, feature_names)
    for top_words in topic_top_words:
        topic_freq = 0
        for w in words:
            if w in top_words:
                topic_freq += 1
        topic_ranks.append(topic_freq)
    buggy_topic = 0
    max_val = max(topic_ranks)
    idx = topic_ranks.index(max_val)
    if idx in topic_indices:
        buggy_topic = 1
    return max_val, idx, buggy_topic

In [855]:
raw_df['topic_freq'], raw_df['topic_id'], raw_df['buggy_topic'] = zip(*raw_df['msg'].apply(lambda tkns: get_top_topics(tkns, 20, lda, tf_feature_names)))

In [856]:
# print(raw_df.head(3))

In [857]:
tops_labels = raw_df.groupby(['topic_id','buggy']).size()
print(tops_labels)
for i, v in tops_labels.items():
    if i[0] in topic_indices:
        print('index: ', i, 'value: ', v)

topic_id  buggy
0         0.0      6916
          1.0       106
1         0.0      3305
          1.0       183
dtype: int64
index:  (1, 0.0) value:  3305
index:  (1, 1.0) value:  183


In [858]:
# print(lda_x[0][0])

In [859]:
for i in range(no_topics):
    topic_name = "Topic_{}".format(str(i))
    raw_df[topic_name] = pd.Series(lda_x[:, i])
print(raw_df.shape)

(10510, 14)


In [860]:
raw_df.dtypes

level_0          int64
index            int64
hash            object
time            object
message         object
buggy          float64
tknz_msg        object
msg             object
msg_str         object
topic_freq       int64
topic_id         int64
buggy_topic      int64
Topic_0        float64
Topic_1        float64
dtype: object

In [861]:
def doc_word_mapping(words, topic0, topic1, feature_names):
    topic_index0 = 0
    topic_prob0 = topic0
    topic_index1 = 1
    topic_prob1 = topic1
    topic0_word_prob_map = {}
    topic1_word_prob_map = {}
    for d in topic_word_prob[topic_index0]:
        topic0_word_prob_map[d[0]] = d[1]
    for d in topic_word_prob[topic_index1]:
        topic1_word_prob_map[d[0]] = d[1]
    weighted_words = [0]*(2*no_top_words)
    uniq_words = Counter(words)
    for w, count in uniq_words.items():
        if w not in feature_names_list:
            continue
        prob = 0
        if w in topic0_word_prob_map:
            prob += topic_prob0 * count * topic0_word_prob_map[w]
        if w in topic1_word_prob_map:
            prob += topic_prob1 * count * topic1_word_prob_map[w]
        weighted_words[feature_names_list.index(w)] = prob/2
    return weighted_words

In [862]:
raw_df['word_prob'] = raw_df.apply(lambda x: doc_word_mapping(x['msg'], x['Topic_0'], x['Topic_1'], tf_feature_names), axis=1)

In [863]:
print(raw_df['word_prob'][2786])

[407.7805905668207, 3359.8645411441744, 0, 0, 374.2550016985702, 0, 373.52091796228535, 0, 373.5209203939391, 0, 375.44896844904457, 0, 0, 0, 0, 0, 373.5209205027372, 0, 0, 0]


In [864]:
test_df = pd.DataFrame()
final_df = raw_df[raw_df['buggy'] == 1]
if len(topic_indices) > 0:
    for idx in topic_indices:
        final_df = final_df.append(raw_df[raw_df['topic_id'] == idx])
    test_df = raw_df[(raw_df['hash'].apply(lambda x: x not in final_df['hash'].values))]
else:
    final_df = raw_df
# final_df = raw_df
use_all_as_test = False
if not test_df.empty and test_df.size > 0:
    use_all_as_test = True

In [865]:
y_true = raw_df['buggy']
# y_lda = raw_df['topic_id']
y_lda = raw_df['buggy_topic']

In [866]:
# print(y_true[:10])
# print(y_lda[:10])
tn, fp, fn, tp = confusion_matrix(y_true, y_lda).ravel()

acc = (tp + tn) / (tn + fp + fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * prec * rec / (prec + rec)

beta = 3
f2 = (1+np.power(beta, 2))*prec*rec/(np.power(beta,2)*prec + rec)

print("accuracy", acc)
print("precision", prec)
print("recall", rec)
print("f1", f1)
print("f2", f2)
print(tn, fp, fn, tp)

accuracy 0.6754519505233111
precision 0.05246559633027523
recall 0.6332179930795848
f1 0.09690230341540904
f2 0.300541960913122
6916 3305 106 183


In [867]:
print(raw_df.shape)
print(final_df.shape)
if use_all_as_test:
    print(test_df.shape)

(10510, 15)
(3777, 15)
(6796, 15)


In [868]:
# final_X_tf = tf_vectorizer.transform(final_df['msg_str'])
# X = lda.transform(final_X_tf)
# X = tfidf_vectorizer.transform(final_df['msg_str'])
X = final_df['word_prob'].values.tolist()
y = final_df['buggy']
if use_all_as_test:
#     test_X_tf = tf_vectorizer.transform(test_df['msg_str'])
#     test_X = lda.transform(test_X_tf)
#     test_X = tfidf_vectorizer.transform(test_df['msg_str'])
    test_X = test_df['word_prob'].values.tolist()
    test_y = test_df['buggy']

In [869]:
# print(X[1])
# print(y.shape)
# # X = hstack((X,np.array(final_df['topic_id'])[:,None]))
# # X = hstack((X,np.array(final_df['topic_freq'])[:,None]))
# # X = hstack((X,np.array(final_df['buggy_topic'])[:,None]))
# print(X.shape)
# if use_all_as_test:
# #     test_X = hstack((test_X,np.array(test_df['topic_id'])[:,None]))
# #     test_X = hstack((test_X,np.array(test_df['topic_freq'])[:,None]))
# #     test_X = hstack((test_X,np.array(test_df['buggy_topic'])[:,None]))
#     print(test_X.shape)

In [870]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9, stratify=y)

In [871]:
print("# positive in train set: {}".format(len(y_train[y_train == 1])),
      "\n# negative in train set: {}".format(len(y_train[y_train == 0])))
print("# positive in test set: {}".format(len(y_test[y_test == 1])),
      "\n# negative in test set: {}".format(len(y_test[y_test == 0])))

# positive in train set: 354 
# negative in train set: 2478
# positive in test set: 118 
# negative in test set: 827


In [872]:
# clf = MultinomialNB()
# clf = svm.LinearSVC(C=100, loss='hinge', random_state=9, max_iter=500000)
clf = svm.SVC(C=1000, kernel='linear', random_state=9, max_iter=50000000)
# clf = DecisionTreeClassifier(random_state=0)
# X_train = csr_matrix(X_train)
# X_test = csr_matrix(X_test)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=50000000, probability=False, random_state=9,
  shrinking=True, tol=0.001, verbose=False)

In [873]:
clf.n_support_

array([412, 353], dtype=int32)

In [874]:
# scoring = ['precision_macro', 'recall_macro']
# estimators = []
# clf1 = MultinomialNB()
# estimators.append(clf1)
# clf2 = svm.SVC(C=100, kernel='linear')
# estimators.append(clf2)
# clf3 = svm.SVC(C=100, kernel='rbf', gamma=0.01)
# estimators.append(clf3)
# eclf = VotingClassifier(estimators=[('nb', clf1), ('svml', clf2), ('svmr', clf3)], voting='hard')
# for clf, label in zip([clf1, clf2, clf3, eclf], ['Naive Bayes', 'SVM Linear', 'SVM RBF', 'Ensemble']):
#     scores = cross_val_score(clf, X, y, cv=5, scoring='f1')
#     print("F1 : %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [875]:
# clf = eclf.fit(X_train, y_train)

In [876]:
# if use_all_as_test:
#     mod_X_test = vstack((test_X, X_test)).todense()
#     print(mod_X_test.shape)
#     pred_y = clf.predict(mod_X_test)
# #     pred_y = clf.predict(X_test)
# else:
#     pred_y = clf.predict(X_test)

In [877]:
p_pred_y = clf.predict(X_test)
print(p_pred_y.shape)
if use_all_as_test:
    p_pred_y = np.append(p_pred_y, clf.predict(test_X))
print(p_pred_y.shape)

(945,)
(7741,)


In [878]:
# tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
if use_all_as_test:
    mod_y_test = np.append(test_y, y_test)
    print(mod_y_test.shape)
#     mod_y_test = y_test
else:
    mod_y_test = y_test
tn, fp, fn, tp = confusion_matrix(mod_y_test, p_pred_y).ravel()

acc = (tp + tn) / (tn + fp + fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * prec * rec / (prec + rec)

(7741,)


In [879]:
print("accuracy", acc)
print("precision", prec)
print("recall", rec)
print("f1", f1)
print(tn, fp, fn, tp)

accuracy 0.8578994961891229
precision 0.005040322580645161
recall 0.0423728813559322
f1 0.009009009009009007
6636 987 113 5


In [880]:
# libmesh_fastread_fname = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/fastread/abinit_fast_labeled.csv"
# libmesh_df = pd.read_csv(libmesh_fastread_fname)

In [881]:
# libmesh_df.columns = ["hash", "abstract", 'year', "lnk", "label", "code", "time"]
# codes = libmesh_df.groupby("code")["hash"].count()
# print(codes)

In [882]:
# raw_df['hash'] = raw_df['hash'].astype(str)
# libmesh_df['hash'] = libmesh_df['hash'].astype(str)
# print(libmesh_df.shape)
# print(raw_df.shape)
# print(libmesh_df.dtypes)
# print(raw_df.dtypes)

In [883]:
# libmesh_df_merged = pd.merge(raw_df, libmesh_df, how='inner', on=['hash'], suffixes=("_raw", "_libmesh"))
# print(libmesh_df_merged[['hash', 'msg_str', 'buggy_topic', 'buggy', 'code']].head(10))
# print(libmesh_df_merged.shape)
# libmesh_df_merged_d = libmesh_df_merged[['hash', 'msg_str', 'buggy_topic', 'buggy', 'code']]
# print(libmesh_df_merged_d.head(10))
# print(libmesh_df_merged_d.shape)

In [884]:
# X_libmesh = tf_vectorizer.transform(libmesh_df_merged_d['msg_str'])
# X_libmesh = hstack((X_libmesh,np.array(libmesh_df_merged_d['buggy_topic'])[:,None]))
# print(X_libmesh.shape)
# libmesh_df_merged_d['pred_code'] = clf.predict(X_libmesh)
# print(libmesh_df_merged_d.shape)

In [885]:
# y_df = libmesh_df_merged_d[['buggy', 'code', 'pred_code']]
# print(y_df.shape)

In [886]:
# y_df = y_df[y_df['code'] != 'undetermined']
# print(y_df.shape)

In [887]:
# model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
# model_knn.fit(X)

In [888]:
# k = 10
# km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=5, verbose=1)
# km.fit(X)
# %matplotlib inline

# plt.hist(km.labels_, bins=k)
# plt.show()
# plt.close()

# cluster_assignments_dict = {}
# # print(np.where(km.labels_ == i))
# # print(raw_df.iloc[79]['msg'])
# for i in set(km.labels_):
# #     print(i)
#     current_cluster_vals = [(raw_df.iloc[x]['msg'], raw_df.iloc[x]['buggy']) for x in np.where(km.labels_ == i)[0]]
#     cluster_assignments_dict[i] = current_cluster_vals

# cluster_pick = np.random.choice(len(set(km.labels_)))
# print('Cluster {0}'.format(cluster_pick))
# cluster_assignments_dict[cluster_pick]