In [205]:
import math
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate
# import matplotlib.pyplot as plt
import os
import glob
from scipy.sparse import hstack
from scipy.sparse import vstack

In [125]:
all_files = []
# projects = ['abinit', 'libmesh', 'lammps', 'mdanalysis']
projects = ['libmesh']
for sp in projects:
    s = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/{}/".format(sp)
    path = s
    files = glob.glob(os.path.join(path, "*.csv"))
    all_files.extend(files)

In [126]:
# print(all_files)

In [127]:
# fname = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/auto/abinit_concat.csv"
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [128]:
# raw_df = pd.read_csv(fname, index_col=0)

In [129]:
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)
raw_df = concatenated_df.drop_duplicates()

In [130]:
buggies = raw_df.groupby("buggy")["hash"].count()
print(buggies)

buggy
0.0    7913
1.0     835
Name: hash, dtype: int64


In [131]:
# remove_n = 1000
# drop_indices = np.random.choice(raw_df.index, remove_n, replace=False)
# new_df = raw_df.drop(drop_indices)
# b = new_df.groupby("buggy")["hash"].count()
# print(b)

In [132]:
print(raw_df.dtypes)

hash        object
time        object
message     object
buggy      float64
dtype: object


In [133]:
# inft = []
# for el in y:
#     if not np.isfinite(el):
#         inft.append(el)
# print(el)
print(raw_df.shape)
raw_df = raw_df.dropna()
print(raw_df.shape)

(8748, 4)
(8745, 4)


In [134]:
y_raw = raw_df['buggy']

In [135]:
lemmatizer = WordNetLemmatizer()
stopset = set(stopwords.words('english'))

In [136]:
raw_df['tknz_msg'] = raw_df['message'].apply(wordpunct_tokenize).apply(lambda tkns: [lemmatizer.lemmatize(w.lower()) for w in tkns])
print(raw_df['tknz_msg'].head(5))

0                    [a, number, of, minor, change, .]
1                              [cv, ignores, for, sgi]
2    [this, commit, wa, manufactured, by, cvs2svn, ...
3    [can, now, write, a, file, in, parallel, --, t...
4              [updated, compatible, to, new, doxygen]
Name: tknz_msg, dtype: object


In [137]:
raw_df['msg'] = raw_df['tknz_msg']\
    .apply(lambda tkns: \
           list(filter(\
                       lambda word: word not in stopset \
                       and word not in string.punctuation\
                       and len(word) > 2\
                       , tkns)))
print(raw_df['msg'].head(5))

0                              [number, minor, change]
1                                       [ignores, sgi]
2    [commit, manufactured, cvs2svn, create, tag, l...
3    [write, file, parallel, format, identical, exi...
4                  [updated, compatible, new, doxygen]
Name: msg, dtype: object


In [138]:
# words = raw_df['msg'].apply(pd.Series).stack().drop_duplicates().tolist()

In [139]:
# print(len(words))
# print(words[:50])

In [140]:
raw_df['msg_str'] = raw_df['msg'].apply(lambda tkns: ' '.join(tkns))

In [242]:
# no_features = min(1000, len(words))
no_features = 100

# tfidf_vectorizer = TfidfVectorizer(max_features=no_features)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=2, max_features=no_features)
X_tfidf = tfidf_vectorizer.fit_transform(raw_df['msg_str'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

tf_vectorizer = CountVectorizer(max_features=no_features)
X_tf = tf_vectorizer.fit_transform(raw_df['msg_str'])
tf_feature_names = tf_vectorizer.get_feature_names()

In [243]:
print("tf-idf vectorized: ", X_tfidf.shape)
print("tf vectorized: ", X_tf.shape)

tf-idf vectorized:  (8745, 100)
tf vectorized:  (8745, 100)


In [244]:
no_topics = 2
num_iter = 10

In [245]:
# nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(X_tfidf)

In [246]:
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=num_iter, learning_method='online', learning_offset=50.,random_state=9, evaluate_every=100).fit(X_tfidf)

In [247]:
lda_x = lda.transform(X_tf)
print(lda_x.shape)

(8745, 2)


In [248]:
key_words = ["bug", "fix", "wrong", "error", "fail", "problem", "patch"]

In [249]:
topic_indices = []

In [250]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
#         print("Topic %d:" % (topic_idx))
        top_features = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
#         print(" ".join(top_features))
        if any(bug_word in top_features for bug_word in key_words):
            print("Topic %d:" % (topic_idx))
            print(" ".join(top_features))
            topic_indices.append(topic_idx)

In [251]:
no_top_words = 10
# display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)
print(topic_indices)

Topic 1:
added used change elem fix meshbase fixed jwpeterson error need
[1]


In [252]:
model_top_map = {}
def get_topic_top_words(model, feature_names):
    if str(model) in model_top_map:
        return model_top_map[str(model)]
    topic_top_words = []
    for topic_idx, topic in enumerate(model.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
            topic_top_words.append(top_words)
    model_top_map[str(model)] = topic_top_words
    return topic_top_words

In [253]:
def get_top_topics(words, no_top_words, model, feature_names):
    topic_ranks = []
    topic_top_words = get_topic_top_words(model, feature_names)
    for top_words in topic_top_words:
        topic_freq = 0
        for w in words:
            if w in top_words:
                topic_freq += 1
        topic_ranks.append(topic_freq)
    buggy_topic = 0
    max_val = max(topic_ranks)
    idx = topic_ranks.index(max_val)
    if idx in topic_indices:
        buggy_topic = 1
    return max_val, idx, buggy_topic

In [254]:
raw_df['topic_freq'], raw_df['topic_id'], raw_df['buggy_topic'] = zip(*raw_df['msg'].apply(lambda tkns: get_top_topics(tkns, 20, lda, tf_feature_names)))

In [255]:
# print(raw_df.head(3))

In [256]:
tops_labels = raw_df.groupby(['topic_id','buggy']).size()
for i, v in tops_labels.items():
    if i[0] in topic_indices:
        print('index: ', i, 'value: ', v)

index:  (1, 0.0) value:  1979
index:  (1, 1.0) value:  633


In [257]:
test_df = pd.DataFrame()
final_df = raw_df[raw_df['buggy'] == 1]
if len(topic_indices) > 0:
    for idx in topic_indices:
        final_df = final_df.append(raw_df[raw_df['topic_id'] == idx])
    test_df = raw_df[(raw_df['hash'].apply(lambda x: x not in final_df['hash'].values))]
else:
    final_df = raw_df
# final_df = raw_df
use_all_as_test = False
if not test_df.empty and test_df.size > 0:
    use_all_as_test = True

In [258]:
print(raw_df.shape)
print(final_df.shape)
if use_all_as_test:
    print(test_df.shape)

(8745, 10)
(3447, 10)
(5897, 10)


In [259]:
# final_X_tf = tf_vectorizer.transform(final_df['msg_str'])
# X = lda.transform(final_X_tf)
X = tfidf_vectorizer.transform(final_df['msg_str'])
y = final_df['buggy']
if use_all_as_test:
#     test_X_tf = tf_vectorizer.transform(test_df['msg_str'])
#     test_X = lda.transform(test_X_tf)
    test_X = tf_vectorizer.transform(test_df['msg_str'])
    test_y = test_df['buggy']

In [260]:
print(X.shape)
print(y.shape)
# X = hstack((X,np.array(final_df['topic_id'])[:,None]))
# X = hstack((X,np.array(final_df['topic_freq'])[:,None]))
# X = hstack((X,np.array(final_df['buggy_topic'])[:,None]))
print(X.shape)
if use_all_as_test:
#     test_X = hstack((test_X,np.array(test_df['topic_id'])[:,None]))
#     test_X = hstack((test_X,np.array(test_df['topic_freq'])[:,None]))
#     test_X = hstack((test_X,np.array(test_df['buggy_topic'])[:,None]))
    print(test_X.shape)

(3447, 100)
(3447,)
(3447, 100)
(5897, 100)


In [261]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9, stratify=y)

In [262]:
print("# positive in train set: {}".format(len(y_train[y_train == 1])),
      "\n# negative in train set: {}".format(len(y_train[y_train == 0])))
print("# positive in test set: {}".format(len(y_test[y_test == 1])),
      "\n# negative in test set: {}".format(len(y_test[y_test == 0])))

# positive in train set: 1101 
# negative in train set: 1484
# positive in test set: 367 
# negative in test set: 495


In [224]:
clf = MultinomialNB()
# clf = svm.LinearSVC(C=100, loss='hinge', random_state=9, max_iter=500000)
# clf = svm.SVC(C=100, kernel='linear', random_state=9)
# clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [225]:
# clf.n_support_

In [263]:
scoring = ['precision_macro', 'recall_macro']
estimators = []
clf1 = MultinomialNB()
estimators.append(clf1)
clf2 = svm.SVC(C=100, kernel='linear')
estimators.append(clf2)
clf3 = svm.SVC(C=100, kernel='rbf', gamma=0.01)
estimators.append(clf3)
eclf = VotingClassifier(estimators=[('nb', clf1), ('svml', clf2), ('svmr', clf3)], voting='hard')
for clf, label in zip([clf1, clf2, clf3, eclf], ['Naive Bayes', 'SVM Linear', 'SVM RBF', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='f1')
    print("F1 : %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

F1 : 0.75 (+/- 0.03) [Naive Bayes]
F1 : 0.76 (+/- 0.03) [SVM Linear]
F1 : 0.76 (+/- 0.03) [SVM RBF]
F1 : 0.76 (+/- 0.03) [Ensemble]


In [264]:
clf = eclf.fit(X_train, y_train)

In [265]:
if use_all_as_test:
    mod_X_test = vstack((test_X, X_test)).todense()
    print(mod_X_test.shape)
    pred_y = clf.predict(mod_X_test)
#     pred_y = clf.predict(X_test)
else:
    pred_y = clf.predict(X_test)

(6759, 100)


In [266]:
p_pred_y = clf.predict(X_test)
# print(p_pred_y.shape)
# if use_all_as_test:
#     p_pred_y = np.append(p_pred_y, clf.predict(test_X))
# print(p_pred_y.shape)

In [267]:
# tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
if use_all_as_test:
#     mod_y_test = np.append(test_y, y_test)
#     print(mod_y_test.shape)
    mod_y_test = y_test
else:
    mod_y_test = y_test
tn, fp, fn, tp = confusion_matrix(mod_y_test, p_pred_y).ravel()

acc = (tp + tn) / (tn + fp + fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * prec * rec / (prec + rec)

In [268]:
print("accuracy", acc)
print("precision", prec)
print("recall", rec)
print("f1", f1)
print(tn, fp, fn, tp)

accuracy 0.8062645011600929
precision 0.7136752136752137
recall 0.9100817438692098
f1 0.8
361 134 33 334


In [168]:
# libmesh_fastread_fname = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/fastread/abinit_fast_labeled.csv"
# libmesh_df = pd.read_csv(libmesh_fastread_fname)

In [169]:
# libmesh_df.columns = ["hash", "abstract", 'year', "lnk", "label", "code", "time"]
# codes = libmesh_df.groupby("code")["hash"].count()
# print(codes)

In [170]:
# raw_df['hash'] = raw_df['hash'].astype(str)
# libmesh_df['hash'] = libmesh_df['hash'].astype(str)
# print(libmesh_df.shape)
# print(raw_df.shape)
# print(libmesh_df.dtypes)
# print(raw_df.dtypes)

In [171]:
# libmesh_df_merged = pd.merge(raw_df, libmesh_df, how='inner', on=['hash'], suffixes=("_raw", "_libmesh"))
# print(libmesh_df_merged[['hash', 'msg_str', 'buggy_topic', 'buggy', 'code']].head(10))
# print(libmesh_df_merged.shape)
# libmesh_df_merged_d = libmesh_df_merged[['hash', 'msg_str', 'buggy_topic', 'buggy', 'code']]
# print(libmesh_df_merged_d.head(10))
# print(libmesh_df_merged_d.shape)

In [172]:
# X_libmesh = tf_vectorizer.transform(libmesh_df_merged_d['msg_str'])
# X_libmesh = hstack((X_libmesh,np.array(libmesh_df_merged_d['buggy_topic'])[:,None]))
# print(X_libmesh.shape)
# libmesh_df_merged_d['pred_code'] = clf.predict(X_libmesh)
# print(libmesh_df_merged_d.shape)

In [173]:
# y_df = libmesh_df_merged_d[['buggy', 'code', 'pred_code']]
# print(y_df.shape)

In [174]:
# y_df = y_df[y_df['code'] != 'undetermined']
# print(y_df.shape)

In [175]:
# model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
# model_knn.fit(X)

In [176]:
# k = 10
# km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=5, verbose=1)
# km.fit(X)
# %matplotlib inline

# plt.hist(km.labels_, bins=k)
# plt.show()
# plt.close()

# cluster_assignments_dict = {}
# # print(np.where(km.labels_ == i))
# # print(raw_df.iloc[79]['msg'])
# for i in set(km.labels_):
# #     print(i)
#     current_cluster_vals = [(raw_df.iloc[x]['msg'], raw_df.iloc[x]['buggy']) for x in np.where(km.labels_ == i)[0]]
#     cluster_assignments_dict[i] = current_cluster_vals

# cluster_pick = np.random.choice(len(set(km.labels_)))
# print('Cluster {0}'.format(cluster_pick))
# cluster_assignments_dict[cluster_pick]