In [63]:
import math
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# import matplotlib.pyplot as plt
import os
import glob
from scipy.sparse import hstack
from scipy.sparse import vstack

In [2]:
all_files = []
# projects = ['abinit', 'libmesh', 'lammps', 'mdanalysis']
projects = ['libmesh']
for sp in projects:
    s = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/{}/".format(sp)
    path = s
    files = glob.glob(os.path.join(path, "*.csv"))
    all_files.extend(files)

In [3]:
print(all_files)

['/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/libmesh/libmesh_v0.3.2_commits_clean.csv', '/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/libmesh/libmesh_v0.6.3_commits_clean.csv', '/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/libmesh/libmesh_v0.9.0_commits_clean.csv', '/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/libmesh/libmesh_v1.3.0-rc2_commits_clean.csv', '/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/libmesh/libmesh_v0.3.3_commits_clean.csv', '/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/libmesh/libmesh_v0.9.2-final_commits_clean.csv', '/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/libmesh/libmesh_v0.9.4-rc2_commits_clean.csv', '/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/libmesh/libmesh_revert_projection4_commits_clean.csv', '/Users/saurabh/workspace/fss

In [4]:
# fname = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/auto/abinit_concat.csv"
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [5]:
# raw_df = pd.read_csv(fname, index_col=0)

In [6]:
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)
raw_df = concatenated_df.drop_duplicates()

In [7]:
buggies = raw_df.groupby("buggy")["hash"].count()
print(buggies)

buggy
0.0    7913
1.0     835
Name: hash, dtype: int64


In [8]:
# remove_n = 1000
# drop_indices = np.random.choice(raw_df.index, remove_n, replace=False)
# new_df = raw_df.drop(drop_indices)
# b = new_df.groupby("buggy")["hash"].count()
# print(b)

In [9]:
print(raw_df.dtypes)

hash        object
time        object
message     object
buggy      float64
dtype: object


In [10]:
# inft = []
# for el in y:
#     if not np.isfinite(el):
#         inft.append(el)
# print(el)
print(raw_df.shape)
raw_df = raw_df.dropna()
print(raw_df.shape)

(8748, 4)
(8745, 4)


In [11]:
y_raw = raw_df['buggy']

In [12]:
lemmatizer = WordNetLemmatizer()
stopset = set(stopwords.words('english'))

In [13]:
raw_df['tknz_msg'] = raw_df['message'].apply(wordpunct_tokenize).apply(lambda tkns: [lemmatizer.lemmatize(w.lower()) for w in tkns])
print(raw_df['tknz_msg'].head(5))

0                    [a, number, of, minor, change, .]
1                              [cv, ignores, for, sgi]
2    [this, commit, wa, manufactured, by, cvs2svn, ...
3    [can, now, write, a, file, in, parallel, --, t...
4              [updated, compatible, to, new, doxygen]
Name: tknz_msg, dtype: object


In [14]:
raw_df['msg'] = raw_df['tknz_msg']\
    .apply(lambda tkns: \
           list(filter(\
                       lambda word: word not in stopset \
                       and word not in string.punctuation\
                       and len(word) > 2\
                       , tkns)))
print(raw_df['msg'].head(5))

0                              [number, minor, change]
1                                       [ignores, sgi]
2    [commit, manufactured, cvs2svn, create, tag, l...
3    [write, file, parallel, format, identical, exi...
4                  [updated, compatible, new, doxygen]
Name: msg, dtype: object


In [15]:
# words = raw_df['msg'].apply(pd.Series).stack().drop_duplicates().tolist()

In [16]:
# print(len(words))
# print(words[:50])

In [17]:
raw_df['msg_str'] = raw_df['msg'].apply(lambda tkns: ' '.join(tkns))

In [18]:
# no_features = min(1000, len(words))
no_features = 1000

tfidf_vectorizer = TfidfVectorizer(max_features=no_features)
X_tfidf = tfidf_vectorizer.fit_transform(raw_df['msg_str'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

tf_vectorizer = CountVectorizer(max_features=no_features)
X_tf = tf_vectorizer.fit_transform(raw_df['msg_str'])
tf_feature_names = tf_vectorizer.get_feature_names()

In [19]:
print("tf-idf vectorized: ", X_tfidf.shape)
print("tf vectorized: ", X_tf.shape)

tf-idf vectorized:  (8745, 1000)
tf vectorized:  (8745, 1000)


In [20]:
no_topics = 100
num_iter = 10

In [21]:
# nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(X_tfidf)

In [22]:
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=num_iter, learning_method='online', learning_offset=50.,random_state=9, evaluate_every=100).fit(X_tf)

In [23]:
lda_x = lda.transform(X_tf)
print(lda_x.shape)

(8745, 100)


In [24]:
key_words = ["bug", "fix", "wrong", "error", "fail", "problem", "patch"]

In [25]:
topic_indices = []

In [26]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
#         print("Topic %d:" % (topic_idx))
        top_features = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
#         print(" ".join(top_features))
        if any(bug_word in top_features for bug_word in key_words):
            print("Topic %d:" % (topic_idx))
            print(" ".join(top_features))
            topic_indices.append(topic_idx)

In [27]:
no_top_words = 10
# display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)
print(topic_indices)

Topic 22:
fix disable option configure typo amr vtk command dof_id_type fparser
Topic 33:
documentation updated fix mpi disable minor reason really outdated several
Topic 49:
function code one problem matrix small initial script solve shape
Topic 58:
file error include header output patch ignore exodus print svn
Topic 63:
fix api change include time every functor paste moving correctly
Topic 69:
file failure boyce testing unsteadysolver hilbert send reading bug meshcommunication
Topic 74:
fixed bug assert switch statement longer eigen thread memory boundary_ids
Topic 90:
use fix complex std regression lagrange iterator access around flagging
Topic 98:
hex8 directly tensor interpolation reset assert reduced_basis able wrong short
[22, 33, 49, 58, 63, 69, 74, 90, 98]


In [28]:
model_top_map = {}
def get_topic_top_words(model, feature_names):
    if str(model) in model_top_map:
        return model_top_map[str(model)]
    topic_top_words = []
    for topic_idx, topic in enumerate(model.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
            topic_top_words.append(top_words)
    model_top_map[str(model)] = topic_top_words
    return topic_top_words

In [29]:
def get_top_topics(words, no_top_words, model, feature_names):
    topic_ranks = []
    topic_top_words = get_topic_top_words(model, feature_names)
    for top_words in topic_top_words:
        topic_freq = 0
        for w in words:
            if w in top_words:
                topic_freq += 1
        topic_ranks.append(topic_freq)
    buggy_topic = 0
    max_val = max(topic_ranks)
    idx = topic_ranks.index(max_val)
    if idx in topic_indices:
        buggy_topic = 1
    return max_val, idx, buggy_topic

In [30]:
raw_df['topic_freq'], raw_df['topic_id'], raw_df['buggy_topic'] = zip(*raw_df['msg'].apply(lambda tkns: get_top_topics(tkns, 20, lda, tf_feature_names)))

In [31]:
# print(raw_df.head(3))

In [32]:
tops_labels = raw_df.groupby(['topic_id','buggy']).size()
for i, v in tops_labels.items():
    if i[0] in topic_indices:
        print('index: ', i, 'value: ', v)

index:  (22, 0.0) value:  373
index:  (22, 1.0) value:  270
index:  (33, 0.0) value:  111
index:  (33, 1.0) value:  20
index:  (49, 0.0) value:  82
index:  (49, 1.0) value:  7
index:  (58, 0.0) value:  194
index:  (58, 1.0) value:  16
index:  (63, 0.0) value:  57
index:  (63, 1.0) value:  10
index:  (69, 0.0) value:  11
index:  (69, 1.0) value:  1
index:  (74, 0.0) value:  86
index:  (74, 1.0) value:  47
index:  (90, 0.0) value:  36
index:  (90, 1.0) value:  17
index:  (98, 0.0) value:  3


In [33]:
test_df = pd.DataFrame()
final_df = raw_df[raw_df['buggy'] == 1]
if len(topic_indices) > 0:
    for idx in topic_indices:
        final_df = final_df.append(raw_df[raw_df['topic_id'] == idx])
    test_df = raw_df[(raw_df['hash'].apply(lambda x: x not in final_df['hash'].values))]
else:
    final_df = raw_df
# final_df = raw_df
use_all_as_test = False
if not test_df.empty and test_df.size > 0:
    use_all_as_test = True

In [34]:
print(raw_df.shape)
print(final_df.shape)
if use_all_as_test:
    print(test_df.shape)

(8745, 10)
(2176, 10)
(6911, 10)


In [35]:
# final_X_tf = tf_vectorizer.transform(final_df['msg_str'])
# X = lda.transform(final_X_tf)
X = tf_vectorizer.transform(final_df['msg_str'])
y = final_df['buggy']
if use_all_as_test:
#     test_X_tf = tf_vectorizer.transform(test_df['msg_str'])
#     test_X = lda.transform(test_X_tf)
    test_X = tf_vectorizer.transform(test_df['msg_str'])
    test_y = test_df['buggy']

In [36]:
print(X.shape)
print(y.shape)
# X = hstack((X,np.array(final_df['topic_id'])[:,None]))
# X = hstack((X,np.array(final_df['topic_freq'])[:,None]))
X = hstack((X,np.array(final_df['buggy_topic'])[:,None]))
print(X.shape)
if use_all_as_test:
#     test_X = hstack((test_X,np.array(test_df['topic_id'])[:,None]))
#     test_X = hstack((test_X,np.array(test_df['topic_freq'])[:,None]))
    test_X = hstack((test_X,np.array(test_df['buggy_topic'])[:,None]))
    print(test_X.shape)

(2176, 1000)
(2176,)
(2176, 1001)
(6911, 1001)


In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9, stratify=y)

In [38]:
print("# positive in train set: {}".format(len(y_train[y_train == 1])),
      "\n# negative in train set: {}".format(len(y_train[y_train == 0])))
print("# positive in test set: {}".format(len(y_test[y_test == 1])),
      "\n# negative in test set: {}".format(len(y_test[y_test == 0])))

# positive in train set: 917 
# negative in train set: 715
# positive in test set: 306 
# negative in test set: 238


In [39]:
# clf = MultinomialNB()
# clf = svm.LinearSVC(C=100, loss='hinge', random_state=9, max_iter=500000)
clf = svm.SVC(C=100, kernel='linear', random_state=9)
# clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=9,
  shrinking=True, tol=0.001, verbose=False)

In [40]:
clf.n_support_

array([254, 294], dtype=int32)

In [41]:
if use_all_as_test:
    mod_X_test = vstack((test_X, X_test)).todense()
    print(mod_X_test.shape)
    pred_y = clf.predict(mod_X_test)
#     pred_y = clf.predict(X_test)
else:
    pred_y = clf.predict(X_test)

(7455, 1001)


In [42]:
p_pred_y = clf.predict(X_test)
# print(p_pred_y.shape)
if use_all_as_test:
    p_pred_y = np.append(p_pred_y, clf.predict(test_X))
# print(p_pred_y.shape)

In [43]:
# tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
if use_all_as_test:
    mod_y_test = np.append(test_y, y_test)
    print(mod_y_test.shape)
#     mod_y_test = y_test
else:
    mod_y_test = y_test
tn, fp, fn, tp = confusion_matrix(mod_y_test, p_pred_y).ravel()

acc = (tp + tn) / (tn + fp + fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * prec * rec / (prec + rec)

(7455,)


In [44]:
print("accuracy", acc)
print("precision", prec)
print("recall", rec)
print("f1", f1)
print(tn, fp, fn, tp)

accuracy 0.13427230046948357
precision 0.04459259259259259
recall 0.9836601307189542
f1 0.08531746031746032
700 6449 5 301


In [45]:
libmesh_fastread_fname = "/Users/saurabh/workspace/fss/project/data/data-collection/labeled_commits/fastread/abinit_fast_labeled.csv"
libmesh_df = pd.read_csv(libmesh_fastread_fname)

In [46]:
libmesh_df.columns = ["hash", "abstract", 'year', "lnk", "label", "code", "time"]
codes = libmesh_df.groupby("code")["hash"].count()
print(codes)

code
no               445
undetermined    3791
yes              676
Name: hash, dtype: int64


In [82]:
raw_df['hash'] = raw_df['hash'].astype(str)
libmesh_df['hash'] = libmesh_df['hash'].astype(str)
print(libmesh_df.shape)
print(raw_df.shape)
print(libmesh_df.dtypes)
print(raw_df.dtypes)

(4912, 7)
(8745, 10)
hash         object
abstract     object
year          int64
lnk         float64
label       float64
code         object
time        float64
dtype: object
hash            object
time            object
message         object
buggy          float64
tknz_msg        object
msg             object
msg_str         object
topic_freq       int64
topic_id         int64
buggy_topic      int64
dtype: object


In [83]:
libmesh_df_merged = pd.merge(raw_df, libmesh_df, how='inner', on=['hash'], suffixes=("_raw", "_libmesh"))
print(libmesh_df_merged[['hash', 'msg_str', 'buggy_topic', 'buggy', 'code']].head(10))
print(libmesh_df_merged.shape)
libmesh_df_merged_d = libmesh_df_merged[['hash', 'msg_str', 'buggy_topic', 'buggy', 'code']]
print(libmesh_df_merged_d.head(10))
print(libmesh_df_merged_d.shape)

  hash                                            msg_str  buggy_topic  buggy          code
0  0.0  fixing another similar hermite projection erro...            0    1.0  undetermined
1  0.0                       oops forgot define petsc_lib            0    0.0  undetermined
2  0.0      updating configure use silent rule color test            1    0.0  undetermined
3  0.0  reworked inffe inverse_map work infhex8 infpri...            0    0.0  undetermined
4  0.0  merge pull request 1158 roystgnr adaptivity_ex...            0    0.0  undetermined
(5, 16)
  hash                                            msg_str  buggy_topic  buggy          code
0  0.0  fixing another similar hermite projection erro...            0    1.0  undetermined
1  0.0                       oops forgot define petsc_lib            0    0.0  undetermined
2  0.0      updating configure use silent rule color test            1    0.0  undetermined
3  0.0  reworked inffe inverse_map work infhex8 infpri...            0  

In [77]:
X_libmesh = tf_vectorizer.transform(libmesh_df_merged_d['msg_str'])
X_libmesh = hstack((X_libmesh,np.array(libmesh_df_merged_d['buggy_topic'])[:,None]))
print(X_libmesh.shape)
libmesh_df_merged_d['pred_code'] = clf.predict(X_libmesh)
print(libmesh_df_merged_d.shape)

(5, 1001)
(5, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [50]:
y_df = libmesh_df_merged_d[['buggy', 'code', 'pred_code']]
print(y_df.shape)

(5, 3)


In [51]:
y_df = y_df[y_df['code'] != 'undetermined']
print(y_df.shape)

(0, 3)


In [52]:
# model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
# model_knn.fit(X)

In [53]:
# k = 10
# km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=5, verbose=1)
# km.fit(X)
# %matplotlib inline

# plt.hist(km.labels_, bins=k)
# plt.show()
# plt.close()

# cluster_assignments_dict = {}
# # print(np.where(km.labels_ == i))
# # print(raw_df.iloc[79]['msg'])
# for i in set(km.labels_):
# #     print(i)
#     current_cluster_vals = [(raw_df.iloc[x]['msg'], raw_df.iloc[x]['buggy']) for x in np.where(km.labels_ == i)[0]]
#     cluster_assignments_dict[i] = current_cluster_vals

# cluster_pick = np.random.choice(len(set(km.labels_)))
# print('Cluster {0}'.format(cluster_pick))
# cluster_assignments_dict[cluster_pick]