In [1]:
# Set IPython's Completer.use_jedi option to False (tab completion without Jedi).
%config Completer.use_jedi = False

In [29]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier

# Apache

In [7]:
# Load the Apache training set: stack the train and validation CSVs, keep only
# the commit id and label columns, and store the label as int.
# The frame is produced by one select -> assign chain instead of assigning into
# a column subset, which is the pattern that triggers SettingWithCopyWarning.
train_set = (
    pd.concat([
        pd.read_csv('../data/apache_train_50_all.csv'),
        pd.read_csv('../data/apache_valid_50_all.csv'),
    ])
    .loc[:, ['commit_id', 'buggy']]
    .assign(buggy=lambda frame: frame['buggy'].astype(int))
)
train_set

Unnamed: 0,commit_id,buggy
0,06a456ec1670fb26a86f8a18f5b55017ac961f39,1
1,764ab7a732a72c0585966fcbb62c2b30466738c9,0
2,fba894477f251afdf61f71af08d525e42718afee,0
3,24a4e257ab191187b875800959dd532950d5edbd,0
4,3144c0e9c7b76f1df693f16e982b1da451c42c2c,0
...,...,...
5346,d84b692eb1cc0fbc55f047ec61c6a1c2cabe8b46,0
5347,1284015fa1da5ac72cc2dd71b8266f37a50b65f4,0
5348,14394eed00afee0dd8e3836ed2a65cd7a8077398,0
5349,13e483adeee8d968397a21bde3bb159516f26ff0,1


In [9]:
# Load the Apache test set, keeping only the commit id and the label (as int).
# Built as a single select -> assign chain so that no value is assigned into a
# column subset of another frame (avoids SettingWithCopyWarning).
test_set = (
    pd.read_csv('../data/apache_test.csv')
    .loc[:, ['commit_id', 'buggy']]
    .assign(buggy=lambda frame: frame['buggy'].astype(int))
)
test_set

Unnamed: 0,commit_id,buggy
0,b0f422c3861a5a3831e481b8ffac08f6fa085d00,1
1,593dffecd7bad47739a7cbd319d14bb3e911038a,0
2,c561cb316e365ef674784cd6cf0b12c0fbc271a3,1
3,4ebc23ba7b16c7b9acf38b5a864682a6c8890690,0
4,dde018dc7c9fa0e4257d2aa09d2d6d306bc17ccd,0
...,...,...
7535,66ac59f285d1bc8d8b633a0dea318af92734d689,0
7536,10d0e4be6eade7c1685b9c6962bc9b18e33122a8,0
7537,818569f6d0bb282eed58c14c9041670ced3905ad,1
7538,303a7f8a39349e518e0050860a2656f6e58ee704,0


In [11]:
# Read the commit ids to exclude: the first whitespace-separated token of every
# line in skipped.txt.
with open('skipped.txt') as fp:
    content = fp.readlines()
# A blank line has no first token and would raise IndexError on split()[0],
# so such lines are ignored.
skipped = [line.split()[0] for line in content if line.strip()]

# Membership is tested against a set (constant time per lookup) instead of
# scanning the list once per commit; `skipped` itself remains a list.
skipped_lookup = set(skipped)

# Test-set commit ids that are not in the skip list, in their original order.
final_keys = [c for c in test_set['commit_id'].tolist() if c not in skipped_lookup]

In [12]:
# Restrict the test set to the rows whose commit id is listed in final_keys.
is_kept = test_set['commit_id'].isin(final_keys)
test_set = test_set[is_kept]
test_set

Unnamed: 0,commit_id,buggy
0,b0f422c3861a5a3831e481b8ffac08f6fa085d00,1
1,593dffecd7bad47739a7cbd319d14bb3e911038a,0
2,c561cb316e365ef674784cd6cf0b12c0fbc271a3,1
3,4ebc23ba7b16c7b9acf38b5a864682a6c8890690,0
4,dde018dc7c9fa0e4257d2aa09d2d6d306bc17ccd,0
...,...,...
7535,66ac59f285d1bc8d8b633a0dea318af92734d689,0
7536,10d0e4be6eade7c1685b9c6962bc9b18e33122a8,0
7537,818569f6d0bb282eed58c14c9041670ced3905ad,1
7538,303a7f8a39349e518e0050860a2656f6e58ee704,0


In [25]:
# Random baseline for Apache. DummyClassifier never looks at the feature
# values, so X is only a placeholder with one row per commit.
# The "stratified" strategy draws predictions at random, so the seed is fixed
# to make the reported metrics reproducible across runs.
RANDOM_STATE = 42

# Placeholder features shaped (n_samples, n_features), the documented layout of X.
X = np.ones((len(train_set), 1), dtype=int)
y = np.array(train_set['buggy'])
dummy_clf = DummyClassifier(strategy="stratified", random_state=RANDOM_STATE)
dummy_clf.fit(X, y)

X_test = np.ones((len(test_set), 1), dtype=int)
y_test = np.array(test_set['buggy'])
# Column 1 of predict_proba is taken as the score of the positive class
# (assumes the training labels are exactly {0, 1} -- TODO confirm).
y_scores = dummy_clf.predict_proba(X_test)[:, 1]
y_prd = dummy_clf.predict(X_test)

In [27]:
# Score the Apache baseline: ROC AUC from the scores, and precision / recall /
# F1 (average='binary') from the hard predictions.
auc = roc_auc_score(y_true=y_test, y_score=y_scores)
p, r, f1, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_prd, average='binary'
)

# One print call, one metric per line.
print(
    'auc: {}'.format(auc),
    'f1: {}'.format(f1),
    'precision: {}'.format(p),
    'recall: {}'.format(r),
    sep='\n',
)

auc: 0.49434134977361044
f1: 0.27558756633813497
precision: 0.1891751236013531
recall: 0.5073272854152129


# OpenStack

In [31]:
# Load the OpenStack training set: commit ids come from the train and
# validation CSVs, the buggy label for each id is looked up in a JSON mapping.
openstack_train_csvs = ['../data/openstack_train.csv', '../data/openstack_valid.csv']
train_set = pd.concat([pd.read_csv(csv_path) for csv_path in openstack_train_csvs])
with open('../data/openstack_labels.json') as fp:
    labels = json.load(fp)
commits = train_set['commit_id'].tolist()
# Rebuild the frame from scratch with just the two columns used below.
train_set = pd.DataFrame({
    'commit_id': commits,
    'buggy': [labels[commit] for commit in commits],
})
train_set

Unnamed: 0,commit_id,buggy
0,19d65cdfd526d7ed5325f40f8dad31b1a04b4c47,0
1,ec75cc2a6bc6e3f367b3377515cb9c4b99a6ab29,0
2,3c9ed23f7939b1ace78b39b53fc4e89d759f6fa5,1
3,1b86380e2e3b2990637d3219ed85da74d70f6371,0
4,87e6697906d577690404815a6cf98ef922952b90,0
...,...,...
9159,0c1fb6a1390b32b1c02c25af5301a0a7631aa52c,0
9160,060148386c1aac20ffe60eebdabcd1607819e61e,0
9161,7607e3da884551e5c4cfe49de390afbddbb2a9b3,0
9162,2a0c679b00ecf8d047a4372bc38f1b2662348f48,0


In [33]:
# load test set
test_set = pd.read_csv('../data/openstack_test.csv')
commits = test_set['commit_id'].tolist()
l = [labels[c] for c in commits]
test_set = pd.DataFrame({'commit_id': commits, 'buggy': l})
test_set

Unnamed: 0,commit_id,buggy
0,844a228c3c42dac802a82510eb17fcfaccb0e1f8,0
1,37627645e47d35643454a84ae4a8c682ee0b18eb,0
2,e1165ce1180bba1a11098cfacfe3c722a084dbcf,0
3,1b9961c4b6785d85e98223f576bd839c0212b3a7,0
4,f184a386271b8305023c90186244a370727b2535,0
...,...,...
1027,1901719542fed0e30cb27efce8935f0257090a7d,0
1028,4a24d140654db9ad769c971b3b879b462dc47616,0
1029,88e03fdd6326abcb3a183adb53118ded69a13213,0
1030,82858a1eeda1618a9b2d08bbb3aae4965dd4ab70,0


In [34]:
# Random baseline for OpenStack. DummyClassifier never looks at the feature
# values, so X is only a placeholder with one row per commit.
# The "stratified" strategy draws predictions at random, so the seed is fixed
# to make the reported metrics reproducible across runs.
RANDOM_STATE = 42

# Placeholder features shaped (n_samples, n_features), the documented layout of X.
X = np.ones((len(train_set), 1), dtype=int)
y = np.array(train_set['buggy'])
dummy_clf = DummyClassifier(strategy="stratified", random_state=RANDOM_STATE)
dummy_clf.fit(X, y)

X_test = np.ones((len(test_set), 1), dtype=int)
y_test = np.array(test_set['buggy'])
# Column 1 of predict_proba is taken as the score of the positive class
# (assumes the training labels are exactly {0, 1} -- TODO confirm).
y_scores = dummy_clf.predict_proba(X_test)[:, 1]
y_prd = dummy_clf.predict(X_test)

In [35]:
# Score the OpenStack baseline: ROC AUC from the scores, and precision /
# recall / F1 (average='binary') from the hard predictions.
auc = roc_auc_score(y_true=y_test, y_score=y_scores)
p, r, f1, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_prd, average='binary'
)

# One print call, one metric per line.
print(
    'auc: {}'.format(auc),
    'f1: {}'.format(f1),
    'precision: {}'.format(p),
    'recall: {}'.format(r),
    sep='\n',
)

auc: 0.4967735070344388
f1: 0.16199376947040495
precision: 0.1511627906976744
recall: 0.174496644295302
