In [1]:
import pandas as pd
import numpy as np
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, BatchNormalization
from keras.models import Sequential
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

Using TensorFlow backend.


In [2]:
# Root folder of the issue-close-time data. Files are later opened as
# f"{path}{time}/{data}.csv", so the trailing slash is required.
path = "../Dodge/data/issue_close_time/"

In [3]:
# Time-bucket sub-directory names. The position of a name in this list is
# later used as its integer class label (via dirs.index), so the order matters.
dirs = ["1 day", "7 days", "30 days", "90 days", "180 days", "365 days"]

In [4]:
# Project names; each one is the stem of a "<name>.csv" file that is looked up
# inside every time-bucket directory.
datasets = ["camel", "cocoon", "hive", "hadoop", "cloudstack", "deeplearning", "node", "ofbiz", "qpid"]

In [5]:
# from https://stackoverflow.com/questions/30564015/how-to-generate-random-points-in-a-circular-distribution
def fuzz_data(X, y, radii=(0., .3, .03), classes=(0, 1, 2, 3, 4, 5)):
    """Oversample ``X``/``y`` by appending shifted copies of every row.

    For each class ``c`` in ``classes`` and each radius ``r`` in
    ``np.arange(*radii)`` (position ``i``), every row of that class is added
    ``int((1 / frac) / 2**i)`` times as ``row - r`` and as ``row + r``, where
    ``frac`` is the share of class ``c`` among all counted samples. Rarer
    classes therefore receive more copies, and larger radii fewer.

    Parameters
    ----------
    X : array-like
        Samples, indexed along axis 0.
    y : array-like
        One label per row of ``X``.
    radii : tuple
        ``(start, stop, step)`` forwarded to ``np.arange``.
    classes : iterable
        Labels to oversample. Labels with no samples are skipped.

    Returns
    -------
    tuple of np.ndarray
        ``(X_out, y_out)``: the original data followed by the generated rows.
        When nothing is generated the original rows are returned on their own.

    Notes
    -----
    Prints the per-class counts, their least common multiple and the shapes of
    the original and generated arrays.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    classes = list(classes)

    counts = [len(np.where(y == label)[0]) for label in classes]
    total = sum(counts)

    # Diagnostic output only: the lcm plays no part in the oversampling.
    lcm = np.lcm.reduce(counts) if counts else 0
    print("counts =", counts)
    print("lcm =", lcm)

    offsets = np.arange(*radii)
    fuzzed_x = []
    fuzzed_y = []

    for count, label in zip(counts, classes):
        if count == 0:
            # No rows to copy; skipping also avoids dividing by a zero share.
            continue
        frac = count / total
        # The number of copies depends only on the class and the radius index,
        # so it is computed once per class rather than once per row.
        copies = [int((1. / frac) / pow(2., i)) for i in range(len(offsets))]
        for row in X[np.where(y == label)[0]]:
            for r, reps in zip(offsets, copies):
                fuzzed_x.extend([row - r, row + r] * reps)
                fuzzed_y.extend([label] * (2 * reps))

    if fuzzed_x:
        fuzzed_x = np.array(fuzzed_x)
        fuzzed_y = np.array(fuzzed_y)
    else:
        # Keep the trailing dimensions of X so the concatenation below is valid
        # even when no row was generated.
        fuzzed_x = np.empty((0,) + X.shape[1:])
        fuzzed_y = np.empty((0,), dtype=y.dtype)

    print(X.shape, fuzzed_x.shape, y.shape, fuzzed_y.shape)
    return np.concatenate((X, fuzzed_x), axis=0), np.concatenate((y, fuzzed_y))

In [6]:
# Reference frame; the next cell only uses its column names (`_.columns`) to
# seed the merged train/test frames.
# NOTE(review): `_` is also the name IPython uses for the last cell output, so a
# descriptive name would be less fragile. The next cell would need the same rename.
_ = pd.read_csv(f"{path}1 day/camel.csv")

In [7]:
# For every project, merge the per-bucket CSVs into one train file and one test
# file. Within each bucket only rows whose "timeOpen" value is truthy are kept,
# and their "timeOpen" is replaced by the bucket name.
#
# Frames are collected in lists and concatenated once per project: growing a
# DataFrame row by row with DataFrame.append is quadratic and the method no
# longer exists in pandas 2.x.
# NOTE(review): train_test_split is called without random_state, so the files
# differ from run to run.
def _merge_parts(parts, ref_columns):
    """Stack `parts` into one frame whose leading columns follow `ref_columns`."""
    if not parts:
        return pd.DataFrame(columns=ref_columns)
    stacked = pd.concat(parts, sort=False)
    extra = [col for col in stacked.columns if col not in ref_columns]
    return stacked.reindex(columns=list(ref_columns) + extra)


for data in datasets:
    train_parts = []
    test_parts = []
    for time in dirs:
        df = pd.read_csv(f"{path}{time}/{data}.csv")
        train_df, test_df = train_test_split(df, test_size=.3)

        for split_df, parts in ((train_df, train_parts), (test_df, test_parts)):
            # Same filter as `if row["timeOpen"]:` applied to every row at once;
            # .copy() keeps the relabelling from touching the split itself.
            kept = split_df[split_df["timeOpen"].astype(bool)].copy()
            kept["timeOpen"] = time
            parts.append(kept)

    write_train = _merge_parts(train_parts, list(_.columns))
    write_test = _merge_parts(test_parts, list(_.columns))

    write_train.to_csv("./data/" + data + "_train.csv")
    write_test.to_csv("./data/" + data + "_test.csv")

In [6]:
# Load the camel data set from the current working directory.
# NOTE(review): the merge cell above writes "./data/camel_train.csv" and
# "./data/camel_test.csv"; where this "camel.csv" comes from is not shown — confirm.
df = pd.read_csv("camel.csv")

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,issueCleanedBodyLen,meanCommentSizeT,nActorsT,nCommentsT,nCommitsByActorsT,nCommitsByCreator,nCommitsByUniqueActorsT,nCommitsInProject,nCommitsProjectT,nIssuesByCreator,nIssuesByCreatorClosed,nIssuesCreatedInProject,nIssuesCreatedInProjectClosed,nIssuesCreatedProjectClosedT,nIssuesCreatedProjectT,nLabelsT,nSubscribedByT,timeOpen
0,2,11,14,3,1,8113,0,255,452,8223,1,0,258,34,1006,3694,0,0,1 day
1,3,11,0,2,0,2307,0,113,789,2443,6,1,384,107,321,953,0,0,1 day
2,4,8,30,3,2,1338,0,80,1058,1506,1,0,434,171,164,572,0,2,1 day
3,5,33,19,2,2,3256,0,150,972,3429,1,0,377,81,450,1406,0,0,1 day
4,6,6,8,2,2,814,1,62,1192,1023,3,1,493,179,89,359,0,0,1 day


In [8]:
# Drop the first column of the frame.
# Not idempotent: `df` is rebound to a slice of itself, so every re-run of this
# cell removes one more leading column.
df = df[df.columns[1:]]

In [9]:
df.head()

Unnamed: 0,issueCleanedBodyLen,meanCommentSizeT,nActorsT,nCommentsT,nCommitsByActorsT,nCommitsByCreator,nCommitsByUniqueActorsT,nCommitsInProject,nCommitsProjectT,nIssuesByCreator,nIssuesByCreatorClosed,nIssuesCreatedInProject,nIssuesCreatedInProjectClosed,nIssuesCreatedProjectClosedT,nIssuesCreatedProjectT,nLabelsT,nSubscribedByT,timeOpen
0,11,14,3,1,8113,0,255,452,8223,1,0,258,34,1006,3694,0,0,1 day
1,11,0,2,0,2307,0,113,789,2443,6,1,384,107,321,953,0,0,1 day
2,8,30,3,2,1338,0,80,1058,1506,1,0,434,171,164,572,0,2,1 day
3,33,19,2,2,3256,0,150,972,3429,1,0,377,81,450,1406,0,0,1 day
4,6,8,2,2,814,1,62,1192,1023,3,1,493,179,89,359,0,0,1 day


In [10]:
# Separate the label column ("timeOpen") from the remaining feature columns.
y = df["timeOpen"]
x = df.drop(columns="timeOpen")

In [11]:
x.shape

(4310, 17)

In [12]:
# Convert the feature table to a plain NumPy array.
x = np.array(x)

In [13]:
# Encode every bucket name as its position in `dirs`
# ("1 day" -> 0, ..., "365 days" -> 5). A name missing from `dirs` raises ValueError.
y = np.array([dirs.index(t) for t in y])

In [14]:
y.shape

(4310,)

In [15]:
# Hold out 20% of the rows for testing.
# NOTE(review): neither random_state nor stratify is passed, so the split (and
# every figure computed from it) changes between runs — consider setting both.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [16]:
# Make sure all four splits are plain NumPy arrays.
x_train, x_test, y_train, y_test = (
    np.array(part) for part in (x_train, x_test, y_train, y_test)
)

In [17]:
# Oversample the training split only; the test split is left untouched.
# Re-running this cell fuzzes the already-augmented arrays again.
x_train, y_train = fuzz_data(x_train, y_train)

counts = [544, 347, 139, 242, 518, 1658]
lcm = 681683590361312
(3448, 17) (67180, 17) (3448,) (67180,)


In [18]:
# One-hot encode the integer labels into 6 columns, one per entry of `dirs`.
# Run once only: both names are rebound in place.
y_train = to_categorical(y_train, num_classes=6)
y_test = to_categorical(y_test, num_classes=6)

In [19]:
# Four Dense(17, relu) layers, each followed by batch normalisation, and a
# final 6-unit softmax layer.
model = Sequential(
    [
        layer
        for block_no in range(4)
        for layer in (Dense(17, activation='relu'), BatchNormalization())
    ]
    + [Dense(6, activation='softmax')]
)

In [20]:
# Categorical cross-entropy matches the one-hot labels and the softmax output.
# No metrics are requested in this call.
model.compile(optimizer='adadelta', loss='categorical_crossentropy')

In [21]:
# Train on the augmented training split; no validation data is passed.
model.fit(x_train, y_train, batch_size=128, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.callbacks.History at 0x7f8ca9cfb0d0>

In [22]:
# Predicted class labels for the held-out rows.
# NOTE(review): Sequential.predict_classes is not available in newer
# TensorFlow/Keras releases; np.argmax(model.predict(x_test), axis=1) is the
# usual replacement — confirm against the installed version.
preds = model.predict_classes(x_test)

In [23]:
# y_test was one-hot encoded above, so argmax recovers the integer labels.
print(classification_report(np.argmax(y_test, axis=1), preds))

              precision    recall  f1-score   support

           0       0.63      0.63      0.63       154
           1       0.27      0.32      0.29        90
           2       0.07      0.21      0.11        28
           3       0.27      0.52      0.35        58
           4       0.38      0.60      0.47       139
           5       0.93      0.44      0.60       393

    accuracy                           0.49       862
   macro avg       0.42      0.45      0.41       862
weighted avg       0.65      0.49      0.52       862



In [24]:
f1_score(np.argmax(y_test, axis=1), preds, average=None)

array([0.62783172, 0.29292929, 0.10909091, 0.35087719, 0.46629213,
       0.6       ])

In [25]:
def distill(x, y, model, n_samples=int(1e5)):
    """Fit a decision-tree student on synthetic points labelled by ``model``.

    Points are drawn uniformly inside the per-feature [min, max] box of ``x``,
    labelled with ``model.predict_classes`` and used to fit a
    ``DecisionTreeClassifier``.

    Parameters
    ----------
    x : array-like, 2-D
        Reference data; only its per-column minimum and maximum are used.
    y : any
        Unused; kept so existing calls stay valid.
    model : object
        Teacher exposing ``predict_classes(data)``.
    n_samples : int
        Number of synthetic points to draw.

    Returns
    -------
    The fitted student classifier.
    """
    x = np.asarray(x)
    low = np.amin(x, axis=0)
    high = np.amax(x, axis=0)

    # One column per feature (x.shape[1]). Sizing by x.shape[0], the number of
    # samples, makes the scaling below fail to broadcast.
    synthetic_data = np.random.uniform(size=(n_samples, x.shape[1])) * (high - low) + low
    preds = model.predict_classes(synthetic_data)

    student = DecisionTreeClassifier()
    student.fit(synthetic_data, preds)
    return student

In [None]:
# Fit the student on synthetic points spanning the range of the augmented
# training features. y_train (one-hot at this point) is passed but distill
# does not read it.
student = distill(x_train, y_train, model)

In [None]:
# Score the student on the same held-out rows as the network; argmax turns the
# one-hot y_test back into integer labels.
stu_preds = student.predict(x_test)
print(classification_report(np.argmax(y_test, axis=1), stu_preds))