In [1]:
#Scripts info:
#Input: File level dataset. Files: "Nova.csv","Ironic.csv", "Base.csv"
#Output: File level result. File: "csv_commented_fileLevel.csv"
#Description: This script generates the results for predicting which files should receive comments (RQ2)
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from scipy.sparse import hstack
from scipy import sparse

from lime.lime_tabular import LimeTabularExplainer
import time, pickle, math, warnings, os, operator
import matplotlib.pyplot as plt
import csv
import math

from imblearn.over_sampling import SMOTE
from scipy.optimize import differential_evolution
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier


# Raw file-level datasets, one numpy array per project.
# read_csv already defaults to a comma separator and inferred dtypes.
data_nova = pd.read_csv('./dataset/fileLevel/Nova.csv').to_numpy()
data_ironic = pd.read_csv('./dataset/fileLevel/Ironic.csv').to_numpy()
data_base = pd.read_csv('./dataset/fileLevel/Base.csv').to_numpy()

# Directory holding the pickled models used as a cache by the train* functions.
model_path = './dataset/ml-model'

In [2]:
#generate training/test dataset
def getDatasetFromRawData(project_source, data, bias):
    """Build train/test feature matrices and boolean labels from raw rows.

    Rows are split by position: everything before ``divider`` is training
    data, the rest is test data. The text column is turned into token counts
    with a CountVectorizer fitted on the training rows only, and three
    numeric columns (deletions, additions, changedLine -- names taken from
    the original locals) are appended as extra features.

    Parameters
    ----------
    project_source : str
        ``"qt"`` selects numeric columns 6-8; any other value selects 7-9.
    data : 2-D numpy array
        Column 0 is the label (0 means False, anything else True) and
        column 3 is the text passed to the vectorizer.
    bias : int
        Offset added to the 80% split index. It is hand-tuned by the caller
        so that files of one changeId do not end up on both sides of the
        split (per the original comment; not verified here).

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X, test_y, divider, vectorizer)`` where the
        X matrices are dense arrays and the y arrays are boolean.
    """
    raw_text = data[:, 3]
    # A label of exactly 0 is negative; everything else counts as positive.
    labels = np.array([not (value == 0) for value in data[:, 0]], dtype=bool)

    # The qt export has one column fewer before the numeric block.
    first_numeric_col = 6 if project_source == "qt" else 7
    # Columns in order: deletions, additions, changedLine.
    numeric_cols = data[:, first_numeric_col:first_numeric_col + 3]

    # 60% + 20% of the rows go to training; bias nudges the cut point.
    divider = int(len(data) * 0.6) + int(len(data) * 0.2) + bias

    data_count_vect = CountVectorizer(min_df=2, max_df=0.5)
    # Fit the vocabulary on training text only to avoid leaking test tokens.
    train_counts = data_count_vect.fit_transform(raw_text[:divider])
    test_counts = data_count_vect.transform(raw_text[divider:])

    # One stack per split instead of three chained copies of the dense matrix.
    final_train_X = np.hstack((train_counts.toarray(), numeric_cols[:divider]))
    final_test_X = np.hstack((test_counts.toarray(), numeric_cols[divider:]))

    return (final_train_X, labels[:divider],
            final_test_X, labels[divider:],
            divider, data_count_vect)

In [3]:
#show file level prediction result
def printResult(x, y, model):
    """Print the ROC AUC of ``model`` on features ``x`` against labels ``y``.

    The score passed to ``roc_auc_score`` is the second column of
    ``model.predict_proba(x)``.
    """
    positive_scores = model.predict_proba(x)[:, 1]
    auc_value = roc_auc_score(y, positive_scores)
    print("AUC:", auc_value)

In [4]:
#train random forest model
def trainRFmodel(project,rf_train_X,rf_train_y,rf_test_X,rf_test_y,seed, bias):
    """Train a random forest, or load it from the pickle cache, and print its test AUC.

    When no cached model exists, the training data is resampled with SMOTE,
    a 200-tree RandomForestClassifier is fitted on the resampled data and
    the fitted model is pickled under ``model_path``. Otherwise the pickled
    model is loaded and no training happens.

    Parameters
    ----------
    project : str
        Project name; used in the log line and the cache file name.
    rf_train_X, rf_train_y : training features and labels.
    rf_test_X, rf_test_y : test features and labels, used only for the AUC print.
    seed : int
        Random state for SMOTE and the forest; also part of the cache file name.
    bias : int
        Only used in the cache file name.

    Returns
    -------
    The fitted (or loaded) RandomForestClassifier.
    """
    print("RF:"+project)
    # NOTE(review): seed and bias are concatenated without a separator, so
    # e.g. (seed=1, bias=23) and (seed=12, bias=3) share a file. Kept as is
    # so that existing cache files stay valid.
    train_rf_model_path = model_path+'/smote_abstr_number_df_2_rf_'+project+'-'+str(seed)+str(bias)+'.pkl'
    if not os.path.exists(train_rf_model_path):
        # Resample the training split only; the test split stays untouched.
        rf_X, rf_y = SMOTE(k_neighbors=10, random_state=seed).fit_resample(rf_train_X, rf_train_y)
        rf = RandomForestClassifier(n_estimators=200,n_jobs=-1,random_state=seed)
        rf.fit(rf_X,rf_y)
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(train_rf_model_path, 'wb') as rf_output:
            pickle.dump(rf,rf_output)
        print("finish to creat a new rf model")
    else:
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that were produced by this notebook.
        with open(train_rf_model_path,'rb') as f:
            rf = pickle.load(f)
    printResult(rf_test_X,rf_test_y,rf)
    return rf

# TN FP
# FN TP

In [5]:
#train logistic regression model
def trainLGmodel(project,train_X,train_y,test_X,test_y,seed):
    """Train a logistic regression, or load it from the pickle cache, and print its test AUC.

    Parameters
    ----------
    project : str
        Project name; used in the log line and the cache file name.
    train_X, train_y : training features and labels.
    test_X, test_y : test features and labels, used only for the AUC print.
    seed : int
        Random state of the model; also part of the cache file name.

    Returns
    -------
    The fitted (or loaded) LogisticRegression model.
    """
    print("LG:"+project)
    # The cache key is (project, seed) only: if the training data changes,
    # delete the .pkl to force retraining.
    # NOTE(review): the file name says "smote" but the model is fitted on
    # the data exactly as passed in -- confirm whether resampling was intended.
    train_lg_model_path = model_path+'/smote_abstr_number_df_2_lg_'+project+'-'+str(seed)+'.pkl'
    if not os.path.exists(train_lg_model_path):
        lg = linear_model.LogisticRegression(penalty='l2', C=1, solver = 'newton-cg', random_state=seed)
        lg.fit(train_X,train_y)
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(train_lg_model_path, 'wb') as lg_output:
            pickle.dump(lg,lg_output)
        print("finish to creat a new lg model")
    else:
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that were produced by this notebook.
        with open(train_lg_model_path,'rb') as f:
            lg = pickle.load(f)
    printResult(test_X,test_y,lg)
    return lg

In [6]:
#NB
def trainNBmodel(project,train_X,train_y,test_X,test_y,seed):
    """Train a multinomial naive Bayes model, or load it from the pickle cache, and print its test AUC.

    Parameters
    ----------
    project : str
        Project name; used in the log line and the cache file name.
    train_X, train_y : training features and labels.
    test_X, test_y : test features and labels, used only for the AUC print.
    seed : int
        Only used in the cache file name; MultinomialNB is created with defaults.

    Returns
    -------
    The fitted (or loaded) MultinomialNB model.
    """
    print("NB:"+project)
    # The cache key is (project, seed) only: if the training data changes,
    # delete the .pkl to force retraining.
    # NOTE(review): the file name says "smote" but the model is fitted on
    # the data exactly as passed in -- confirm whether resampling was intended.
    train_nb_model_path = model_path+'/smote_abstr_number_df_2_nb_'+project+'-'+str(seed)+'.pkl'
    if not os.path.exists(train_nb_model_path):
        nb = MultinomialNB()
        nb.fit(train_X,train_y)
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(train_nb_model_path, 'wb') as nb_output:
            pickle.dump(nb,nb_output)
        print("finish to creat a new nb model")
    else:
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that were produced by this notebook.
        with open(train_nb_model_path,'rb') as f:
            nb = pickle.load(f)
    printResult(test_X,test_y,nb)
    return nb

In [7]:
#DT
def trainDTmodel(project,train_X,train_y,test_X,test_y,seed):
    """Train a decision tree, or load it from the pickle cache, and print its test AUC.

    Parameters
    ----------
    project : str
        Project name; used in the log line and the cache file name.
    train_X, train_y : training features and labels.
    test_X, test_y : test features and labels, used only for the AUC print.
    seed : int
        Random state of the tree; also part of the cache file name.

    Returns
    -------
    The fitted (or loaded) DecisionTreeClassifier.
    """
    print("DT:"+project)
    # The cache key is (project, seed) only: if the training data changes,
    # delete the .pkl to force retraining.
    # NOTE(review): the file name says "smote" but the model is fitted on
    # the data exactly as passed in -- confirm whether resampling was intended.
    train_dt_model_path = model_path+'/smote_abstr_number_df_2_dt_'+project+'-'+str(seed)+'.pkl'
    if not os.path.exists(train_dt_model_path):
        dt = DecisionTreeClassifier(random_state=seed)
        dt.fit(train_X,train_y)
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(train_dt_model_path, 'wb') as dt_output:
            pickle.dump(dt,dt_output)
        print("finish to creat a new dt model")
    else:
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that were produced by this notebook.
        with open(train_dt_model_path,'rb') as f:
            dt = pickle.load(f)
    printResult(test_X,test_y,dt)
    return dt

In [8]:
#KNN
def trainKnnmodel(project,train_X,train_y,test_X,test_y,seed):
    """Train a 3-nearest-neighbours classifier, or load it from the pickle cache, and print its test AUC.

    Parameters
    ----------
    project : str
        Project name; used in the log line and the cache file name.
    train_X, train_y : training features and labels.
    test_X, test_y : test features and labels, used only for the AUC print.
    seed : int
        Only used in the cache file name; the classifier takes no random state here.

    Returns
    -------
    The fitted (or loaded) KNeighborsClassifier.
    """
    print("KNN:"+project)
    # The cache key is (project, seed) only: if the training data changes,
    # delete the .pkl to force retraining.
    # NOTE(review): the file name says "smote" but the model is fitted on
    # the data exactly as passed in -- confirm whether resampling was intended.
    train_knn_model_path = model_path+'/smote_abstr_number_df_2_knn_'+project+'-'+str(seed)+'.pkl'
    if not os.path.exists(train_knn_model_path):
        knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
        knn.fit(train_X, train_y)
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(train_knn_model_path, 'wb') as knn_output:
            pickle.dump(knn,knn_output)
        print("finish to creat a new knn model")
    else:
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that were produced by this notebook.
        with open(train_knn_model_path,'rb') as f:
            knn = pickle.load(f)
    printResult(test_X,test_y,knn)
    return knn

In [9]:
#train random guessing model
def trainDMmodel(project,train_X,train_y,test_X,test_y,seed):
    """Train the random-guessing baseline, or load it from the pickle cache, and print its test AUC.

    The baseline is a DummyClassifier with ``strategy='stratified'``.

    Parameters
    ----------
    project : str
        Project name; used in the cache file name.
    train_X, train_y : training features and labels.
    test_X, test_y : test features and labels, used only for the AUC print.
    seed : int
        Random state of the dummy classifier; also printed and part of the
        cache file name.

    Returns
    -------
    The fitted (or loaded) DummyClassifier.
    """
    print("dummy seed:" + str(seed))
    # The cache key is (project, seed) only: if the training data changes,
    # delete the .pkl to force retraining.
    train_dm_model_path = model_path+'/RQ1_2_dm_'+project+'-'+str(seed)+'.pkl'
    if not os.path.exists(train_dm_model_path):
        dm = DummyClassifier(strategy='stratified',random_state=seed)
        dm.fit(train_X, train_y)
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(train_dm_model_path, 'wb') as dm_output:
            pickle.dump(dm,dm_output)
        print("finish to creat a new dm model")
    else:
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that were produced by this notebook.
        with open(train_dm_model_path,'rb') as f:
            dm = pickle.load(f)
    printResult(test_X,test_y,dm)
    return dm

In [10]:
def getResult(project_source,seed,projectName,data,bias):
    """Build the train/test split for one project and fit (or load) all six models.

    Returns ``(rf, lg, nb, dt, knn, dm, test_X, test_y)``. Only the random
    forest trainer receives ``bias``; the other trainers get the split and
    the seed.
    """
    # The split index and the fitted vectorizer are returned but not needed here.
    (train_X, train_y, test_X, test_y,
     _divider, _vectorizer) = getDatasetFromRawData(project_source, data, bias)

    rf = trainRFmodel(projectName, train_X, train_y, test_X, test_y, seed, bias)

    # Remaining trainers share one signature; run them in the original order.
    lg, nb, dt, knn, dm = [
        trainer(projectName, train_X, train_y, test_X, test_y, seed)
        for trainer in (trainLGmodel, trainNBmodel, trainDTmodel,
                        trainKnnmodel, trainDMmodel)
    ]
    return rf, lg, nb, dt, knn, dm, test_X, test_y


In [11]:
#generate csv for data visualization
def outputCsv():
    """Evaluate all models on all three projects and write the metrics to CSV.

    For each project this calls ``getResult`` (seed fixed at 2) to obtain
    the six models and the test split, then ``generateResult`` to append
    one row per technique and measure. The rows are written to
    ``./dataset/csv/csv_commented_fileLevel.csv`` with the columns
    Technique, Datasets, Measure and Value.

    Returns
    -------
    list of dict
        The rows that were written, in write order.
    """
    csv_path = './dataset/csv/csv_commented_fileLevel.csv'
    fieldnames = ['Technique','Datasets','Measure','Value']
    seed = 2
    # (project_source, project name, raw data, split bias) per project.
    projects = (
        ("openstack", "nova", data_nova, 5),
        ("openstack", "ironic", data_ironic, -4),
        ("qt", "base", data_base, 6),
    )

    # Make sure the target directory exists before the expensive training starts.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

    result = []
    for project_source, project_name, data, bias in projects:
        rf, lg, nb, dt, knn, dm, test_X, test_y = getResult(project_source, seed, project_name, data, bias)
        result = generateResult(result, project_name, rf, lg, nb, knn, dt, dm, test_X, test_y)

    # newline='' is what the csv module expects (avoids blank lines on Windows),
    # and the context manager guarantees the rows are flushed to disk.
    # No escapechar is passed: an empty string is rejected on Python 3.11+
    # and was treated as "no escape character" before, which is the default.
    with open(csv_path, "w", newline='') as csv_file:
        csv_writer = csv.DictWriter(csv_file, quoting=csv.QUOTE_NONE, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(result)
    return result

def generateResult(result, project,rf,lg,nb,knn,dt,dm,x,y):
    """Append the metric rows of all six models for one project to ``result``.

    Rows are added through ``generateResultList`` in the order RF, LG, NB,
    DT, KNN, Random Guessing; the updated list is returned.
    """
    labelled_models = (
        ("RF", rf),
        ("LG", lg),
        ("NB", nb),
        ("DT", dt),
        ("KNN", knn),
        ("Random Guessing", dm),
    )
    for technique, model in labelled_models:
        result = generateResultList(result, project, technique, model, x, y)
    return result

def generateResultList(result,project,name,model ,x, y):
    """Append nine metric rows for one model on one project to ``result``.

    The rows are, in order: AUC, Precision, Recall, F1 measure, MCC, and the
    four confusion-matrix cells (true negative, false positive, false
    negative, true positive). Each row is a dict with the keys Technique,
    Datasets, Measure and Value.

    Parameters
    ----------
    result : list
        List to append to; it is modified in place and also returned.
    project : str
        Value stored under ``Datasets``.
    name : str
        Value stored under ``Technique``.
    model : fitted classifier exposing ``predict`` and ``predict_proba``.
    x, y : test features and true labels.

    Returns
    -------
    list
        The same ``result`` list with nine rows appended.
    """
    # Predict once and reuse: the original called model.predict(x) five
    # times, which is costly and could let metrics disagree with each other
    # for a model whose predictions are not deterministic.
    predicted = model.predict(x)
    positive_scores = model.predict_proba(x)[:,1]

    # Unpacking assumes a 2x2 confusion matrix, i.e. two classes present.
    tn, fp, fn, tp = confusion_matrix(y, predicted).ravel()

    measures = (
        ('AUC', roc_auc_score(y, positive_scores)),
        ('Precision', precision_score(y, predicted)),
        ('Recall', recall_score(y, predicted)),
        ('F1 measure', f1_score(y, predicted)),
        ('MCC', matthews_corrcoef(y, predicted)),
        ('true negative', tn),
        ('false positive', fp),
        ('false negative', fn),
        ('true positive', tp),
    )
    for measure, value in measures:
        result.append({'Technique':name,'Datasets':project,'Measure':measure,'Value':value})
    return result

In [12]:
# Train or load the models for nova, ironic and base, write the metrics CSV,
# and display the returned list of result rows as the cell output.
outputCsv() 

RF:nova
AUC: 0.7809975830795732
LG:nova
AUC: 0.6131104707870035
NB:nova
finish to creat a new nb model
AUC: 0.5417187412175857
DT:nova
AUC: 0.6682772581343754
KNN:nova
finish to creat a new knn model
AUC: 0.5002401215403487
dummy seed:2
AUC: 0.4895660784722153
RF:ironic
AUC: 0.7819424006726929
LG:ironic
AUC: 0.5755413075467732
NB:ironic
finish to creat a new nb model
AUC: 0.7065745217574102
DT:ironic
AUC: 0.6172482657136852
KNN:ironic
finish to creat a new knn model
AUC: 0.588606264452386
dummy seed:2
AUC: 0.5018078620979609
RF:base
AUC: 0.7039883127065951
LG:base
AUC: 0.5387897073648538
NB:base
finish to creat a new nb model
AUC: 0.6671163796082469
DT:base
AUC: 0.5555525368147359
KNN:base
finish to creat a new knn model
AUC: 0.6021718251386435
dummy seed:2
AUC: 0.48613039530411034


[{'Technique': 'RF',
  'Datasets': 'nova',
  'Measure': 'AUC',
  'Value': 0.7809992362399198},
 {'Technique': 'RF',
  'Datasets': 'nova',
  'Measure': 'Precision',
  'Value': 0.5938864628820961},
 {'Technique': 'RF',
  'Datasets': 'nova',
  'Measure': 'Recall',
  'Value': 0.1878453038674033},
 {'Technique': 'RF',
  'Datasets': 'nova',
  'Measure': 'F1 measure',
  'Value': 0.2854144805876181},
 {'Technique': 'RF',
  'Datasets': 'nova',
  'Measure': 'MCC',
  'Value': 0.20644728147802732},
 {'Technique': 'RF',
  'Datasets': 'nova',
  'Measure': 'true negative',
  'Value': 1578},
 {'Technique': 'RF',
  'Datasets': 'nova',
  'Measure': 'false positive',
  'Value': 93},
 {'Technique': 'RF',
  'Datasets': 'nova',
  'Measure': 'false negative',
  'Value': 588},
 {'Technique': 'RF',
  'Datasets': 'nova',
  'Measure': 'true positive',
  'Value': 136},
 {'Technique': 'LG',
  'Datasets': 'nova',
  'Measure': 'AUC',
  'Value': 0.6131104707870035},
 {'Technique': 'LG',
  'Datasets': 'nova',
  'Measu