<a href="https://colab.research.google.com/github/zeynabChitsazian/sdp_oob/blob/main/sdp_oob.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook identifies the defective commits of individual projects using oversampling-based online bagging (OOB) over Hoeffding trees.

In [None]:
# Install SHAP; the pipeline cell uses shap.KernelExplainer for feature attributions.
pip install shap

In [None]:
import os
import sys
import copy
import math
import pickle
import numpy as np
import pandas as pd
import librosa
from joblib import dump
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from itertools import combinations
from sklearn.decomposition import PCA
import shap

In [None]:
#merging the datasets as daily
from datetime import datetime
def merging_datasets(outputPath, inputPath, dfs_path, start_year=2003, end_year=2020):
  """Merge every CSV in `inputPath` and split the commits into one frame per day slot.

  Args:
    outputPath: Output folder; created when it does not exist yet.
    inputPath: Folder holding the project CSV files. Every file needs an
      'author_date' column of Unix timestamps in seconds.
    dfs_path: File the resulting list is pickled to.
    start_year: First year of the day grid (inclusive).
    end_year: Year at which the day grid stops (exclusive).

  Returns:
    A list with one DataFrame per (year, month, day-of-month) slot. Every
    month gets 31 slots, so slots of non-existent dates and of days without
    commits hold empty frames. Commits outside the year range are not included.

  Raises:
    ValueError: If `inputPath` contains no CSV file.
  """
  contents = []
  # Sorted for a reproducible row order; non-CSV entries (e.g. checkpoint
  # folders created by the notebook environment) are skipped instead of crashing.
  for filename in sorted(os.listdir(inputPath)):
    if not filename.lower().endswith('.csv'):
      continue
    print(filename)
    contents.append(pd.read_csv(os.path.join(inputPath, filename)))
  if not contents:
    raise ValueError(f"No CSV files found in '{inputPath}'.")

  combined_data = pd.concat(contents)

  # Vectorised UTC date extraction. Plain arrays are assigned so that the
  # duplicated index labels produced by concat cannot interfere.
  commit_dates = pd.to_datetime(combined_data['author_date'], unit='s')
  combined_data['dnum'] = commit_dates.dt.day.to_numpy().astype(int)
  combined_data['month'] = commit_dates.dt.month.to_numpy().astype(int)
  combined_data['year'] = commit_dates.dt.year.to_numpy().astype(int)

  # Group once instead of scanning the whole frame for each day slot.
  commits_by_day = {
      key: frame
      for key, frame in combined_data.groupby(['year', 'month', 'dnum'], sort=False)
  }
  no_commits = combined_data.iloc[0:0]

  #Each cell in 'dfs' include commits of 1 day
  dfs = [] #merged datasets
  for year in range(start_year, end_year):
    print('year: ', year)
    for month in range(1, 13):
      for dnum in range(1, 32):
        day_frame = commits_by_day.get((year, month, dnum))
        dfs.append(no_commits.copy() if day_frame is None else day_frame)

  if not os.path.exists(outputPath):
      os.makedirs(outputPath)
      print(f"Folder '{outputPath}' created successfully.")
  # dfs_path does not have to live inside outputPath, so make sure its folder exists too.
  dfs_folder = os.path.dirname(dfs_path)
  if dfs_folder:
      os.makedirs(dfs_folder, exist_ok=True)
  # Save the array dfs to a file
  with open(dfs_path, 'wb') as file:
      pickle.dump(dfs, file)

  print('dfs size: ', len(dfs))
  return dfs

In [None]:
#separating a dataset as daily
def separating_daily(outputPath, inputPath, target_fileName, dfTest_path, start_year=2003, end_year=2020):
    """Split one project's commits into one frame per day slot and pickle the list.

    Args:
      outputPath: Output folder prefix; `outputPath + target_fileName` is created.
      inputPath: Folder that contains `<target_fileName>.csv`. The file needs an
        'author_date' column of Unix timestamps in seconds.
      target_fileName: Project name, i.e. the CSV file name without extension.
      dfTest_path: File the resulting list is pickled to.
      start_year: First year of the day grid (inclusive).
      end_year: Year at which the day grid stops (exclusive).

    Returns:
      A list with one DataFrame per (year, month, day-of-month) slot. Every
      month gets 31 slots, so slots of non-existent dates and of days without
      commits hold empty frames. Commits outside the year range are not included.
    """
    content_test = pd.read_csv(os.path.join(inputPath, target_fileName + '.csv'))

    # Vectorised UTC date extraction (replaces a Python call per row).
    commit_dates = pd.to_datetime(content_test['author_date'], unit='s')
    content_test['dnum'] = commit_dates.dt.day.to_numpy().astype(int)
    content_test['month'] = commit_dates.dt.month.to_numpy().astype(int)
    content_test['year'] = commit_dates.dt.year.to_numpy().astype(int)

    # Group once instead of scanning the whole frame for each day slot.
    commits_by_day = {
        key: frame
        for key, frame in content_test.groupby(['year', 'month', 'dnum'], sort=False)
    }
    no_commits = content_test.iloc[0:0]

    #Each cell in 'df_test' include commits of 1 day
    df_test = []
    for year in range(start_year, end_year):
      print('year: ', year)
      for month in range(1, 13):
        for dnum in range(1, 32):
          day_frame = commits_by_day.get((year, month, dnum))
          df_test.append(no_commits.copy() if day_frame is None else day_frame)

    folderName = outputPath + target_fileName
    if not os.path.exists(folderName):
      os.makedirs(folderName)
      print(f"Folder '{folderName}' created successfully.")
    # dfTest_path is supplied separately, so make sure its own folder exists as well.
    pickle_folder = os.path.dirname(dfTest_path)
    if pickle_folder:
      os.makedirs(pickle_folder, exist_ok=True)

    with open(dfTest_path, 'wb') as file:
        pickle.dump(df_test, file)

    print('df_test size: ', len(df_test))
    return df_test

In [None]:
#pip install numpy==1.23.5

In [None]:
#pip install scikit-multiflow

In [None]:
# Install the condacolab package and run its installer; the scikit-multiflow
# cell further down installs through conda.
!pip install -q condacolab
import condacolab
condacolab.install()

✨🍰✨ Everything looks OK!


In [None]:
# Install scikit-multiflow through conda; the next cell imports OzaBagging and
# HoeffdingTree from skmultiflow.
import condacolab
!conda install -c anaconda scikit-multiflow

In [None]:
from skmultiflow.meta import OzaBagging
from skmultiflow.trees import HoeffdingTree
def create_initial_model(data):
  """Create an OzaBagging ensemble of 20 Hoeffding trees and fit it on `data`.

  `data` must contain the label column 'contains_bug'; every other column is
  used as a feature. The class list handed to the first `partial_fit` call is
  taken from the labels present in `data`.
  """
  labels = data['contains_bug'].to_numpy()
  features = data.drop('contains_bug', axis=1).to_numpy()

  ensemble = OzaBagging(base_estimator=HoeffdingTree(), n_estimators=20)
  ensemble.partial_fit(features, labels, classes=np.unique(labels))
  return ensemble

In [None]:
#In the code below, we have obtained recall for defect and clean using equation
#in sadia paper which is based on fading factor
def update_model(model, data):
  """Train `model` on every row of `data`, oversampling the class it currently handles worse.

  For each sample the module-level `imRate` entry of its class is updated as a
  faded average (factor 0.99) of "was this sample predicted correctly". If the
  rate of the sample's class is lower than the other class's rate, the sample
  is repeated Poisson(other_rate / own_rate) times, otherwise Poisson(1) times,
  and passed to `model.partial_fit`. (The original author's note attributes the
  fading-factor equation to the "sadia paper".)

  Args:
    model: Estimator exposing `predict(X)` and `partial_fit(X, y)`.
    data: DataFrame with a 'contains_bug' label column (values 0/1); all other
      columns are features.

  Returns:
    The same `model` object, updated in place.
  """
  teta = 0.99
  if len(data) == 0:
    # Nothing to learn from; also avoids calling predict on an empty array.
    return model

  # Positional arrays are used throughout so the result does not depend on
  # the frame's index labels (label lookup failed for a non 0..n-1 index).
  y_train = data['contains_bug'].to_numpy()
  X_train = data.drop('contains_bug', axis=1).to_numpy()
  predictions = model.predict(X_train)

  for features, label, predicted in zip(X_train, y_train, predictions):
    correct_label = int(label)
    # NOTE(review): with teta = 0.99 a single update moves the rate by at most
    # 0.01, so rounding to two decimals can cancel small updates - confirm the
    # rounding is intended.
    imRate[correct_label] = round(teta * imRate[correct_label] + (1-teta)*int(predicted == correct_label),2)

    lamb = 1
    if correct_label == 1 and imRate[1] < imRate[0]:
      lamb = imRate[0]/imRate[1]
    elif correct_label == 0 and imRate[0] < imRate[1]:
      lamb = imRate[1]/imRate[0]

    k = np.random.poisson(lamb)
    if k == 0:
      # A zero draw means the sample is not used; skip the call with empty arrays.
      continue
    X = np.tile(np.array(features), (k, 1))
    Y = np.full(k, label)
    model.partial_fit(X, Y)

  return model

In [None]:
def initial_evaluation(testData):
  """Score the samples seen before a model exists, assuming every one is predicted as class 0.

  Updates the module-level trackers in place: `R` and `N` (faded hit and sample
  counts per class, fading factor 0.99), `recalls` and `temp` (one recall value
  in percent appended per class and sample) and `recallNum` (samples scored).
  The class of the current sample gets a freshly computed recall; the other
  class repeats its previous value.
  """
  global recallNum
  fading = 0.99
  assumed_prediction = 0
  labels = testData['contains_bug'].to_numpy()
  n_samples = len(testData.drop('contains_bug', axis=1).to_numpy())

  for position in range(n_samples):
    # Snapshot of the latest recall of both classes before anything is appended.
    previous = (recalls[0][-1], recalls[1][-1])
    seen_class = int(labels[position])
    other_class = 1 - seen_class

    R[seen_class] = fading * R[seen_class] + int(assumed_prediction == seen_class)
    N[seen_class] = fading * N[seen_class] + 1
    updated = round((R[seen_class]/N[seen_class])*100,2)

    for series in (recalls, temp):
      series[seen_class].append(updated)
    for series in (recalls, temp):
      series[other_class].append(previous[other_class])
    recallNum += 1

In [None]:
#In the code below, we have obtained recall for defect and clean using equation
#in sadia paper which is based on fading factor
def evaluation(online_bagging_classifier, testData):
  """Predict `testData` with the classifier and update the faded per-class recalls.

  Updates the module-level trackers in place: `R` and `N` (faded hit and sample
  counts per class, fading factor 0.99), `recalls` and `temp` (one recall value
  in percent appended per class and sample) and `recallNum` (samples scored).
  The class of the current sample gets a freshly computed recall; the other
  class repeats its previous value.
  """
  global recallNum
  fading = 0.99
  labels = testData['contains_bug'].to_numpy()
  features = testData.drop('contains_bug', axis=1).to_numpy()
  predicted = online_bagging_classifier.predict(features)

  for position in range(len(features)):
    # Snapshot of the latest recall of both classes before anything is appended.
    previous = (recalls[0][-1], recalls[1][-1])
    seen_class = int(labels[position])
    other_class = 1 - seen_class

    R[seen_class] = fading * R[seen_class] + int(predicted[position] == seen_class)
    N[seen_class] = fading * N[seen_class] + 1
    updated = round((R[seen_class]/N[seen_class])*100,2)

    for series in (recalls, temp):
      series[seen_class].append(updated)
    for series in (recalls, temp):
      series[other_class].append(previous[other_class])
    recallNum += 1

In [None]:
def data_separation (data):
  """Return `(clean_samples, defect_samples)` without the 'contains_bug' column.

  The clean frame is `data` minus the rows labelled 1, the defect frame is
  `data` minus the rows labelled 0. Rows are removed by index label.
  """
  labels = data['contains_bug']
  defect_index = data[labels == 1].index
  clean_index = data[labels == 0].index

  clean_samples = data.drop(defect_index).drop(columns='contains_bug')
  defect_samples = data.drop(clean_index).drop(columns='contains_bug')
  return clean_samples, defect_samples

In [None]:
def data_normalization(data,status):
  """Normalise the change metrics by the number of changed files (in place).

  When `status` is 1:
    * 'entrophy' is divided by log2(nf) for rows with nf not in (0, 1);
    * 'nuc' and 'lt' are divided by nf for rows with nf != 0;
    * 'lt' is replaced by its absolute value;
    * 'churn' is added as (la + ld) / lt, or la + ld where lt == 0.
  Rows that fail a condition keep their value. Any other `status` leaves
  `data` untouched.

  The computation is vectorised, so it also works on an empty frame and does
  not execute a Python call per row.

  Args:
    data: DataFrame with the columns 'entrophy', 'nf', 'nuc', 'lt', 'la', 'ld'.
    status: 1 to normalise, anything else to skip.

  Returns:
    `data` itself (modified in place when `status` is 1).
  """
  if status != 1:
    return data

  nf = data['nf']
  has_files = nf != 0
  several_files = has_files & (nf != 1)

  # Excluded rows get a neutral divisor (log2(2) == 1, or 1), which leaves
  # their value unchanged and avoids dividing by zero.
  #entropy = entropy/log2(n)
  data['entrophy'] = data['entrophy'] / np.log2(nf.where(several_files, 2))
  file_divisor = nf.where(has_files, 1)
  data['nuc'] = data['nuc'] / file_divisor
  data['lt'] = np.abs(data['lt'] / file_divisor)

  normalized_lt = data['lt']
  data['churn'] = (data['la'] + data['ld']) / normalized_lt.where(normalized_lt != 0, 1)

  return data

In [None]:
def remove_correlation_features (data, Remove_extra_correlations_status):
  """Drop one column of every strongly rank-correlated column pair.

  When the status flag is 1, every pair of columns is tested with Spearman's
  rank correlation. For a pair with correlation > 0.8 and p-value < 0.05 the
  first column is marked for removal, unless one of the two is already marked.
  With any other flag no column is removed. A new frame is returned either way.
  """
  to_drop = []
  if Remove_extra_correlations_status == 1:
    #Check the correlation between the columns as 2 to 2
    for first, second in combinations(data.columns.tolist(), 2):
      rho, p_value = spearmanr(data[first], data[second])
      if not (rho > 0.8 and p_value < 0.05):
        continue
      if first in to_drop or second in to_drop:
        continue
      to_drop.append(first)

  return data.drop(to_drop, axis=1)

In [None]:
def log_transform (data, status):
  """Apply log2 to the non-zero entries of `data` when `status` is 1.

  The values are cast to float, zeros are masked out before the logarithm and
  every entry that ends up missing (masked zeros, undefined logarithms) is set
  to 0. With any other `status` the input is returned unchanged.
  """
  if status != 1:
    return data

  as_float = data.astype(float)
  logged = np.log2(as_float[as_float != 0])
  return logged.fillna(0)

In [None]:
def data_scaling (data, status):
  """Standardise every column with a freshly fitted StandardScaler when `status` is 1.

  The scaled values come back in a new DataFrame that keeps the column names
  and gets a default index; with any other `status` the input is returned as is.
  """
  if status != 1:
    return data

  standardized = StandardScaler().fit_transform(data)
  return pd.DataFrame(standardized, columns=data.columns)

In [None]:
def PCA_decomposition(data, n_com, status):
  """Project `data` onto `n_com` principal components when `status` is 1.

  The components come back in a new DataFrame whose column names are the
  component numbers as strings; with any other `status` the input is returned
  as is.
  """
  if status != 1:
    return data

  components = pd.DataFrame(PCA(n_components = n_com).fit_transform(data))
  components.columns = components.columns.astype(str)
  return components

In [None]:
def drop_nf0(data, status):
  """Return `data` with a fresh 0..n-1 index, without the nf == 0 rows when `status` is 1.

  A commit whose number of changed files ('nf') is 0 applied no change, which
  is why such rows can be filtered out.
  """
  kept = data[data['nf'] != 0] if status == 1 else data
  return kept.reset_index(drop=True)

In this part, each line performs one data-processing step, so you can optionally remove (or comment out) any line and evaluate its effect on the overall performance.

In [None]:
def data_preprocessing (data):
  """Turn raw commit rows into the feature frame used for training and testing.

  Steps: reset the index (rows with nf == 0 are kept), set 'fix' and the label
  aside, drop the date helper columns, normalise the metrics, log-transform
  them, then re-attach 'fix' and finally 'contains_bug' as the last column.
  The correlation filter is called with status 0, i.e. it removes nothing.
  """
  data = drop_nf0(data, 0)

  #normalizing data except fix feature and label column ('contains_bug')
  fix_flags = data['fix']
  bug_labels = data['contains_bug']
  not_metrics = ['fix', 'contains_bug', 'dnum', 'month', 'year', 'author_date']
  metrics = data.drop(not_metrics, axis=1)

  metrics = data_normalization(metrics, 1)

  # Optional feature subsets for experiments (uncomment one group):
  #1) nf, exp yes. etc no
  #metrics = metrics.drop(['la', 'ld', 'nd', 'ns', 'rexp', 'sexp'], axis=1)
  #2) only nf, etc
  #metrics = metrics.drop(['exp', 'ndev', 'age', 'rexp', 'sexp'], axis=1)
  #3) nf only no etc
  #metrics = metrics.drop(['la', 'ld', 'nd', 'ns', 'rexp', 'sexp', 'exp', 'ndev', 'age'], axis=1)
  #4) no nf etc
  #metrics = metrics.drop(['la', 'ld'], axis=1)
  #metrics = metrics.drop(['nd', 'ns'], axis=1)
  #5) no exp etc
  #metrics = metrics.drop(['rexp', 'sexp'], axis=1)

  metrics = log_transform (metrics, 1)
  #metrics = data_scaling(metrics, 1)
  metrics['fix'] = fix_flags
  final_data = remove_correlation_features(metrics, 0)
  #finishing preprocessing steps

  final_data['contains_bug'] = bug_labels
  return final_data

In [None]:
import time
import math
import warnings
from sklearn.preprocessing import OneHotEncoder
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="invalid value encountered in log2")
#------------------------------------------------------------------------------- Settings
# wp = True: the model is trained on the target project's own daily frames (dfs = df_test).
# wp = False: it is trained on the daily frames merged from every project in input_dir.
wp = True
if wp: pklFile_rec = 'oob_recalls_wp.pkl'
else: pklFile_rec = 'oob_recalls_dfs.pkl'
# True: the SHAP background data are the defect samples; False: the clean samples.
interpret_defectPredict = True
#------------------------------------------------------------------------------- Import datasets
input_dir = "drive/MyDrive/part2/data/sadia_csv/"
output_dir = 'drive/MyDrive/part2/data/code_outputs/'
dfs_path = 'drive/MyDrive/part2/data/dfs' + '.pkl'
files = os.listdir(input_dir)
for filename in files:
  # Only CSV files are projects; other directory entries (e.g. checkpoint folders) are ignored.
  if not filename.lower().endswith('.csv'):
    continue
  target_fileName = filename[:-4]
  dfTest_path = output_dir + target_fileName + '/df_test.pkl'
  # NOTE: pickle.load can execute code stored in the file; only load caches written by this notebook.
  if os.path.isfile(dfTest_path):
    with open(dfTest_path, 'rb') as file:
        df_test = pickle.load(file)
        print('There is the df_test file for ' + target_fileName + ' project.')
  else: df_test = separating_daily(output_dir, input_dir, target_fileName, dfTest_path)

  if wp:
    dfs = df_test
  else:
    if os.path.isfile(dfs_path):
      # Load the array dfs from the saved file
      with open(dfs_path , 'rb') as file:
        dfs = pickle.load(file)
        print('There is the dfs' + ' file.')
    else: dfs = merging_datasets(output_dir, input_dir, dfs_path)

#------------------------------------------------------------------------------- Main code
  start_time = time.time()
  all_start_time = time.time()
#------------------------------------------------------------------------------- Adjust the settings
  UpMflag = True #Update model flag
  trainData_len = 1
  Initial_size = 1 #The minimum number of samples required to build the model
  vl = 90 #verification latency: the test day runs vl day slots ahead of the training day
  startNum = 0
  endNum = 5
  rowNum = 0
  defect_accsMean = 0
  clean_accsMean = 0
  flag = False
  initData = pd.DataFrame()
  testNum = 0
  trainNum = 0
#------------------------------------------------------------------------------- Global variables
  global imRate
  imRate = [1, 1]
  global first_imRate
  first_imRate = [True, True]
  global num_init_sam_rate
  num_init_sam_rate = [0, 0]
  #If the new sample is a member of class C, the recall of the other class remains the same as the previous recall
  #Both lists start with a placeholder 0 so that "previous recall" always exists.
  global recalls
  recalls = [[0],[0]]
  global recallNum
  recallNum = 0
  global temp
  temp = [[0],[0]]
  global N
  N = [0, 0]
  global R
  R = [0, 0]
  global iN
  iN = [1, 1]
  global iR
  iR = [1, 1]
  def updatemodel (UpMflag, trainData, i):
    if UpMflag: return pd.concat([trainData, dfs[i]], ignore_index=True)
    else: return trainData
#------------------------------------------------------------------------------ Creating training DS
  # Day slots are scanned in windows of 5 until the accumulated commits contain
  # more than one clean and more than one defective sample.
  while startNum < len(dfs):
    window_end = min(endNum, len(dfs))
    #this shows, wether there is any elements in this section (for larger speed)
    if any(len(day) > 0 for day in dfs[startNum:window_end]):
      for rowNum in range(startNum, window_end):
        initData = pd.concat([initData, dfs[rowNum]], ignore_index=True)
        if len(initData) > Initial_size:
          preprocessed_data = data_preprocessing(initData)
          normalized_clean_samples, normalized_defect_samples = data_separation(preprocessed_data)
          if len(normalized_defect_samples) > 1 and len(normalized_clean_samples) > 1:
            trainNum = len(preprocessed_data)
            print('num clean: ', len(normalized_clean_samples))
            print('num defect: ', len(normalized_defect_samples))
            flag = True
            break
    if flag:
      break

    startNum = endNum
    endNum += 5
  if not flag:
    # The search used to run past the end of dfs and crash; skip such a project instead.
    print('No initial training set with both classes for ' + target_fileName + '; project skipped.')
    continue
  print(len(initData), ' - ', rowNum, ' - ', trainNum)

  #----------------------------------------------------------------------------- Initial modeling
  testCount = rowNum+vl
  trainData = pd.DataFrame()
  testData = pd.concat(df_test[:testCount+1], ignore_index=True)
  print('len testData:',len(testData))
  seed_removed = False #whether the placeholder 0 has been removed from recalls
  if len(testData) > 1:
    gmeans_temp = []
    print('len testData:',len(testData))
    testNum = len(testData)
    initial_evaluation(testData)
    recalls[0].pop(0)
    recalls[1].pop(0)
    seed_removed = True
    if len(recalls[1])>0:    defect_accsMean = sum(recalls[1]) / len(recalls[1])
    if len(recalls[0])>0:  clean_accsMean = sum(recalls[0]) / len(recalls[0])
    print("idefect_accsMean: ", defect_accsMean, " - defect_accs: ", recalls[1])
    print("iclean_accsMean:", clean_accsMean, " - clean_accs:", recalls[0])
    if N[0]>0 and N[1]>0:
      for j in range(len(temp[0])):
        gmeans_temp.append(math.sqrt(temp[0][j] * temp[1][j]))
    print('N: ', N)


  init_recalls0_len = len(recalls[0])
  init_recalls1_len = len(recalls[1])
  temp[0] = []
  temp[1] = []

  print('init testCount: ', testCount)
  print('init train num: ', trainNum)
  print('init test num: ', testNum)
  print('init recall num: ', recallNum)
  #----------------------------------------------------------------------------- Training model
  online_bagging_classifier = create_initial_model(preprocessed_data)
  if interpret_defectPredict: explainer = shap.KernelExplainer(online_bagging_classifier.predict, normalized_defect_samples)
  else: explainer = shap.KernelExplainer(online_bagging_classifier.predict, normalized_clean_samples)

  times = []
  trCln = 0
  defect_ratio = 0
  predictions = []
  clnNum = 0
  defNum = 0
  #----------------------------------------------------------------------------- Interpretation variables
  best_feature = 'best_feature'
  last_NS = pd.DataFrame() #normalized samples collected since the explainer was last rebuilt
  counterCD = 0
  cd_points = []
  df_shap = pd.DataFrame(columns=normalized_clean_samples.columns)
  shap_frames = [] #per-day SHAP values, concatenated once after the loop
  ind_shap = 0
  #-----------------------------------------------------------------------------
  for i in range(rowNum+1, len(df_test) - 1 - vl):
    if i%100 == 0:
      all_time = int(time.time()- start_time)
      gmeans_temp = []
      print(f"Execution time: {all_time} seconds")
      if len(temp[0])>0: times.append(all_time)
      start_time = time.time()
      print(i, ' ******************** - ', target_fileName, ': ', len(temp[0]), ' - ', len(temp[1]) )
      if len(temp[1])>0:    defect_accsMean = sum(temp[1]) / len(temp[1])
      if len(temp[0])>0:  clean_accsMean = sum(temp[0]) / len(temp[0])
      print("defect_accsMean: ", defect_accsMean, " - defect_accs: ", temp[1])
      print("clean_accsMean:", clean_accsMean, " - clean_accs:", temp[0])
      temp[0] = []
      temp[1] = []
    trainData = updatemodel(UpMflag, trainData, i)
    #---------------------------------------------------------------------------- Adapting model
    if len(trainData) > trainData_len:
      preprocessed_data = data_preprocessing(trainData)
      normalized_clean_samples, normalized_defect_samples = data_separation(preprocessed_data)
      # Accumulate into last_NS itself (the former last_NDS / last_NCS names were never defined).
      if interpret_defectPredict: last_NS = pd.concat([last_NS, normalized_defect_samples], ignore_index=True)
      else: last_NS = pd.concat([last_NS, normalized_clean_samples], ignore_index=True)
      online_bagging_classifier = update_model(online_bagging_classifier, preprocessed_data)
      trainData = pd.DataFrame()
      trainNum += len(preprocessed_data)
    #--------------------------------------------------------------------------- Testing model
    testCount += 1
    testData = df_test[testCount]
    if len(testData) > 0:
      preprocessed_data = data_preprocessing(testData)
      evaluation(online_bagging_classifier, preprocessed_data)
      testData = pd.DataFrame()
      testNum += len(preprocessed_data)
#------------------------------------------------------------------------------- Interpretation

      X_test = preprocessed_data.drop('contains_bug', axis=1)
      shap_values = explainer.shap_values(X_test)
      feature_importance = np.round(np.mean(np.abs(shap_values), axis=0),2)
      feature_indices = np.argsort(feature_importance)[::-1]
      num_top_features = 2
      if best_feature == 'best_feature':
        best_feature = X_test.columns[feature_indices[0]]
      for j in feature_indices[:num_top_features]:
        print(f"Feature {X_test.columns[j]}: Importance = {feature_importance[j]:.3f}")
      # A change of the most important feature is recorded as a change point and
      # the explainer is rebuilt on the samples collected since the last rebuild.
      if not best_feature == X_test.columns[feature_indices[0]] and len(last_NS)>0:
        counterCD += 1
        explainer = shap.KernelExplainer(online_bagging_classifier.predict, last_NS)
        print('************************* Explainer is updated')
        print('len last_NS: ', len(last_NS))
        cd_points.append(testNum)
        best_feature = X_test.columns[feature_indices[0]]
        last_NS = pd.DataFrame()
      shap_frames.append(pd.DataFrame(shap_values, columns=df_shap.columns))
  if shap_frames:
    df_shap = pd.concat([df_shap] + shap_frames, ignore_index=True)
#------------------------------------------------------------------------------- Saving interpretation result files
  print('counterCD for ', target_fileName, ': ', counterCD)
  csvFolder_name = output_dir + target_fileName + '/shap_result/'
  # Create the SHAP folder itself (the check used the recall folder's name before it was defined).
  if not os.path.exists(csvFolder_name):
      os.makedirs(csvFolder_name)
      print(f"Folder '{csvFolder_name}' created successfully.")
  df_shap.to_csv(csvFolder_name + 'shap_oob.csv', index=False)
  with open(csvFolder_name + 'cdPoints_oob', 'wb') as file:
    pickle.dump(cd_points, file)
#-------------------------------------------------------------------------------
  # Remove the placeholder 0 only if the initial evaluation has not removed it
  # already; a second pop would discard the first real measurement.
  if not seed_removed:
    recalls[0].pop(0)
    recalls[1].pop(0)
  if len(recalls[1])>0:    defect_accsMean = sum(recalls[1]) / len(recalls[1])
  if len(recalls[0])>0:  clean_accsMean = sum(recalls[0]) / len(recalls[0])
  print("defect_accsMean: ", defect_accsMean, " - defect_accs: ", recalls[1])
  print("clean_accsMean:", clean_accsMean, " - clean_accs:", recalls[0])
  gmeans = []
  for i in range(len(recalls[0])):
    gmeans.append(math.sqrt(recalls[0][i] * recalls[1][i]))

  pkFolder_name = output_dir + target_fileName + '/recallFiles/'
  if not os.path.exists(pkFolder_name):
      os.makedirs(pkFolder_name)
      print(f"Folder '{pkFolder_name}' created successfully.")
  with open(pkFolder_name + pklFile_rec, 'wb') as file:
      pickle.dump(recalls, file)

  # Both averages fall back to 0 when nothing was recorded (short projects).
  gmeans_avg = sum(gmeans) / len(gmeans) if gmeans else 0
  time_avg = sum(times)/len(times) if times else 0
  print("gmeans_avg:", gmeans_avg)
  print('target_fileName: ', target_fileName)
  print('time_avg: ', math.floor(time_avg + 0.5), ' second')
  print('remained testData: ', len(testData))
  print('init_recalls0_len: ', init_recalls0_len)
  print('init_recalls1_len: ', init_recalls1_len)
  print('recalls0_len: ', len(recalls[0]))
  print('recalls1_len: ', len(recalls[1]))
  print('end train num: ', trainNum)
  print('end test num: ', testNum)
  print('end recall num: ', recallNum)
  print('All Time = ', math.floor(int(time.time() - all_start_time)/60 + 0.5), ' minutes')

In [None]:
import matplotlib.pyplot as plt
# Recall history of the last processed project: class 1 (defect) in red,
# class 0 (clean) in green, both as dotted lines with round markers.
for series, marker in ((recalls[1], 'ro'), (recalls[0], 'go')):
  plt.plot(series, marker, linestyle='dotted')
plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Line Diagram for Array')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Box plots, left to right: class-1 (defect) recalls, class-0 (clean) recalls, G-means.
arr = [recalls[1], recalls[0], gmeans]
plt.boxplot(arr)

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the pipeline cell reads and writes under 'drive/MyDrive/...', so this
# cell presumably has to be run before it - confirm the intended cell order.
from google.colab import drive
drive.mount('/content/drive')