In [1]:
from raise_utils.data import DataLoader
import glob
import numpy as np

In [2]:
def get_defect_prediction_datasets():
    base_path = '../Dodge/data/defect/'
    files = glob.glob(f'{base_path}*-*.csv')
    
    return files

In [3]:
def get_issue_lifetime_datasets():
    base_path = '../Dodge/data/issue_close_time/'
    files = glob.glob(base_path + '**/*.csv')
    
    return files

In [4]:
def get_smell_datasets():
    base_path = '../Dodge/data/smell/'
    files = glob.glob(base_path + '*.csv')
    
    return files

In [5]:
def get_uci_datasets():
    base_path = '../Dodge/data/UCI/'
    files = glob.glob(base_path + '*.csv')
    
    return files

## Test sufficient statistic a good estimate

In [20]:
no_mean = 0
yes_mean = 0
no_std = 0
yes_std = 0
for dataset in get_defect_prediction_datasets():
    data = DataLoader.from_file(dataset)
    data.x_train = np.array(data.x_train)
    data.x_test = np.concatenate((data.x_train, np.array(data.x_test)))
    
    m0 = np.mean(data.x_train, axis=0)
    m1 = np.mean(data.x_test, axis=0)
    
    s0 = np.std(data.x_train, axis=0)
    s1 = np.std(data.x_test, axis=0)
    
    res_mean = np.abs((m0 - m1) / m1) <= 0.1
    if sum(res_mean) / len(res_mean) < 0.7:
        no_mean += 1
    else:
        yes_mean += 1
    
    res_std = np.abs((s0 - s1) / s1) <= 0.1
    if sum(res_std) / len(res_std) < 0.7:
        no_std += 1
    else:
        yes_std += 1

print('Mean\n---\nYes:', round(100 * yes_mean / (yes_mean + no_mean), 2))
print('No:', round(100 * no_mean / (yes_mean + no_mean), 2))

print('\nstd\n---\nYes:', round(100 * yes_std / (yes_std + no_std), 2))
print('No:', round(100 * no_std / (yes_std + no_std), 2))

Mean
---
Yes: 97.37
No: 2.63

std
---
Yes: 81.58
No: 18.42


In [22]:
no_mean = 0
yes_mean = 0
no_std = 0
yes_std = 0
for dataset in get_issue_lifetime_datasets():
    data = DataLoader.from_file(dataset, target='timeOpen', col_start=0)
    data.x_train = np.array(data.x_train)
    data.x_test = np.concatenate((data.x_train, np.array(data.x_test)))
    
    m0 = np.mean(data.x_train, axis=0)
    m1 = np.mean(data.x_test, axis=0)
    
    s0 = np.std(data.x_train, axis=0)
    s1 = np.std(data.x_test, axis=0)
    
    res_mean = np.abs((m0 - m1) / m1) <= 0.1
    if sum(res_mean) / len(res_mean) < 0.7:
        no_mean += 1
    else:
        yes_mean += 1
    
    res_std = np.abs((s0 - s1) / s1) <= 0.1
    if sum(res_std) / len(res_std) < 0.7:
        no_std += 1
    else:
        yes_std += 1

print('Mean\n---\nYes:', round(100 * yes_mean / (yes_mean + no_mean), 2))
print('No:', round(100 * no_mean / (yes_mean + no_mean), 2))

print('\nstd\n---\nYes:', round(100 * yes_std / (yes_std + no_std), 2))
print('No:', round(100 * no_std / (yes_std + no_std), 2))

  res_mean = np.abs((m0 - m1) / m1) <= 0.1
  res_std = np.abs((s0 - s1) / s1) <= 0.1


Mean
---
Yes: 100.0
No: 0.0

std
---
Yes: 100.0
No: 0.0


In [23]:
no_mean = 0
yes_mean = 0
no_std = 0
yes_std = 0
for dataset in get_smell_datasets():
    data = DataLoader.from_file(dataset, target='SMELLS', col_start=0)
    data.x_train = np.array(data.x_train)
    data.x_test = np.concatenate((data.x_train, np.array(data.x_test)))
    
    m0 = np.mean(data.x_train, axis=0)
    m1 = np.mean(data.x_test, axis=0)
    
    s0 = np.std(data.x_train, axis=0)
    s1 = np.std(data.x_test, axis=0)
    
    res_mean = np.abs((m0 - m1) / m1) <= 0.1
    if sum(res_mean) / len(res_mean) < 0.7:
        no_mean += 1
    else:
        yes_mean += 1
    
    res_std = np.abs((s0 - s1) / s1) <= 0.1
    if sum(res_std) / len(res_std) < 0.7:
        no_std += 1
    else:
        yes_std += 1

print('Mean\n---\nYes:', round(100 * yes_mean / (yes_mean + no_mean), 2))
print('No:', round(100 * no_mean / (yes_mean + no_mean), 2))

print('\nstd\n---\nYes:', round(100 * yes_std / (yes_std + no_std), 2))
print('No:', round(100 * no_std / (yes_std + no_std), 2))

Mean
---
Yes: 0.0
No: 100.0

std
---
Yes: 0.0
No: 100.0


  res_mean = np.abs((m0 - m1) / m1) <= 0.1
  res_std = np.abs((s0 - s1) / s1) <= 0.1


## Test if Gaussian

In [24]:
import scipy.stats as st

In [31]:
no = 0
yes = 0
for dataset in get_defect_prediction_datasets():
    data = DataLoader.from_file(dataset)
    data.x_train = np.array(data.x_train)
    data.x_test = np.concatenate((data.x_train, np.array(data.x_test)))

    for k in range(data.x_test.shape[1]):
        if st.ks_1samp(data.x_test[:,k], st.norm.cdf)[1] <= 0.1:
            no += 1
        else:
            yes += 1

print('Yes:', round(100 * yes / (yes + no), 2))
print('No:', round(100 * no / (yes + no), 2))

Yes: 0.0
No: 100.0


In [32]:
no = 0
yes = 0
for dataset in get_issue_lifetime_datasets():
    data = DataLoader.from_file(dataset, target='timeOpen', col_start=0)
    data.x_train = np.array(data.x_train)
    data.x_test = np.concatenate((data.x_train, np.array(data.x_test)))

    for k in range(data.x_test.shape[1]):
        if st.ks_1samp(data.x_test[:,k], st.norm.cdf)[1] <= 0.1:
            no += 1
        else:
            yes += 1

print('Yes:', round(100 * yes / (yes + no), 2))
print('No:', round(100 * no / (yes + no), 2))

Yes: 0.0
No: 100.0


In [33]:
no = 0
yes = 0
for dataset in get_smell_datasets():
    data = DataLoader.from_file(dataset, target='SMELLS', col_start=0)
    data.x_train = np.array(data.x_train)
    data.x_test = np.concatenate((data.x_train, np.array(data.x_test)))

    for k in range(data.x_test.shape[1]):
        if st.ks_1samp(data.x_test[:,k], st.norm.cdf)[1] <= 0.1:
            no += 1
        else:
            yes += 1

print('Yes:', round(100 * yes / (yes + no), 2))
print('No:', round(100 * no / (yes + no), 2))

Yes: 0.0
No: 100.0
