In [24]:
from imblearn.over_sampling import SMOTE, RandomOverSampler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

import pandas as pd
import numpy as np

In [18]:
def run_experiment(train_fname, test_fname, purge_method, clf):
    path = '../../data/Change-Level-Prediction-Data-20191107T052353Z-001/Change-Level-Prediction-Data/ICSE-2016-PROMISE DefectData/'
    
    # Get the data
    train_df = pd.read_csv(path + train_fname)
    test_df = pd.read_csv(path + test_fname)
    
    # Remove unnecessary columns
    train_df.drop(['name', 'version', 'name.1'], inplace=True, axis=1)
    test_df.drop(['name', 'version', 'name.1'], inplace=True, axis=1)
    
    # Get the X and y for train and test sets
    x_train = train_df.drop('bug', axis=1)
    y_train = train_df['bug']

    x_test = test_df.drop('bug', axis=1)
    y_test = test_df['bug']
    
    if purge_method == 'delete':
        idx = np.where(y_train < 2)[0]
        x_train = np.array(x_train)[idx]
        y_train = y_train[idx]

        idx = np.where(y_test < 2)[0]
        x_test = np.array(x_test)[idx]
        y_test = y_test[idx]
    elif purge_method == 'one':
        y_train[y_train > 1] = 1
        y_test[y_test > 1] = 1
    
    clf.fit(x_train, y_train)
    preds = clf.predict(x_test)
    print(f1_score(y_test, preds))

## ant 1.5 - 1.6

### Naive Bayes

In [19]:
run_experiment('ant-1.5.csv', 'ant-1.6.csv', 'delete', GaussianNB())

0.33333333333333337


In [20]:
run_experiment('ant-1.5.csv', 'ant-1.6.csv', 'one', GaussianNB())

0.5471698113207547


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Random Forest

In [22]:
run_experiment('ant-1.5.csv', 'ant-1.6.csv', 'delete', RandomForestClassifier())

0.10909090909090909




In [23]:
run_experiment('ant-1.5.csv', 'ant-1.6.csv', 'one', RandomForestClassifier())

0.24347826086956523


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Logistic Regression

In [25]:
run_experiment('ant-1.5.csv', 'ant-1.6.csv', 'delete', LogisticRegression())

0.19672131147540983




In [26]:
run_experiment('ant-1.5.csv', 'ant-1.6.csv', 'one', LogisticRegression())

0.3414634146341463


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## camel 1.2 - 1.4

### Naive Bayes

In [27]:
run_experiment('camel-1.2.csv', 'camel-1.4.csv', 'delete', GaussianNB())

0.1267605633802817


In [28]:
run_experiment('camel-1.2.csv', 'camel-1.4.csv', 'one', GaussianNB())

0.34576271186440677


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Random Forest

In [29]:
run_experiment('camel-1.2.csv', 'camel-1.4.csv', 'delete', RandomForestClassifier())

0.24203821656050956




In [30]:
run_experiment('camel-1.2.csv', 'camel-1.4.csv', 'one', RandomForestClassifier())

0.4900284900284901


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Logistic Regression

In [31]:
run_experiment('camel-1.2.csv', 'camel-1.4.csv', 'delete', LogisticRegression())

0.08888888888888889




In [32]:
run_experiment('camel-1.2.csv', 'camel-1.4.csv', 'one', LogisticRegression())

0.3134328358208956


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Clearly, it is a closer match to the paper's results when we one them out rather than purge the rows.