# Decision Tree/Decision Rules

**TODO:** add a short introduction to decision trees / decision rules here.

## Tutorial

### Load data and prepare datasets

In [1]:
# Import for Load Data
from os import listdir
from os.path import isfile, join
import pandas as pd
# Import for Split Data into Training and Testing Samples
from sklearn.model_selection import train_test_split

# Train on the Lucene 2.9.0 release and test on the 3.0.0 release.
train_dataset = pd.read_csv("../../datasets/lucene-2.9.0.csv", index_col='File')
test_dataset = pd.read_csv("../../datasets/lucene-3.0.0.csv", index_col='File')

outcome = 'RealBug'

# Raw metric column -> short name used throughout the rest of the notebook.
#   nCommit            - # of commits that modify the file of interest
#   AddedLOC           - # of added lines of code
#   nCoupledClass      - # of classes that interact or couple with the class of interest
#   LOC                - # of lines of code
#   CommentToCodeRatio - the ratio of lines of comments to lines of code
# NOTE(review): 'OWN_COMMIT' is exposed as 'nCommit' and 'AvgLine' as 'LOC';
# confirm that the descriptions above match the dataset's metric definitions.
feature_renames = {
    'OWN_COMMIT': 'nCommit',
    'Added_lines': 'AddedLOC',
    'CountClassCoupled': 'nCoupledClass',
    'AvgLine': 'LOC',
    'RatioCommentToCode': 'CommentToCodeRatio',
}

# Encode the outcome as integer codes (0 and 1 for a two-class outcome).
# The label -> code mapping is learned from the training release only and reused
# for the test release, so a given label always receives the same code in both
# sets. Encoding each set independently would silently shift the codes if one
# release lacked a class. Test labels never seen in training are encoded as -1.
outcome_dtype = pd.Categorical(train_dataset[outcome]).dtype
train_dataset[outcome] = train_dataset[outcome].astype(outcome_dtype).cat.codes
test_dataset[outcome] = test_dataset[outcome].astype(outcome_dtype).cat.codes

# Select the raw metric columns and give them their short names in one step.
X_train = train_dataset.loc[:, list(feature_renames)].rename(columns=feature_renames)
X_test = test_dataset.loc[:, list(feature_renames)].rename(columns=feature_renames)

y_train = train_dataset.loc[:, outcome]
y_test = test_dataset.loc[:, outcome]

# From here on `features` holds the short (renamed) feature names.
features = list(feature_renames.values())

# Features and outcome side by side, as needed by the formula-based models.
training_data = pd.concat([X_train, y_train], axis=1)
testing_data = pd.concat([X_test, y_test], axis=1)

### Construct a black-box model (Regression and Random Forests)

In [2]:
# Import for Construct a black-box model (Regression and Random Forests)
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Linear regression (OLS) specified with an R-style formula: "<outcome> ~ f1 + f2 + ...".
model_formula = f"{outcome} ~ {' + '.join(features)}"
regression_model = ols(model_formula, data=training_data)
# Array-based alternative: sm.OLS(y_train, sm.add_constant(X_train))
regression_model_fit = regression_model.fit()

# Logistic regression with an intercept term; fit() returns the fitted estimator itself.
lr_model = LogisticRegression(fit_intercept=True).fit(X_train, y_train)

# Random forest with a fixed seed; the fit call stays the last expression of the
# cell so that the notebook still displays the estimator.
rf_model = RandomForestClassifier(n_jobs=10, random_state=1234)
rf_model.fit(X_train, y_train)

RandomForestClassifier(n_jobs=10, random_state=1234)

### Cross-release model validation for sanity check (Train with 2.9.0 and test with 3.0.0)

In [3]:
from sklearn import metrics

In [4]:
def get_model_performance(model, X_test, y_test, threshold=0.5):
    """Return the (AUC, F1) pair of a fitted binary classifier on a test set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
        Column 1 of its ``predict_proba`` output is used as the score of the
        positive class.
    X_test : DataFrame or array
        Test features. Passed to ``predict_proba`` unchanged, so a model fitted
        on a DataFrame also receives matching column names at prediction time,
        and plain arrays are accepted as well.
    y_test : array-like
        True labels; label 1 is treated as the positive class.
    threshold : float, default 0.5
        Scores greater than or equal to this value are predicted as class 1
        when computing the F1 score.

    Returns
    -------
    tuple
        ``(auc_value, f1_value)``.
    """
    model_prediction = model.predict_proba(X_test)[:, 1]

    # AUC is computed from the raw scores and does not depend on `threshold`.
    fpr, tpr, _ = metrics.roc_curve(y_test, model_prediction, pos_label=1)
    auc_value = metrics.auc(fpr, tpr)

    # F1 needs hard class labels, so binarise the scores at `threshold`.
    transformed_prediction = (model_prediction >= threshold).astype(int)
    f1_value = metrics.f1_score(y_test, transformed_prediction)
    return auc_value, f1_value

In [5]:
# Report the cross-release performance of each fitted model as an (AUC, F1) tuple.
for report_label, fitted_model in (
    ('Regression (AUC, F1):', lr_model),
    ('Random Forests (AUC, F1):', rf_model),
):
    print(report_label, get_model_performance(fitted_model, X_test, y_test))

Regression (AUC, F1): (0.7989602095955461, 0.23236514522821577)
Random Forests (AUC, F1): (0.8589023524916761, 0.4951456310679611)


  """
  """


## Model Explanation

### Decision Rules/Decision Trees

In [16]:
# Import for Decision Rules/Decision Trees
from sklearn import tree
from six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
import collections

In [17]:
# Fit a depth-2 decision tree (at most four leaves), which keeps the printed rule set small.
dt_model = tree.DecisionTreeClassifier(max_depth=2, random_state=1234).fit(X_train, y_train)

# Render the fitted tree as indented text rules, labelling each split with its feature name.
dt_text = tree.export_text(dt_model, feature_names=features)
print(dt_text)

|--- nCoupledClass <= 5.50
|   |--- AddedLOC <= 145.50
|   |   |--- class: 0
|   |--- AddedLOC >  145.50
|   |   |--- class: 0
|--- nCoupledClass >  5.50
|   |--- AddedLOC <= 113.00
|   |   |--- class: 0
|   |--- AddedLOC >  113.00
|   |   |--- class: 1

