In [1]:
import pandas as pd 
import numpy as np
import math
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import explained_variance_score
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
Qcc_FI = pd.read_csv('Qcc_quant_after_FI.csv')

In [4]:
Qcc_quant = pd.read_csv('Qcc_quant.csv')

In [5]:
TD = pd.read_csv('TD.csv')

In [49]:
PCS = pd.read_csv('PCA.csv')

In [50]:
test = pd.read_csv('TEST_FI.csv')

In [6]:
X_train_sq, X_test_sq, y_train_sq, y_test_sq = train_test_split(Qcc_FI, TD['SonarQube_debt'], test_size=0.16, random_state=3, stratify=Qcc_FI['Domain'])

In [7]:
X_train_sq.groupby('Domain')['Project'].nunique()

Domain
3D/graphics/media                        6
IDE                                      2
SDK                                      5
database                                 7
diagram generator/data visualization     8
games                                    3
middleware                              16
parsers/generators/make                  8
programming language                     2
testing                                 10
tool                                    24
Name: Project, dtype: int64

In [8]:
X_test_sq.groupby('Domain')['Project'].nunique()

Domain
3D/graphics/media                       1
IDE                                     1
SDK                                     1
database                                1
diagram generator/data visualization    2
middleware                              3
parsers/generators/make                 1
programming language                    1
testing                                 2
tool                                    5
Name: Project, dtype: int64

In [9]:
def big_data_pipe(data):
    data.iloc[:,2:] = data.iloc[:,2:].apply(stats.zscore)
    return data.iloc[:,2:]

In [10]:
SQ_X_train = big_data_pipe(X_train_sq)

In [11]:
SQ_X_train

Unnamed: 0,TLOC,LCOM_50,LCOM_std,DIT_50,RMI_40,NOM_20,NOM_99,NOM_std,NSF_std,NSM_99,NBD_80,CA_99,CA_std,PAR_60,PAR_std,NOC_std,RMA_01,NORM_60,NORM_80,NORM_99,MLOC_50,NSC_std,RMD_60,RMD_std,NOF_60,NOF_std
92,-0.071155,-0.147784,-0.991711,0.450242,-0.564637,-0.347704,-0.449077,-0.991711,0.214044,-0.732321,0.149906,0.396222,0.464885,0.142054,-0.656178,-0.303315,-0.109981,-0.317500,0.321305,-0.438751,0.893446,-0.098825,0.556950,-1.285050,-1.259058,-0.235794
56,-0.096762,-0.147784,0.633661,-0.791334,-0.277626,-0.347704,-0.229843,0.633661,-0.336903,0.761224,0.149906,0.676898,0.877306,0.142054,0.043612,0.613274,-0.109981,2.571753,0.321305,0.315038,0.893446,-0.092407,0.927987,-0.504262,1.204905,-0.177396
46,2.440925,-0.147784,-0.505334,-0.791334,1.309966,-0.347704,-0.284651,-0.505334,-0.419164,-0.737165,0.149906,-0.707006,-0.733406,0.142054,-0.347755,-0.369018,-0.109981,-0.317500,-1.440066,-0.438751,-0.893446,-0.180317,-0.204874,-0.337146,-0.027077,-0.540261
107,0.078773,-0.147784,0.349526,-0.791334,-0.142149,-0.347704,0.082018,0.349526,0.247495,0.465745,0.149906,-0.136908,-0.129939,0.142054,0.280613,-0.134653,-0.109981,-0.317500,0.321305,1.575751,-0.893446,-0.139507,-0.162550,-0.050072,-0.027077,0.145418
77,-0.043458,-0.147784,1.014055,-0.791334,-0.446220,-0.347704,0.436629,1.014055,0.130913,2.220862,0.149906,0.035605,-0.061280,0.142054,5.370004,-0.270134,-0.109981,-0.317500,-1.440066,-0.438751,-0.893446,-0.137398,0.022263,0.623985,-0.027077,0.198055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,-0.584925,-0.147784,0.786047,-0.791334,1.309966,-0.347704,-0.192573,0.786047,-0.476432,-0.091307,0.149906,-0.418821,-0.417298,0.142054,-0.735207,-0.392456,-0.109981,-0.317500,-1.440066,-0.627198,-0.893446,-0.173508,-1.143046,-1.626847,-1.259058,-0.440165
13,-0.454404,-0.147784,-2.178834,-0.791334,-0.883762,-0.347704,-0.264737,-2.178834,-0.149270,3.196107,0.149906,-0.482730,-0.349425,0.142054,1.164486,-0.161380,-0.109981,-0.317500,0.321305,1.651130,0.000000,0.072569,-0.451761,0.260045,-1.259058,-0.515679
93,-0.498957,-0.147784,-0.945668,1.691818,0.253244,-0.347704,0.926983,-0.945668,-0.483772,-0.414236,2.423486,-0.323643,0.147140,0.142054,-1.356443,1.194545,-0.109981,5.461005,2.082676,4.291277,0.000000,0.032527,0.386245,0.572376,-0.027077,-0.491543
40,-0.520733,-0.147784,1.507030,0.450242,-0.782405,-0.347704,-0.207919,1.507030,-0.384656,-1.060093,0.149906,-0.624371,-0.651660,0.142054,1.205298,-0.581927,-0.109981,-0.317500,0.321305,-0.438751,-0.893446,-0.263511,0.973132,0.960284,1.204905,-0.378224


In [12]:
SQ_X_test = big_data_pipe(X_test_sq)
SQ_X_test.isnull().values.any()
SQ_X_test =SQ_X_test.fillna(0.0)

In [13]:
SQ_X_test

Unnamed: 0,TLOC,LCOM_50,LCOM_std,DIT_50,RMI_40,NOM_20,NOM_99,NOM_std,NSF_std,NSM_99,NBD_80,CA_99,CA_std,PAR_60,PAR_std,NOC_std,RMA_01,NORM_60,NORM_80,NORM_99,MLOC_50,NSC_std,RMD_60,RMD_std,NOF_60,NOF_std
0,-0.42418,0.0,1.438152,0.658505,2.256263,-0.353553,0.128365,1.438152,-0.335528,-0.065701,0.242536,0.370269,0.37322,0.0,-0.868244,-0.027279,0.0,2.236068,0.078811,-0.432686,-0.83054,-0.061512,-1.797573,-0.925887,0.0,-0.27664
65,-0.454591,0.0,-0.601719,0.658505,-0.221513,-0.353553,0.060994,-0.601719,-0.423805,-0.425703,0.242536,-0.108505,-0.382543,0.0,-0.55348,-0.162561,0.0,-0.447214,0.078811,-0.806048,1.469416,-0.374467,1.374165,0.234749,0.0,0.115847
21,-1.187936,0.0,-0.09607,-1.034793,-1.332548,-0.353553,-1.951815,-0.09607,-0.232961,0.220502,0.242536,-1.144706,-1.324399,0.0,-0.221584,-0.705538,0.0,-0.447214,0.078811,-0.806048,-0.83054,-0.5559,0.346954,-1.600162,1.224745,1.79977
94,-0.675346,0.0,0.104145,0.658505,0.606064,-0.353553,-0.986997,0.104145,-0.044346,0.706506,0.242536,-0.404098,-0.542176,0.0,0.504087,-0.57851,0.0,-0.447214,0.078811,-0.806048,0.319438,-0.465001,-1.324106,-0.324986,1.224745,-0.64341
90,-1.29827,0.0,1.872776,-1.034793,-0.721033,2.828427,0.219025,1.872776,-0.508378,-1.111509,0.242536,-1.243409,-1.454156,0.0,-1.613033,-0.916431,0.0,-0.447214,0.078811,-0.858319,-0.83054,-0.525871,0.553379,2.628774,2.44949,-0.266884
47,0.199139,0.0,-0.874788,0.658505,-0.221513,-0.353553,2.514625,-0.874788,-0.30407,-0.076501,0.242536,1.017792,1.7711,-3.0,-0.560961,-0.56584,0.0,-0.447214,0.078811,0.687402,0.319438,0.031759,0.555018,0.068176,-1.224745,-0.605145
105,0.636466,0.0,1.731807,0.658505,0.32558,-0.353553,0.726386,1.731807,-0.39646,-0.211502,0.242536,-0.027481,0.867422,0.0,-0.012718,-0.30177,0.0,2.236068,1.49741,0.314039,0.319438,0.241041,-0.50332,-0.972822,0.0,0.050204
7,0.066877,0.0,0.021417,0.658505,-0.221513,-0.353553,-0.604397,0.021417,0.634033,-0.571505,0.242536,-0.532386,-0.558832,0.0,1.252381,0.449524,0.0,-0.447214,0.078811,0.314039,0.319438,-0.322423,-0.350959,0.042941,0.0,3.272874
20,-0.702642,0.0,-1.744488,-1.034793,-1.197757,-0.353553,0.044359,-1.744488,0.239304,-0.931508,0.242536,-0.569033,-0.176876,0.0,-0.267231,1.978313,0.0,-0.447214,-1.339788,-0.806048,-0.83054,-0.236746,1.010464,-0.41543,-1.224745,0.119037
73,1.662986,0.0,-0.127578,0.658505,0.452442,-0.353553,-1.352963,-0.127578,-0.335557,-0.391503,0.242536,1.484248,1.288953,0.0,0.648736,-0.108879,0.0,-0.447214,0.078811,-0.432686,2.619394,0.251128,-0.414853,0.323156,0.0,-0.586082


In [34]:
rf = RandomForestRegressor()

In [35]:
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [36]:
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10,20,30,40,80, 90],
    'max_features': [1, 2, 3],
    'min_samples_leaf': [1, 2,3, 4],
    'min_samples_split': [2, 4, 6,8],
    'n_estimators': [5, 10, 15, 20 , 25, 30, 40, 100]
}

In [37]:
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [38]:
grid_search.fit(SQ_X_train, y_train_sq)

Fitting 3 folds for each of 4608 candidates, totalling 13824 fits


In [40]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

In [42]:
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(SQ_X_train, y_train_sq)
base_accuracy = evaluate(base_model, SQ_X_test, y_test_sq)

Model Performance
Average Error: 5609.4164 degrees.
Accuracy = -181.40%.


In [44]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 90,
 'max_features': 3,
 'min_samples_leaf': 4,
 'min_samples_split': 6,
 'n_estimators': 5}

In [45]:
best_grid = grid_search.best_estimator_

In [46]:
grid_accuracy = evaluate(best_grid, SQ_X_test, y_test_sq)

Model Performance
Average Error: 4621.6198 degrees.
Accuracy = -197.26%.


In [47]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Improvement of 8.74%.


In [25]:
rf = RandomForestRegressor(bootstrap = False,
 max_depth = 90,
 max_features = 3,
 min_samples_leaf = 1,
 min_samples_split = 8,
 n_estimators = 5)

In [26]:
rf.fit(SQ_X_train, y_train_sq)

In [27]:
pred = rf.predict(SQ_X_test)

In [28]:
pred

array([ 7260.4429603 ,  1178.13516669,  2549.00249983,  2959.5236668 ,
        1229.74375405,  5442.9515558 ,  4948.90849992,  3774.94973765,
        3356.21753971,  5592.08500012,  8931.57930949,   576.15938892,
         851.31577782,  5260.34322247,   563.17283332,  6049.82763502,
       10959.01003225,  2397.31091277])

In [29]:
np.array(y_test_sq)

array([ 1431.033333 ,  2234.483333 ,   255.8333333,  2178.616667 ,
        2231.583333 ,  2252.066667 , 24019.16667  ,  2999.2      ,
        1330.216667 ,  4865.466667 ,  3305.083333 ,   889.0333333,
        4019.083333 , 26240.56667  ,   490.5      ,  2719.5      ,
        4610.       ,   219.9666667])

In [30]:
# accuracy_score(np.array(y_test_sq), pred, normalize=False)

In [31]:
rmse = math.sqrt(mean_squared_error(y_test_sq,pred))

In [32]:
rmse

7299.801898855659

In [48]:
print("Explain variance score =", round(explained_variance_score(y_test_sq, pred), 2))

Explain variance score = 0.01
