In [1]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
import pandas
import numpy
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Evaluation Metrics
from sklearn import model_selection
from sklearn.metrics import make_scorer

# Libraries for visuals
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=1.2)

# load data
# Read data from .csv and drop the column pID - cocomo341_log, nasa93_log
dataframe = pandas.read_csv('cocomo341_log.csv', header=0).drop(labels='pID', axis=1)

# Feature names
# names = ["LogSIZE","PMAT","PREC","TEAM","FLEX","RESL","PCAP","RELY","CPLX","TIME","STOR","ACAP","PLEX","LTEX","DATA","RUSE","DOCU","PVOL","APEX","PCON","TOOL","SITE","SCED"]
names = numpy.array(dataframe.columns.values)[1:]

# Total number of features
totalFeatureNum = names.size
dataframe

Unnamed: 0,LogPM,LogSIZE,PMAT,PREC,TEAM,FLEX,RESL,PCAP,RELY,CPLX,...,LTEX,DATA,RUSE,DOCU,PVOL,APEX,PCON,TOOL,SITE,SCED
0,5.049856,4.684813,0.146166,0.232367,0.102597,0.189735,0.132580,-0.274437,0.000000,-0.139262,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.105361,0.086178,-0.150823,0.000000
1,9.341369,5.942274,0.463497,0.294737,0.195501,0.301273,0.335738,0.000000,0.095310,0.157004,...,0.000000,0.246860,0.000000,0.000000,0.139762,-0.127833,0.113329,0.157004,0.000000,0.131028
2,4.804021,2.624669,0.163779,0.032546,0.000000,0.000000,0.074278,0.292670,-0.083382,0.000000,...,-0.094311,0.000000,0.000000,0.000000,0.000000,-0.127833,0.113329,0.157004,0.000000,0.000000
3,7.620705,4.901341,0.382305,0.243107,0.214679,0.248498,0.346525,0.139762,-0.083382,-0.314711,...,0.000000,0.246860,0.067659,0.000000,0.139762,0.000000,0.000000,0.157004,-0.072571,0.000000
4,6.864848,3.617920,0.225758,0.179449,0.198262,0.183429,0.255787,0.000000,0.095310,0.292670,...,0.086178,-0.105361,0.000000,0.000000,0.000000,-0.210721,0.000000,0.157004,0.086178,0.131028
5,3.496508,2.772589,0.173010,0.034380,0.030498,0.000000,0.156651,-0.127833,-0.083382,0.000000,...,-0.094311,-0.105361,0.067659,0.000000,-0.139262,-0.127833,0.000000,0.157004,-0.072571,0.000000
6,2.079442,1.824549,0.085389,0.000000,0.020070,0.000000,0.077361,-0.274437,0.000000,0.157004,...,-0.094311,0.000000,0.000000,0.000000,-0.139262,-0.210721,-0.105361,0.157004,-0.072571,0.000000
7,8.764053,5.840380,0.455550,0.072421,0.127904,0.177548,0.247632,0.072321,0.231112,0.292670,...,0.135405,0.131028,0.000000,0.000000,0.139762,-0.210721,0.000000,0.157004,0.000000,0.131028
8,6.980076,3.413126,0.266224,0.169291,0.187039,0.138232,0.192842,0.000000,0.095310,0.292670,...,0.182322,-0.105361,0.000000,0.000000,0.262364,-0.210721,0.000000,0.157004,0.000000,0.131028
9,3.713572,2.426218,0.151396,0.000000,0.000000,0.000000,0.034210,0.000000,-0.083382,-0.139262,...,0.135405,-0.105361,0.000000,0.000000,0.139762,-0.127833,0.000000,0.157004,0.000000,0.000000


In [2]:
array = numpy.array(dataframe.values)
numpy.set_printoptions(precision=5, suppress=True)
array

array([[ 5.04986,  4.68481,  0.14617, ...,  0.08618, -0.15082,  0.     ],
       [ 9.34137,  5.94227,  0.4635 , ...,  0.157  ,  0.     ,  0.13103],
       [ 4.80402,  2.62467,  0.16378, ...,  0.157  ,  0.     ,  0.     ],
       ...,
       [ 3.68492,  3.30512,  0.18046, ..., -0.24846, -0.22314,  0.35767],
       [ 3.90608,  4.19615,  0.19638, ..., -0.10536, -0.07257,  0.13103],
       [ 3.75074,  3.69633,  0.20182, ..., -0.24846, -0.22314,  0.13103]])

In [3]:
# Number of rows
numpy.size(array,0)

341

In [4]:
# Number of columns
numpy.size(array,1)

24

In [5]:
# Features
X = array[:,1:]


In [6]:
# Actual Value
Y = array[:,0]


In [7]:
# Define Scoring Function
def mae_score_func(y, y_pred):
    return numpy.amin(numpy.absolute(1 - numpy.exp(y_pred - y)))



In [8]:
# feature extraction
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=None)
model = SVR(kernel='linear', C=1000)

best_score = -numpy.inf
best_subset = None
best_fit = None

for n in range(totalFeatureNum, 0, -1):
    rfe = RFE(model, n, step=1)
    fit = rfe.fit(X, Y)
    features = fit.transform(X)
    print(numpy.size(features,1))
    # scoring = 'neg_mean_absolute_error'
    # scoring = 'neg_mean_squared_log_error'   
    scoring = make_scorer(mae_score_func, greater_is_better=False, needs_proba=False)
    results = model_selection.cross_val_score(model, features, Y, cv=kfold, scoring=scoring)

    # print MAE results
    print("MAE for {} feature is {}; Std {}".format(numpy.size(features,1), results.mean(), results.std()))
    if best_score < results.mean():
        best_score = results.mean()
        best_subset = features
        best_fit = fit

23
MAE for 23 feature is -0.014880541239651813; Std 0.015924330199247327
22
MAE for 22 feature is -0.019179039226553563; Std 0.020553529956833225
21
MAE for 21 feature is -0.017694869181858086; Std 0.020279669724850343
20
MAE for 20 feature is -0.020984167533753085; Std 0.021093435904873118
19
MAE for 19 feature is -0.015907291503044884; Std 0.020284820582090538
18
MAE for 18 feature is -0.025571867193487162; Std 0.022396634038882743
17
MAE for 17 feature is -0.011631161129860412; Std 0.01502828958590446
16
MAE for 16 feature is -0.01792768934851362; Std 0.013193980003673202
15
MAE for 15 feature is -0.02301419701297658; Std 0.023913373744754392
14
MAE for 14 feature is -0.017851489232979113; Std 0.021651912358594982
13
MAE for 13 feature is -0.01384520911244943; Std 0.00945259245903579
12
MAE for 12 feature is -0.044917948282608755; Std 0.04604022557672574
11
MAE for 11 feature is -0.0514213476451563; Std 0.03741709874235908
10
MAE for 10 feature is -0.04831268226593351; Std 0.0333227

In [9]:
best_score
#print("Num Features: %d") % fit.n_features_
#fit.n_features_
#print("Selected Features: %s") % fit.support_
#fit.support_
#print("Feature Ranking: %s") % fit.ranking_
#fit.ranking_

-0.011631161129860412

In [10]:
# Num Features
best_fit.n_features_

17

In [11]:
# Selected Features
best_fit.support_

array([ True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True, False,  True,  True, False,  True,  True, False,
       False,  True,  True,  True,  True])

In [12]:
# Feature Ranking
best_fit.ranking_

array([1, 1, 1, 7, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 5, 1, 1, 4, 6, 1, 1, 1,
       1])

In [13]:
# Fit the SVR model
svr_lin = SVR(kernel='linear', C=1000)
y_lin = svr_lin.fit(best_subset, Y).predict(best_subset)

numpy.set_printoptions(precision=5)


In [14]:
best_subset

array([[ 4.68481,  0.14617,  0.23237, ...,  0.08618, -0.15082,  0.     ],
       [ 5.94227,  0.4635 ,  0.29474, ...,  0.157  ,  0.     ,  0.13103],
       [ 2.62467,  0.16378,  0.03255, ...,  0.157  ,  0.     ,  0.     ],
       ...,
       [ 3.30512,  0.18046,  0.08197, ..., -0.24846, -0.22314,  0.35767],
       [ 4.19615,  0.19638,  0.02602, ..., -0.10536, -0.07257,  0.13103],
       [ 3.69633,  0.20182,  0.04583, ..., -0.24846, -0.22314,  0.13103]])

In [15]:
### Linear SVR

In [16]:
# MRE
MRE = 0

for i in range(len(Y)):
    #MRE = MRE + numpy.absolute((Y[i] - y_lin[i])/Y[i])
    #MRE = numpy.absolute(1- numpy.log(numpy.absolute((Y[i] - y_lin[i])/Y[i])))  
    MRE = MRE + numpy.absolute(1- numpy.exp(y_lin[i] - Y[i]))

# MMRE
MMRE = MRE / numpy.size(array, 0)

print(round(MMRE,2))

    

0.44


In [17]:
# PRED
count = 0

MRE = 0


N = 30
for i in range(len(Y)):
    #MRE = numpy.absolute(1- numpy.log(numpy.absolute((Y[i] - y_lin[i])/Y[i])))
    MRE = numpy.absolute(1- numpy.exp(y_lin[i] - Y[i]))

    if MRE <= N/100:
        count = count + 1
 
PRED = 100 * (count / len(Y))

print(round(PRED,2))


55.43


In [18]:
# MAR
Total_AR = 0
for i in range(len(Y)):
    #Total_AR = Total_AR + numpy.absolute(numpy.exp(Y[i] - y_lin[i]))
    Total_AR = Total_AR + numpy.absolute(numpy.exp(Y[i]) - numpy.exp(y_lin[i]))
MAR = Total_AR / numpy.size(array, 0)
print(round(MAR, 2))

174.63


In [19]:
# selected features
for n in range(0, numpy.size(best_fit.support_)):
    if best_fit.support_[n]:
        print(names[n])

LogSIZE
PMAT
PREC
FLEX
PCAP
RELY
CPLX
TIME
STOR
PLEX
LTEX
RUSE
DOCU
PCON
TOOL
SITE
SCED


In [20]:
svr_lin.coef_

array([[ 0.93544,  1.03078,  1.11786,  0.69952,  2.23889,  1.0189 ,
         1.10647,  1.6264 ,  0.76849,  0.90833,  1.45442, -1.36436,
         2.968  ,  0.69904,  0.99482,  1.04591,  1.33021]])