In [1]:
# Core imports for data handling and plotting (note: no IQR outlier removal is applied in the cells shown here)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
# Load the dataset from the working directory; header=0 means the first
# line of the file supplies the column names.
df_data = pd.read_csv(filepath_or_buffer='China.csv', header=0)

In [4]:
# Preview the first five rows to sanity-check the parsed columns.
df_data.head(n=5)

Unnamed: 0,AFP,Input,Output,Enquiry,File,Interface,Added,Changed,Deleted,PDR_AFP,PDR_UFP,NPDR_AFP,NPDU_UFP,Resource,Dev.Type,Duration,N_effort,Effort
0,1587,774,260,340,128,0,1502,0,0,4.7,5.0,4.7,5.0,4,0,4.0,7490,7490
1,260,9,4,3,193,41,51,138,61,16.0,16.6,16.0,16.6,2,0,17.0,4150,4150
2,152,25,33,28,42,35,163,0,0,4.4,4.1,4.4,4.1,1,0,9.0,668,668
3,252,151,28,8,39,0,69,153,4,12.8,14.3,15.5,17.3,1,0,4.0,3901,3238
4,292,93,0,194,20,0,0,307,0,10.3,9.8,12.4,11.7,1,0,13.0,3607,2994


In [5]:
# Dimensions of the loaded frame as (n_rows, n_columns).
df_data.shape

(499, 18)

In [6]:
# Positional split of the frame: columns 0..16 are the predictors and
# column 17 (the last of the 18 columns) is the target.
# The target is taken with a slice rather than a scalar index, so `y` stays
# a one-column DataFrame instead of becoming a Series.
# NOTE(review): the predictors include column 16 (N_effort), which equals
# Effort in several of the previewed rows -- presumably a normalised form of
# the target; confirm it is a legitimate input and not target leakage.
X, y = df_data.iloc[:, :17], df_data.iloc[:, 17:18]

In [7]:
# Estimators, cross-validation splitters and error metrics used for model evaluation (note: no isolation-forest outlier removal is applied in the cells shown here)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut

# Models

In [8]:
# Candidate regressors, each configured with fixed hyperparameters.
#
# The `normalize` keyword of LinearRegression was deprecated in scikit-learn
# 1.0 and removed in 1.2; passing it raises a TypeError on current releases,
# so it is no longer supplied. For plain ordinary least squares the rescaling
# it performed does not alter the fitted predictions beyond floating-point
# error, so the model is otherwise unchanged.
lr = LinearRegression(fit_intercept=True, copy_X=True)

# Regularised linear models (same penalty strength for both).
lasso = Lasso(alpha=100)
ridge = Ridge(alpha=100)

# Distance-based model: unweighted mean of the 8 nearest neighbours.
knn = KNeighborsRegressor(metric='euclidean', n_neighbors=8, weights='uniform')

# Tree-based models; random_state pins their randomised steps.
dt = DecisionTreeRegressor(max_depth=9, min_samples_split=2, random_state=11)
rf = RandomForestRegressor(max_depth=5, n_estimators=40, random_state=1)

# Linear support-vector regressor.
svr = LinearSVR(C=0.1, epsilon=1, random_state=1)

# Evaluation order; downstream result lists follow this ordering.
models = [lr, lasso, ridge, knn, dt, rf, svr]

# Individual Models

In [9]:
# Leave-one-out cross-validation of every estimator in `models`.
#
# For each model, every sample is held out exactly once; the per-fold
# absolute and squared errors are averaged into one MAE and one MSE.
# MAE_Final / MSE_Final receive one entry per model, in `models` order.

# Convert to NumPy once, outside the loops: the arrays are loop-invariant,
# and flattening the one-column target to 1-D gives scikit-learn the target
# shape it expects (a column vector triggers a DataConversionWarning).
X_values = X.values
y_values = y.values.ravel()

# The splitter holds no state, so one instance serves all models.
cv = LeaveOneOut()

MAE_Final, MSE_Final = [], []
for model in models:
    MAE_Fold, MSE_Fold = [], []
    for train_index, test_index in cv.split(X_values):
        X_train, X_test = X_values[train_index], X_values[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]
        # fit() re-trains the shared estimator object in place on each fold.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        MAE_Fold.append(mean_absolute_error(y_test, y_pred))
        MSE_Fold.append(mean_squared_error(y_test, y_pred))
    MAE_Final.append(np.mean(MAE_Fold))
    MSE_Final.append(np.mean(MSE_Fold))

# Errors

In [10]:
# Assemble the per-model error table: one (MAE, MSE) row per entry of the
# result lists, i.e. row i belongs to the i-th estimator in `models`.
error_rows = [(mae, mse) for mae, mse in zip(MAE_Final, MSE_Final)]
df_pred = pd.DataFrame(data=error_rows, columns=['MAE', 'MSE'])

# Persist the table without the positional index, then display it.
df_pred.to_csv('Errors.csv', index=False)

df_pred

Unnamed: 0,MAE,MSE
0,375.995647,1110549.0
1,324.120354,989479.0
2,347.112299,939668.2
3,403.006513,1903173.0
4,569.018227,3895305.0
5,425.898757,1904900.0
6,314.657305,1183064.0
