# Baseline Model Performance

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [2]:
# Fetch the housing CSV; header=None makes pandas treat the first row as data.
url = 'http://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = pd.read_csv(url, header=None)

# Retrieve the underlying array of values from the DataFrame.
data = df.values

# Split into input and output elements: every column but the last is an
# input (X); the last column is the output (y).
X = data[:, :-1]
y = data[:, -1]

In [7]:
# Hold out 33% of the rows as a test set; random_state=1 pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

# Fit a linear regression on the training rows (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows.
yhat = model.predict(X_test)

# Score the predictions with mean absolute error and report it.
mea = mean_absolute_error(y_test, yhat)
print('MAE:%.3F' % mea)

MAE:3.417


# Isolation Forest

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import IsolationForest

In [3]:
# Fetch the housing CSV; header=None makes pandas treat the first row as data.

url = 'http://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = pd.read_csv(url, header=None)
data = df.values

# Independent variables are every column but the last; the last column is
# the dependent variable.

X, y = data[:, :-1], data[:, -1]

# Hold out 33% of the rows for testing; random_state=1 pins the split.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

print(X_train.shape, y_train.shape)

# Find outliers in the training rows only, so the test set stays untouched.
# random_state is pinned: IsolationForest is a randomized estimator, and
# without a seed the rows it flags (and therefore the MAE printed below)
# change from run to run even though the split itself is fixed.
# NOTE(review): an MAE recorded from an earlier unseeded run may differ
# slightly from what this seeded version prints.

iso = IsolationForest(contamination=0.1, random_state=1)
yhat = iso.fit_predict(X_train)

# Select all rows that are not outliers (outliers are labelled -1).

mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]

# Shape of the filtered training set.

print(X_train.shape, y_train.shape)

# Fit the model on the filtered training rows.

model = LinearRegression()
model.fit(X_train, y_train)

# Predict the held-out rows (yhat is reused here for the predictions).

yhat = model.predict(X_test)

# Evaluate the predictions with mean absolute error.

mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

(339, 13) (339,)
(305, 13) (305,)
MAE: 3.209


# Minimum Covariance Determinant

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.covariance import EllipticEnvelope

In [11]:
# Fetch the housing CSV; header=None makes pandas treat the first row as data.

url = 'http://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = pd.read_csv(url, header=None)
data = df.values

# Independent variables are every column but the last; the last column is
# the dependent variable.

X, y = data[:, :-1], data[:, -1]

# Hold out 33% of the rows for testing; random_state=1 pins the split.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

print(X_train.shape, y_train.shape)

# Find outliers in the training rows only, so the test set stays untouched.
# random_state is pinned: EllipticEnvelope accepts a random_state for its
# covariance fit, and leaving it unset lets the flagged rows (and the MAE
# printed below) vary between runs even though the split itself is fixed.
# NOTE(review): an MAE recorded from an earlier unseeded run may differ
# slightly from what this seeded version prints.

ee = EllipticEnvelope(contamination=0.01, random_state=1)
yhat = ee.fit_predict(X_train)

# Select all rows that are not outliers (outliers are labelled -1).

mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]

# Shape of the filtered training set.

print(X_train.shape, y_train.shape)

# Fit the model on the filtered training rows.

model = LinearRegression()
model.fit(X_train, y_train)

# Predict the held-out rows (yhat is reused here for the predictions).

yhat = model.predict(X_test)

# Evaluate the predictions with mean absolute error.

mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

(339, 13) (339,)
(335, 13) (335,)
MAE: 3.388


# Local Outlier Factor (LOF)

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import LocalOutlierFactor

In [14]:
# Fetch the housing CSV; header=None makes pandas treat the first row as data.
url = 'http://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = pd.read_csv(url, header=None)
data = df.values

# Independent variables: every column but the last. Dependent: the last one.
X = data[:, :-1]
y = data[:, -1]

# Hold out 33% of the rows for testing; random_state=1 pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

print(X_train.shape, y_train.shape)

# Label the training rows with a default-configured Local Outlier Factor.
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)

# Keep only the rows that were not labelled -1 (i.e. not outliers).
mask = yhat != -1
X_train = X_train[mask]
y_train = y_train[mask]

# Report how many training rows survived the filter.
print(X_train.shape, y_train.shape)

# Fit a linear regression on the filtered rows (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows; yhat is reused for the predictions.
yhat = model.predict(X_test)

# Score the predictions with mean absolute error.
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

(339, 13) (339,)
(305, 13) (305,)
MAE: 3.356


# One-class SVM

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.svm import OneClassSVM

In [16]:
# Fetch the housing CSV; header=None makes pandas treat the first row as data.
url = 'http://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = pd.read_csv(url, header=None)
data = df.values

# Independent variables: every column but the last. Dependent: the last one.
X = data[:, :-1]
y = data[:, -1]

# Hold out 33% of the rows for testing; random_state=1 pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

print(X_train.shape, y_train.shape)

# Label the training rows with a one-class SVM configured with nu=0.01.
oc = OneClassSVM(nu=0.01)
yhat = oc.fit_predict(X_train)

# Keep only the rows that were not labelled -1 (i.e. not outliers).
mask = yhat != -1
X_train = X_train[mask]
y_train = y_train[mask]

# Report how many training rows survived the filter.
print(X_train.shape, y_train.shape)

# Fit a linear regression on the filtered rows (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows; yhat is reused for the predictions.
yhat = model.predict(X_test)

# Score the predictions with mean absolute error.
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

(339, 13) (339,)
(336, 13) (336,)
MAE: 3.431
