# Homework 4
### Author: Mariusz Słapek

### Needed libraries

In [18]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (RandomizedSearchCV,
                                     train_test_split)
from scipy.stats import uniform
from sklearn.svm import SVR


# First dataset - *Apartments*

### Read data

In [5]:
train = pd.read_csv('apartments.csv')
test = pd.read_csv('apartments_test.csv')

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
m2.price             1000 non-null int64
construction.year    1000 non-null int64
surface              1000 non-null int64
floor                1000 non-null int64
no.rooms             1000 non-null int64
district             1000 non-null object
dtypes: int64(5), object(1)
memory usage: 43.0+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 6 columns):
m2.price             9000 non-null int64
construction.year    9000 non-null int64
surface              9000 non-null int64
floor                9000 non-null int64
no.rooms             9000 non-null int64
district             9000 non-null object
dtypes: int64(5), object(1)
memory usage: 386.8+ KB


### One-hot encoding

In [10]:
one_hot = ce.OneHotEncoder(cols=['district'])
one_hot.fit(train)
train = one_hot.transform(train)
test = one_hot.transform(test)

### Divide data (X and y)

In [11]:
def divide_data(train, test):
    return train.drop(['m2.price'], axis=1), train['m2.price'], test.drop(['m2.price'], axis=1),test['m2.price']

### Metrics

In [12]:
from sklearn.metrics import explained_variance_score, max_error, mean_squared_error

def model_score(y_pred, y):
    print(f'explained_variance_score={explained_variance_score(y, y_pred)}')
    print(f'max_error={max_error(y, y_pred)}')
    print(f'mean_squared_error={mean_squared_error(y, y_pred)}')
    

### Standard Scaler

In [13]:
scaler = StandardScaler() 

nums = ['m2.price', 'construction.year', 'surface', 'floor', 'no.rooms']

train_scaled = train.copy()
test_scaled = test.copy()

train_scaled[nums] = scaler.fit_transform(train_scaled[nums])
test_scaled[nums] = scaler.fit_transform(test_scaled[nums])

train_scaled

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms,district_1,district_2,district_3,district_4,district_5,district_6,district_7,district_8,district_9,district_10
0,2.659324,-0.457926,-1.600545,-0.904974,-1.709248,1,0,0,0,0,0,0,0,0,0
1,-1.841700,1.052614,1.516542,1.165115,1.187782,0,1,0,0,0,0,0,0,0,0
2,0.172119,-1.077634,-0.781649,-1.595004,-0.984990,0,0,1,0,0,0,0,0,0,0
3,0.033083,1.168809,0.195742,0.475086,-0.260733,0,0,0,1,0,0,0,0,0,0
4,-0.523062,1.052614,1.542958,0.130071,1.187782,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3.164710,-1.697343,-1.098641,-1.249989,-0.984990,1,0,0,0,0,0,0,0,0,0
996,-0.071746,-1.697343,-0.992977,1.510130,-0.984990,0,0,0,0,0,0,0,0,1,0
997,-0.429268,0.587832,-0.015585,-0.904974,-0.260733,0,0,0,0,0,0,0,0,1,0
998,0.777920,-0.883975,-1.309969,0.475086,-1.709248,0,0,0,0,0,0,1,0,0,0


## SVM - without tuning

### Without scaling 

In *sklear.svm* we have to class: 
* SVC - for classification
* SVR - for regression

We make regression (m2.pize column) so we use SVR

In [23]:
X_train, y_train, X_test, y_test = divide_data(train, test)

In [24]:
svm_reg = sklearn.svm.SVR()
svm_reg.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

#### Evaluate model

In [25]:
y_pred = svm_reg.predict(X_test)

model_score(y_pred, y_test)

explained_variance_score=0.0003682903281065908
max_error=3292.6801724672737
mean_squared_error=826320.6518556494


The error is very big!

### With scaling

In [20]:
X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled = divide_data(train_scaled, test_scaled)

In [21]:
svm_reg = SVR()
svm_reg.fit(X_train_scaled, y_train_scaled)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

#### Evaluate

In [22]:
y_pred = svm_reg.predict(X_test_scaled)

model_score(y_pred, y_test_scaled)

explained_variance_score=0.9641331549899023
max_error=0.8593564825777225
mean_squared_error=0.035867396824281375


## Conclusion 1

Using scaling was a very good move. A *mean_squared_error* is smaller over 23 000 000 times. This research shows us how important is scalling, when you want use SVM.

### SVM - with hyperparameteres tuning
#### Random Search Method

##### Gauss kernel 

In [26]:
kernel = ['rbf']
cost = uniform(scale=10000)
gamma = uniform(scale=1)
degree = uniform(scale=1000)

params = {'kernel': kernel,
          'C': cost,
          'gamma': gamma,
          'degree': degree
}

In [27]:
rand_search_cv = RandomizedSearchCV(estimator=sklearn.svm.SVR(epsilon = 0.01),
                                   param_distributions=params,
                                   n_jobs=-1,
                                    n_iter=3000,
                                   verbose=2,
                                   cv=3)
result = rand_search_cv.fit(X_train, y_train)
best_params = result.best_params_

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:   55.7s
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | 

In [29]:
C = best_params['C']
degree = best_params['degree']
gamma = best_params['gamma']

### Without scalling

In [30]:
svm_reg = sklearn.svm.SVR(C = C, gamma = gamma, degree = degree)
svm_reg.fit(X_train, y_train)

SVR(C=9912.177704405041, cache_size=200, coef0=0.0, degree=379.4760225015611,
    epsilon=0.1, gamma=0.001382916073454421, kernel='rbf', max_iter=-1,
    shrinking=True, tol=0.001, verbose=False)

In [31]:
y_pred = svm_reg.predict(X_test)

model_score(y_pred, y_test)

explained_variance_score=0.7088702549466819
max_error=1922.757272300354
mean_squared_error=256676.3548358449


### With scalling

In [32]:
svm_reg = sklearn.svm.SVR(C = C, gamma = gamma, degree = degree)
svm_reg.fit(X_train_scaled, y_train_scaled)

SVR(C=9912.177704405041, cache_size=200, coef0=0.0, degree=379.4760225015611,
    epsilon=0.1, gamma=0.001382916073454421, kernel='rbf', max_iter=-1,
    shrinking=True, tol=0.001, verbose=False)

In [33]:
y_pred = svm_reg.predict(X_test_scaled)

model_score(y_pred, y_test_scaled)

explained_variance_score=0.9753696820500791
max_error=0.5144067186284033
mean_squared_error=0.02473366863995508


## Conclustion 2

Random Search method help us make noticeably better results (over three times!).

# Second dataset - *Energy efficiency*

Source: *https://data.world/uci/energy-efficiency*

*We perform energy analysis using 12 different building shapes simulated in Ecotect. The buildings differ with respect to the glazing area, the glazing area distribution, and the orientation, amongst other parameters. We simulate various settings as functions of the afore-mentioned characteristics to obtain 768 building shapes. The dataset comprises 768 samples and 8 features, aiming to predict two real valued responses. It can also be used as a multi-class classification problem if the response is rounded to the nearest integer.*

### Read data

In [34]:
data = pd.read_csv('ENB2012_data.csv')

In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
X1    768 non-null float64
X2    768 non-null float64
X3    768 non-null float64
X4    768 non-null float64
X5    768 non-null float64
X6    768 non-null int64
X7    768 non-null float64
X8    768 non-null int64
Y1    768 non-null float64
Y2    768 non-null float64
dtypes: float64(8), int64(2)
memory usage: 60.1 KB


In [36]:
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


### Divide into train and test sets

In [37]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(['Y1', 'Y2'], axis=1), data[['Y1']])

### Scalling data

In [47]:
scaler = StandardScaler() 

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
y_train_scaled = y_train.copy()
y_test_scaled = y_test.copy()


nums = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']

X_train_scaled[nums] = scaler.fit_transform(X_train_scaled[nums])
X_test_scaled[nums] = scaler.fit_transform(X_test_scaled[nums])
y_train_scaled = pd.DataFrame(scaler.fit_transform(y_train_scaled), columns=y_train_scaled.columns)
y_test_scaled = pd.DataFrame(scaler.fit_transform(y_test_scaled), columns=y_test_scaled.columns)

X_train_scaled

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8
380,-1.356977,1.543082,1.128909,0.978888,-1.006969,-1.346942,0.083637,-0.572320
60,0.503798,-0.650846,0.000979,-0.641341,0.993080,-1.346942,-1.054356,-1.219974
100,1.248108,-1.199328,0.000979,-1.181417,0.993080,-1.346942,-1.054356,-0.572320
575,-1.356977,1.543082,1.128909,0.978888,-1.006969,1.343828,1.221631,-1.219974
384,1.992418,-1.747810,-0.562986,-1.451455,0.993080,-1.346942,0.083637,0.075335
...,...,...,...,...,...,...,...,...
138,-1.170899,1.268841,0.564944,0.978888,-1.006969,0.446904,-1.054356,-0.572320
125,-0.519628,0.446118,-1.126951,0.978888,-1.006969,-0.450019,-1.054356,-0.572320
570,-1.170899,1.268841,0.564944,0.978888,-1.006969,0.446904,1.221631,-1.219974
75,-0.240512,0.171877,-1.690916,0.978888,-1.006969,1.343828,-1.054356,-1.219974


## SVM - without tuning

### Without scalling

In [40]:
svm_reg = sklearn.svm.SVR()
svm_reg.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [41]:
y_pred = svm_reg.predict(X_test)

model_score(y_pred, y_test)

explained_variance_score=0.7192408814753468
max_error=16.26897133016716
mean_squared_error=29.508220993983954


### With scalling

In [50]:
svm_reg = sklearn.svm.SVR()
svm_reg.fit(X_train_scaled, y_train_scaled)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [51]:
y_pred = svm_reg.predict(X_test_scaled)

model_score(y_pred, y_test_scaled)

explained_variance_score=0.9547586888664563
max_error=0.7065076504683079
mean_squared_error=0.045405214496254


## SVM - with hyperparameteres tuning
#### Random Search Method

In [53]:
rand_search_cv = RandomizedSearchCV(estimator=sklearn.svm.SVR(epsilon = 0.01),
                                   param_distributions=params,
                                   n_jobs=-1,
                                    n_iter=500,
                                   verbose=2,
                                   cv=3)
result = rand_search_cv.fit(X_train, y_train)
best_params = result.best_params_

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  5.3min finished


In [54]:
C = best_params['C']
degree = best_params['degree']
gamma = best_params['gamma']

### Without scalling

In [55]:
svm_reg = sklearn.svm.SVR(C = C, gamma = gamma, degree = degree)
svm_reg.fit(X_train, y_train)
y_pred = svm_reg.predict(X_test)

model_score(y_pred, y_test)

explained_variance_score=0.9962285406459845
max_error=3.1294842434118095
mean_squared_error=0.38751251786556623


### With scalling

In [56]:
svm_reg = sklearn.svm.SVR(C = C, gamma = gamma, degree = degree)
svm_reg.fit(X_train_scaled, y_train_scaled)

y_pred = svm_reg.predict(X_test_scaled)

model_score(y_pred, y_test_scaled)

explained_variance_score=0.9688113224548751
max_error=0.4954438003905063
mean_squared_error=0.0313485403887


## Conclusions

The results are the same as in the first dataframe:
- scalling is a good tools to make model better
- what is obvious hyperparameteres tuning make that model is better