In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import SVR, SVC

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

## Apartments

In [2]:
# Load the apartments data set; the first CSV column is used as the row index.
data = pd.read_csv(
    filepath_or_buffer="./apartments.csv",
    index_col=0,
)

In [3]:
# One-hot encode the 'district' column and forward every other column
# unchanged to the next pipeline step.
column_transformer = ColumnTransformer(
    transformers=[
        ('OneHot', OneHotEncoder(), ['district']),
    ],
    remainder='passthrough',
)

In [4]:
# Hold out a test set (scikit-learn's default split size) for the final
# comparison. The target is the 'm2.price' column; all remaining columns are
# features. `random_state` is fixed so that the split -- and therefore the
# RMSE values reported below -- can be reproduced on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['m2.price']),
    data['m2.price'],
    random_state=42,
)

In [5]:
# Regression pipeline WITH feature scaling:
# column transformer -> StandardScaler -> SVR.
model_scaling = Pipeline(
    steps=[
        ('transformer', column_transformer),
        ('scaler', StandardScaler()),
        ('regressor', SVR()),
    ]
)

In [6]:
# Regression pipeline WITHOUT feature scaling:
# column transformer -> SVR (baseline for the comparison).
model_no_scaling = Pipeline(
    steps=[
        ('transformer', column_transformer),
        ('regressor', SVR()),
    ]
)

In [7]:
# Candidate hyper-parameters for the 'regressor' pipeline step:
#   C     -- 5 log-spaced values from 1 to 1000
#   gamma -- 5 log-spaced values from 1e-4 to 1
param_grid = dict(
    regressor__C=np.logspace(0, 3, num=5),
    regressor__gamma=np.geomspace(1e-4, 1, num=5),
)

In [8]:
# Randomized hyper-parameter search, scored by cross-validated RMSE, for the
# pipeline that standardizes its features. `random_state` pins which
# parameter combinations are sampled so the result can be reproduced.
search_scaling = RandomizedSearchCV(
    model_scaling,
    param_grid,
    scoring='neg_root_mean_squared_error',
    random_state=42,
).fit(X_train, y_train)

# Evaluate the best pipeline found by the search on the held-out test set.
y_pred = search_scaling.best_estimator_.predict(X_test)
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and scheduled for removal in 1.6; taking the square root of the MSE gives
# the same RMSE on every version.
RMSE_scaling = np.sqrt(mean_squared_error(y_test, y_pred))

In [9]:
# Randomized hyper-parameter search, scored by cross-validated RMSE, for the
# pipeline that does NOT scale its features. `random_state` pins which
# parameter combinations are sampled so the result can be reproduced.
search_no_scaling = RandomizedSearchCV(
    model_no_scaling,
    param_grid,
    scoring='neg_root_mean_squared_error',
    random_state=42,
).fit(X_train, y_train)

# Evaluate the best pipeline found by the search on the held-out test set.
y_pred = search_no_scaling.best_estimator_.predict(X_test)
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and scheduled for removal in 1.6; taking the square root of the MSE gives
# the same RMSE on every version.
RMSE_no_scaling = np.sqrt(mean_squared_error(y_test, y_pred))

In [10]:
# Report the test-set RMSE of both pipelines, rounded to two decimals.
print(
    f'RMSE with scaling:    {round(RMSE_scaling, 2)}',
    f'RMSE without scaling: {round(RMSE_no_scaling, 2)}',
    sep='\n',
)

RMSE with scaling:    145.67
RMSE without scaling: 547.52


As we can see, the SVR trained on standardized features achieves a much lower test RMSE than the SVR trained on unscaled features.

## Australia

In [23]:
# Load only the first 1000 rows of the Australia data set.
data = pd.read_csv(
    filepath_or_buffer="./australia.csv",
    nrows=1000,
)

In [24]:
# Hold out a test set (scikit-learn's default split size) for the final
# comparison. The target is the 'RainTomorrow' column; all remaining columns
# are features. `random_state` is fixed so that the split -- and therefore the
# AUC values reported below -- can be reproduced on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['RainTomorrow']),
    data['RainTomorrow'],
    random_state=42,
)

In [25]:
# Classification pipeline WITH feature scaling: StandardScaler -> SVC.
# `probability=True` enables `predict_proba`; scikit-learn fits those
# probability estimates with an internal, randomized cross-validation, so
# `random_state` is fixed to make the predicted probabilities reproducible.
model_scaling = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC(probability=True, random_state=42)),
])

In [26]:
# Classification pipeline WITHOUT feature scaling: SVC only (baseline for the
# comparison). `probability=True` enables `predict_proba`; scikit-learn fits
# those probability estimates with an internal, randomized cross-validation,
# so `random_state` is fixed to make the predicted probabilities reproducible.
model_no_scaling = Pipeline([
    ('classifier', SVC(probability=True, random_state=42)),
])

In [27]:
# Candidate hyper-parameters for the 'classifier' pipeline step:
#   C     -- 10 log-spaced values from 1 to 1000
#   gamma -- 10 log-spaced values from 1e-4 to 1
param_grid = dict(
    classifier__C=np.logspace(0, 3, num=10),
    classifier__gamma=np.geomspace(1e-4, 1, num=10),
)

In [28]:
# Randomized hyper-parameter search, scored by cross-validated ROC AUC, for
# the pipeline that standardizes its features. `random_state` pins which
# parameter combinations are sampled so the result can be reproduced.
search_scaling = RandomizedSearchCV(
    model_scaling,
    param_grid,
    scoring='roc_auc',
    random_state=42,
).fit(X_train, y_train)

# Class probabilities from the best pipeline on the held-out test set.
y_pred = search_scaling.best_estimator_.predict_proba(X_test)
# Column 1 is used as the score of the positive class
# (assumes a binary 'RainTomorrow' target -- TODO confirm).
AUC_scaling = roc_auc_score(y_test, y_pred[:, 1])

In [29]:
# Randomized hyper-parameter search, scored by cross-validated ROC AUC, for
# the pipeline that does NOT scale its features. `random_state` pins which
# parameter combinations are sampled so the result can be reproduced.
search_no_scaling = RandomizedSearchCV(
    model_no_scaling,
    param_grid,
    scoring='roc_auc',
    random_state=42,
).fit(X_train, y_train)

# Class probabilities from the best pipeline on the held-out test set.
y_pred = search_no_scaling.best_estimator_.predict_proba(X_test)
# Column 1 is used as the score of the positive class
# (assumes a binary 'RainTomorrow' target -- TODO confirm).
AUC_no_scaling = roc_auc_score(y_test, y_pred[:, 1])

In [30]:
# Report the test-set ROC AUC of both pipelines, rounded to two decimals.
print(
    f'AUC with scaling:    {round(AUC_scaling, 2)}',
    f'AUC without scaling: {round(AUC_no_scaling, 2)}',
    sep='\n',
)

AUC with scaling:    0.91
AUC without scaling: 0.89


As with regression, classification with Support Vector Machines benefits from scaling the features before modelling, although the improvement here is much smaller.