In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn import preprocessing

## Zbiór apartments

In [3]:
apartments = pd.read_csv("~/Documents/github/apartments.csv")

In [4]:
apartments

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms,district
0,5897,1953,25,3,1,Srodmiescie
1,1818,1992,143,9,5,Bielany
2,3643,1937,56,1,2,Praga
3,3517,1995,93,7,3,Ochota
4,3013,1992,144,6,5,Mokotow
...,...,...,...,...,...,...
995,6355,1921,44,2,2,Srodmiescie
996,3422,1921,48,10,2,Bemowo
997,3098,1980,85,3,3,Bemowo
998,4192,1942,36,7,1,Zoliborz


Zastosujemy label encoding dla zmiennej *district*:

In [5]:
le = preprocessing.LabelEncoder()
le.fit(apartments.loc[:,"district"])

LabelEncoder()

In [6]:
apartments.loc[:,"district_enc"] = le.transform(apartments.loc[:,"district"])

In [7]:
X = apartments[['construction.year','surface','floor','no.rooms', 'district_enc']]
Y = apartments['m2.price']
X.head(10)

Unnamed: 0,construction.year,surface,floor,no.rooms,district_enc
0,1953,25,3,1,5
1,1992,143,9,5,1
2,1937,56,1,2,4
3,1995,93,7,3,3
4,1992,144,6,5,2
5,1926,61,6,2,5
6,1970,127,8,5,2
7,1985,105,8,4,6
8,1928,145,6,6,5
9,1949,112,9,4,5


In [8]:
x_train, x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [9]:
from sklearn.svm import SVR
clf = SVR()
clf.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [10]:
y_pred = clf.predict(x_test)

In [11]:
rmse = 0 
for i in range(200):
    rmse = rmse + (y_test.iloc[i] - y_pred[i])**2
rmse/=200
np.sqrt(rmse)

954.0734166653851

Szukanie najlepszych parametrów:

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

clf = SVR()
distributions = dict(C=uniform(loc=0.01, scale=3),
                     gamma=uniform(loc=0.1, scale=0.5),
                     degree=[2, 3, 4, 5])
searcher = RandomizedSearchCV(clf, distributions, random_state=0)
search = searcher.fit(x_train, y_train)
search.best_params_

{'C': 1.1883543883024892, 'degree': 3, 'gamma': 0.13551802909894348}

In [13]:
best_clf = SVR(C=1.18835, degree=3, gamma=0.1355)
best_clf.fit(x_train, y_train)

SVR(C=1.18835, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1355,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [14]:
y_pred = best_clf.predict(x_test)

Błąd średniokwadratowy:

In [15]:
rmse = 0 
for i in range(200):
    rmse = rmse + (y_test.iloc[i] - y_pred[i])**2
rmse/=200
np.sqrt(rmse)

954.1007679035671

Porównywalnie do poprzedniego wyniku, możemy wywnioskować, że algorytm dobrze dobiera paramety.

## Zbiór stock

Zbiór z OpenMLa zawierający informację o dziennych cenach akcji 10 firm zajmujących się branżą kosmiczną od stycznia 1988 do października 1991. Jako zmienną celu autor wskazał ceny akcji ostatniej z kolei firmy. 

In [16]:
data = pd.read_csv("~/Documents/github/stock.csv")

In [17]:
data

Unnamed: 0,company1,company2,company3,company4,company5,company6,company7,company8,company9,company10
0,17.219,50.500,18.750,43.000,60.875,26.375,67.750,19.000,48.750,34.875
1,17.891,51.375,19.625,44.000,62.000,26.125,68.125,19.125,48.750,35.625
2,18.438,50.875,19.875,43.875,61.875,27.250,68.500,18.250,49.000,36.375
3,18.672,51.500,20.000,44.000,62.625,27.875,69.375,18.375,49.625,36.250
4,17.438,49.000,20.000,41.375,59.750,25.875,63.250,16.500,47.500,35.500
...,...,...,...,...,...,...,...,...,...,...
945,50.375,46.250,19.375,52.250,61.875,23.500,78.625,26.625,41.875,44.375
946,50.750,46.375,19.625,50.875,64.625,23.250,77.625,26.500,40.750,45.000
947,50.625,46.625,19.625,50.875,64.625,23.250,75.000,26.250,41.250,44.125
948,50.125,47.000,19.875,50.750,62.750,22.875,74.500,25.250,40.625,43.875


In [18]:
X = data[['company1','company2','company3','company4','company5','company6','company7','company8','company9']]
Y = data['company10']

In [19]:
x_train, x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [20]:
clf = SVR()
clf.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [21]:
y_pred = clf.predict(x_test)

Błąd średniokwadratowy:

In [22]:
rmse = 0 
for i in range(len(y_pred)):
    rmse = rmse + (y_test.iloc[i] - y_pred[i])**2
rmse/=len(y_pred)
np.sqrt(rmse)

1.662568689199444

Szukanie najlepszych parametrów:

In [23]:
clf = SVR()
distributions = dict(C=uniform(loc=0.01, scale=3),
                     gamma=uniform(loc=0.1, scale=0.5),
                     degree=[2, 3, 4, 5])
searcher = RandomizedSearchCV(clf, distributions, random_state=0)
search = searcher.fit(x_train, y_train)
search.best_params_

{'C': 1.5714324386536145, 'degree': 5, 'gamma': 0.15913721293446662}

In [24]:
best_clf = SVR(C=1.57143, degree=5, gamma=0.1591)
best_clf.fit(x_train, y_train)

SVR(C=1.57143, cache_size=200, coef0=0.0, degree=5, epsilon=0.1, gamma=0.1591,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [25]:
y_pred = best_clf.predict(x_test)

In [26]:
rmse = 0 
for i in range(len(y_pred)):
    rmse = rmse + (y_test.iloc[i] - y_pred[i])**2
rmse/=len(y_pred)
np.sqrt(rmse)

3.371776914725569

Tutaj wynik jest zauważalnie gorszy, algorytm dobrał hiperparametry lepiej niż random search.