In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVR,SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import category_encoders as ce
import matplotlib as plt
from scipy.stats import uniform

Zbiór apartments
====

In [2]:
data = pd.read_csv('apartments.cvs')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Unnamed: 0           1000 non-null int64
m2.price             1000 non-null int64
construction.year    1000 non-null int64
surface              1000 non-null int64
floor                1000 non-null int64
no.rooms             1000 non-null int64
district             1000 non-null object
dtypes: int64(6), object(1)
memory usage: 54.8+ KB


In [4]:
# Jako zmienną celu potraktuję zmienną m2.price toworząc w ten sposób zadanie Regresi

# kolumna pierwsza to indeksy można usunąć 
data = data.drop('Unnamed: 0',axis=1)


In [5]:
# Podziła na zbiór testowy/ternignowy 
X_train, X_test, y_train, y_test = train_test_split(data.drop('m2.price', axis=1), data['m2.price'])

In [6]:
#Użyję One-hote encodingu 
data.head()

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms,district
0,5897,1953,25,3,1,Srodmiescie
1,1818,1992,143,9,5,Bielany
2,3643,1937,56,1,2,Praga
3,3517,1995,93,7,3,Ochota
4,3013,1992,144,6,5,Mokotow


In [9]:

# SVM bez skalowania 


svm_ = SVR()
ohe = ce.OneHotEncoder()

# OHE
svm_.fit(ohe.fit_transform(X_train),y_train)
y_pred_ohe = svm_.predict(ohe.transform(X_test))



print(f'RMSE : {mean_squared_error(y_test,y_pred_ohe,squared=False)}')
print(f'R2 score : {r2_score(y_test,y_pred_ohe)}')




RMSE : 882.0373554174482
R2 score : -0.004578174264286972


Bez skalowania wynik tregiczne gorsze niż podawanie stałej wartosci dla każdego przypadku.

In [10]:
# Strojenie parametrów bez skalowanie 
from scipy.stats import uniform

param_distribution = {
    'gamma': uniform(),
    'C':uniform(0,10000),
    
    
}
rps_no = RandomizedSearchCV(SVR(epsilon=0.1),param_distributions=param_distribution,cv=5,n_iter=1000,verbose=6,n_jobs=-1)
rps_no.fit(ohe.fit_transform(X_train),y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 178 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 578 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 852 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 1178 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   34.7s
[Parallel(n_jobs=-1)]: Done 1978 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done 2452 tasks      | elapsed:   52.6s
[Parallel(n_jobs=-1)]: Done 2978 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3552 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 4178 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 4852 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 4985 out of 5000 | elapsed:  1.8min remaining:    0.3s
[Parallel(n_jobs=-1)]: Done

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='scale', kernel='rbf',
                                 max_iter=-1, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='deprecated', n_iter=1000, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0725181e10>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0723641fd0>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=6)

In [14]:
best_no = rps_no.best_estimator_
rps_no.best_score_

0.5375722544386464

In [15]:
#
scal = StandardScaler(copy=False)

svm_ = SVR()
ohe = ce.OneHotEncoder()

columns_to_sclae =X_train.columns[:-1]

scaled_X_train = X_train.copy()
scaled_X_test = X_test.copy()

scaled_X_train[columns_to_sclae] = scal.fit_transform(scaled_X_train[columns_to_sclae])
scaled_X_test[columns_to_sclae] = scal.transform(scaled_X_test[columns_to_sclae])

# OHE
svm_.fit(ohe.fit_transform(scaled_X_train),y_train)
y_pred_ohe = svm_.predict(ohe.transform(scaled_X_test))



print(f'RMSE_scaled : {mean_squared_error(y_test,y_pred_ohe,squared=False)}')
print(f'R2 score_scaled : {r2_score(y_test,y_pred_ohe)}')

RMSE_scaled : 860.4664138232931
R2 score_scaled : 0.04395655490587591


Po przeskalowaniu wynik jest wyraznie lepszy choć nie zbyt zadowalający. Może pomoże dobranie własciwych parametrów.

In [16]:
from scipy.stats import uniform

param_distribution = {
    'gamma': uniform(),
    'C':uniform(0,10000),
    
    
}
rps = RandomizedSearchCV(SVR(epsilon=0.1),param_distributions=param_distribution,cv=5,n_iter=1000,verbose=6,n_jobs=-1)
# Poniważ używam one hota nie ma potrzeby użyć piplinu z kodowaniem


In [17]:
rps.fit(ohe.fit_transform(scaled_X_train),y_train)


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:   45.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   59.0s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elap

0.9737447991417044

In [18]:
print(f'Best score: {rps.best_score_}')
rps.best_params_ 
# 



Best score: 0.9737447991417044


{'C': 9434.121037971136, 'gamma': 0.014886955084316145}

In [21]:
# Porówanie wyników dla skalowania i bez skalowania 
ohe =ce.OneHotEncoder()
svm_best=rps.best_estimator_
svm_best.fit(ohe.fit_transform(scaled_X_train),y_train)
y_pred_ohe_s = svm_best.predict(ohe.transform(scaled_X_test))
ohe =ce.OneHotEncoder()
best_no.fit(ohe.fit_transform(X_train),y_train)
y_ped_ohe_no = best_no.predict(ohe.transform(X_test))



dane = {'Skalowane' : [mean_squared_error(y_test,y_pred_ohe,squared=False),r2_score(y_test,y_pred_ohe)],
        'Nie skalowane': [mean_squared_error(y_test,y_ped_ohe_no,squared=False),r2_score(y_test,y_ped_ohe_no)]}
pd.DataFrame(data=dane,columns=['Skalowane','Nie skalowane'],index=['RMSE','R2'])



Unnamed: 0,Skalowane,Nie skalowane
RMSE,154.652057,549.792143
R2,0.969117,0.609693


Wyraznie widać róznicę wynikającą tylko ze sklaowania danych (ewentualnie pechowe działanie RandomSearch ale nie powinien to być problem dla 1000 iteracij) zgodnie z artykułem skalowanie danych znacząco poprawia działanie SVM.

Drugi zbiór 
====
Z biblioteki OpenMl https://www.openml.org/d/1462. Zadanie polega na klasyfikacij binarnej. Klasyfikujemy czy dana soba ma cukrzycę czy nie.

In [23]:
data2 = pd.read_csv('dataset_37_diabetes.csv')
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
preg     768 non-null int64
plas     768 non-null int64
pres     768 non-null int64
skin     768 non-null int64
insu     768 non-null int64
mass     768 non-null float64
pedi     768 non-null float64
age      768 non-null int64
class    768 non-null object
dtypes: float64(2), int64(6), object(1)
memory usage: 54.1+ KB


In [24]:

#Żadna z danych nie jest bardzo skośna (nie potrzeba dodatkowych przekształceń)
from collections import Counter

print(Counter(data2['class']))
# Klasy nie są idelanie zbalansowane ale miary takie jak acc powinny mieć sens 
data2.skew()

Counter({'tested_negative': 500, 'tested_positive': 268})


preg    0.901674
plas    0.173754
pres   -1.843608
skin    0.109372
insu    2.272251
mass   -0.428982
pedi    1.919911
age     1.129597
dtype: float64

In [25]:
# Trzeba poprawić labele 
data2['class'] = np.where(data2['class']=='tested_negative',0,1)

In [26]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(data2.drop('class', axis=1), data2['class'])

In [27]:
# skalowanie 
scal = StandardScaler(copy=False)

X_train2 = pd.DataFrame(scal.fit_transform(X_train2))
X_test2 = pd.DataFrame(scal.transform(X_test2))

In [28]:
# test bez CV i z podstawowymi parametrami 
svm_ = SVC(probability=True)
# Tym razem nie ma potrzeby kodowania 
svm_.fit(X_train2,y_train2)

from sklearn.metrics import accuracy_score,auc,roc_curve,f1_score,recall_score


y_pred = svm_.predict(X_test2)





print(f'Accuracy : {accuracy_score(y_test2,y_pred)}')
print(f'F1_score : {f1_score(y_test2,y_pred)}')
print(f'Reccal : {recall_score(y_test2,y_pred)}')

Accuracy : 0.78125
F1_score : 0.6557377049180328
Reccal : 0.5555555555555556


Wyniki zwłaszcza reccal nie są zbyt dobre.

In [36]:
param_distribution = {
    
    'gamma': uniform(),
    
    'C':uniform(0,10000),
    
}
rps_c = RandomizedSearchCV(SVC(),param_distributions=param_distribution,cv=5,n_jobs=-1,n_iter=1000,verbose=6)

In [37]:
rps_c.fit(X_train2,y_train2)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 308 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 1108 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 1656 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 2308 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 3056 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 3908 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done 4856 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   47.1s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=1000, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f073f575550>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f073f7b6790>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=6)

In [14]:
print(rps_c.best_score_)
# Najlepszy wynik nie różni się znacznie od parametrów domyślnych
rps_c.best_params_
# Może to oznaczać zły kernel albo duży błąd treningowy sprawdżmy

0.7656071964017991


{'C': 2.6180354960303465, 'gamma': 0.1912181329001693}

In [12]:
svm_c_best = rps_c.best_estimator_

svm_c_best.fit(X_train2,y_train2)



y_pred = svm_c_best.predict(X_train2)





print(f'Accuracy train : {accuracy_score(y_train2,y_pred)}')
# Zgodnie z przewidywaniami algorytm po prostu nie dopasowuję wystarczająco się do danych sprawdzę jescze inne jądra 


Accuracy train : 0.8038194444444444


In [31]:
svm_c = SVC(kernel='linear')

svm_c.fit(X_train2,y_train2)
y_pred = svm_c.predict(X_test2)

print(f'Accuracy linear : {accuracy_score(y_test2,y_pred)}')
#Wynik podobny do jądra gausowskiego 
# Strojenie dla jądra linear 

param_distribution = {
    
    'C':uniform(0,100)    
}
rps_linear = RandomizedSearchCV(SVC(kernel='linear'),param_distributions=param_distribution,cv=5,n_jobs=-1,n_iter=100,verbose=6)
rps_linear.fit(X_train2,y_train2)

Accuracy linear : 0.7760416666666666
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   23.5s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='linear', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=100, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0740425450>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=6)

In [32]:
print(f'Best Score :{rps_linear.best_score_}')
# Wynik podobny jak w przypadku jądraa gausowiskiego 

Best Score :0.7726086956521738


In [33]:
svm_c = SVC(kernel='poly')

svm_c.fit(X_train2,y_train2)
y_pred = svm_c.predict(X_test2)

print(f'Accuracy poly : {accuracy_score(y_test2,y_pred)}')
# Nieco niższy niż w pozostałych przypadkach 
# Strojenie dla jądra poly 

param_distribution = {
    'gamma':uniform(),
    'C':uniform(0,100),
    'degree':[i for i in range(1,11)]
}
rps_poly = RandomizedSearchCV(SVC(kernel='poly'),param_distributions=param_distribution,cv=5,n_jobs=-1,n_iter=100,verbose=6)
rps_poly.fit(X_train2,y_train2)

Accuracy poly : 0.7239583333333334
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 263 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 379 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.1min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='poly', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=100, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f07403fa210>,
                                        'degree': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10],
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f07403877d0>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                

In [34]:
# Strojenie parametrów trwało znacznie dłużej niż dla innych jąder
rps_poly.best_score_
# Wynik podobny jak w poprzednich przypadkach

0.7726086956521738

In [38]:
svm_c_best = rps_c.best_estimator_
svm_linear_best = rps_linear.best_estimator_
svm_poly_best = rps_poly.best_estimator_

svm_c_best.fit(X_train2,y_train2)
svm_linear_best.fit(X_train2,y_train2)
svm_poly_best.fit(X_train2,y_train2)

y_pred_gaus = svm_c_best.predict(X_test2)
y_pred_linear = svm_linear_best.predict(X_test2)
y_pred_poly = svm_poly_best.predict(X_test2)


data = {'RBF':[accuracy_score(y_test2,y_pred_gaus),f1_score(y_test2,y_pred_gaus),recall_score(y_test2,y_pred_gaus)],
        'Linear':[accuracy_score(y_test2,y_pred_linear),f1_score(y_test2,y_pred_linear),recall_score(y_test2,y_pred_linear)],
        'Poly':[accuracy_score(y_test2,y_pred_poly),f1_score(y_test2,y_pred_poly),recall_score(y_test2,y_pred_poly)]}


pd.DataFrame(data=data,columns=['RBF','Linear','Poly'],index=['Accuracy','F1','Recall'])

Unnamed: 0,RBF,Linear,Poly
Accuracy,0.786458,0.770833,0.765625
F1,0.643478,0.627119,0.621849
Recall,0.513889,0.513889,0.513889


Wszystkie jądra uzyskły bardzo zbliżone i nie zbyt wysokie wyniki jak wspominałem wcześniej może to wynikać z trudności zbioru. SVM uzyskuje też duży bład treningowy zbliżony do testowego co prawdopodobnie oznacza ,że jest to model zbyt prosty dla wybranego zbioru. 