In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVR,SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import category_encoders as ce
import matplotlib as plt
from scipy.stats import uniform

Zbiór apartments
====

In [33]:
data = pd.read_csv('apartments.cvs')

In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Unnamed: 0           1000 non-null int64
m2.price             1000 non-null int64
construction.year    1000 non-null int64
surface              1000 non-null int64
floor                1000 non-null int64
no.rooms             1000 non-null int64
district             1000 non-null object
dtypes: int64(6), object(1)
memory usage: 54.8+ KB


In [36]:
# Jako zmienną celu potraktuję zmienną m2.price toworząc w ten sposób zadanie Regresi

# kolumna pierwsza to indeksy można usunąć 
data = data.drop('Unnamed: 0',axis=1)


In [37]:
# Podziła na zbiór testowy/ternignowy 
X_train, X_test, y_train, y_test = train_test_split(data.drop('m2.price', axis=1), data['m2.price'])

In [38]:
#Użyję One-hote encodingu ponieważ Target nie da się w prosty sposób zastosować dla regresi
data.head()

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms,district
0,5897,1953,25,3,1,Srodmiescie
1,1818,1992,143,9,5,Bielany
2,3643,1937,56,1,2,Praga
3,3517,1995,93,7,3,Ochota
4,3013,1992,144,6,5,Mokotow


In [39]:

# SVM bez skalowania 


svm_ = SVR()
ohe = ce.OneHotEncoder()

# OHE
svm_.fit(ohe.fit_transform(X_train),y_train)
y_pred_ohe = svm_.predict(ohe.transform(X_test))



print(f'RMSE : {mean_squared_error(y_test,y_pred_ohe,squared=False)}')
print(f'R2 score : {r2_score(y_test,y_pred_ohe)}')




RMSE : 892.2402763012336
R2 score : -0.03131450175057626


Bez skalowania wynik tregiczne gorsze niż podawanie stałej wartosci dla każdego przypadku.

In [40]:
#
scal = StandardScaler(copy=False)

svm_ = SVR()
ohe = ce.OneHotEncoder()

columns_to_sclae =X_train.columns[:-1]

scaled_X_train = X_train.copy()
scaled_X_test = X_test.copy()

scaled_X_train[columns_to_sclae] = scal.fit_transform(scaled_X_train[columns_to_sclae])
scaled_X_test[columns_to_sclae] = scal.transform(scaled_X_test[columns_to_sclae])

# OHE
svm_.fit(ohe.fit_transform(scaled_X_train),y_train)
y_pred_ohe = svm_.predict(ohe.transform(scaled_X_test))



print(f'RMSE_scaled : {mean_squared_error(y_test,y_pred_ohe,squared=False)}')
print(f'R2 score_scaled : {r2_score(y_test,y_pred_ohe)}')

RMSE_scaled : 871.725689830553
R2 score_scaled : 0.015564725670756063


Po przeskalowaniu wynik jest wyraznie lepszy choć nie zbyt zadowalający. Może pomoże dobranie własciwych parametrów.

In [43]:
from scipy.stats import uniform

param_distribution = {
    'gamma': uniform(),
    'C':uniform(0,10000),
    
    
}
rps = RandomizedSearchCV(SVR(epsilon=0.1),param_distributions=param_distribution,cv=5,n_iter=1000,verbose=6,n_jobs=-1)
# Poniważ używam one hota nie ma potrzeby użyć piplinu z kodowaniem


Unnamed: 0,construction.year,surface,floor,no.rooms,district
961,0.221048,-1.613005,-0.570362,-1.709880,Srodmiescie
720,-0.982549,0.288003,-0.570362,-0.255900,Mokotow
600,1.075214,0.552032,1.481301,1.198079,Srodmiescie
416,0.026919,1.449730,-0.228419,1.925069,Bielany
469,0.492828,0.155988,1.139357,0.471089,Mokotow
...,...,...,...,...,...
383,1.385819,-1.243365,1.139357,-0.982890,Wola
94,0.221048,-1.111350,-1.596194,-0.982890,Bemowo
3,1.152865,0.208794,0.455469,-0.255900,Ochota
447,0.065745,-1.586602,0.455469,-1.709880,Praga


In [44]:
rps.fit(ohe.fit_transform(scaled_X_train),y_train)


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 297 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1497 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2097 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2797 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elap

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='scale', kernel='rbf',
                                 max_iter=-1, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='deprecated', n_iter=1000, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd3cc8e0b10>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd3ad04ccd0>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=6)

In [45]:
print(f'Best score: {rps.best_score_}')
rps.best_params_ 
# 



Best score: 0.9735558162852076


{'C': 6233.262130988943, 'gamma': 0.017704464980319745}

In [46]:

svm_best=rps.best_estimator_
svm_best.fit(ohe.fit_transform(scaled_X_train),y_train)
y_pred_ohe = svm_best.predict(ohe.transform(scaled_X_test))



print(f'RMSE : {mean_squared_error(y_test,y_pred_ohe,squared=False)}')
print(f'R2 score : {r2_score(y_test,y_pred_ohe)}')

# Wynik znacznie (wielokrotnie) wyższy niż z podstawowymi parametrami 
# Strojenie parametrów bardzo pomogło 

RMSE : 153.16137451872436
R2 score : 0.9696103297560396


Drugi zbiór 
====
Z biblioteki OpenMl https://www.openml.org/d/1462. Zadanie polega na klasyfikacij binarnej. Klasyfikujemy czy dana soba ma cukrzycę czy nie.

In [2]:
data2 = pd.read_csv('dataset_37_diabetes.csv')
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
preg     768 non-null int64
plas     768 non-null int64
pres     768 non-null int64
skin     768 non-null int64
insu     768 non-null int64
mass     768 non-null float64
pedi     768 non-null float64
age      768 non-null int64
class    768 non-null object
dtypes: float64(2), int64(6), object(1)
memory usage: 54.1+ KB


In [3]:

#Żadna z danych nie jest bardzo skośna (nie potrzeba dodatkowych przekształceń)
from collections import Counter

print(Counter(data2['class']))
# Klasy nie są idelanie zbalansowane ale miary takie jak acc powinny mieć sens 
data2.skew()

Counter({'tested_negative': 500, 'tested_positive': 268})


preg    0.901674
plas    0.173754
pres   -1.843608
skin    0.109372
insu    2.272251
mass   -0.428982
pedi    1.919911
age     1.129597
dtype: float64

In [4]:
# Trzeba poprawić labele 
data2['class'] = np.where(data2['class']=='tested_negative',0,1)

In [5]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(data2.drop('class', axis=1), data2['class'])

In [6]:
# skalowanie 
scal = StandardScaler(copy=False)

X_train2 = pd.DataFrame(scal.fit_transform(X_train2))
X_test2 = pd.DataFrame(scal.transform(X_test2))

In [7]:
# test bez CV i z podstawowymi parametrami 
svm_ = SVC(probability=True)
# Tym razem nie ma potrzeby kodowania 
svm_.fit(X_train2,y_train2)

from sklearn.metrics import accuracy_score,auc,roc_curve,f1_score,recall_score


y_pred = svm_.predict(X_test2)





print(f'Accuracy : {accuracy_score(y_test2,y_pred)}')
print(f'F1_score : {f1_score(y_test2,y_pred)}')
print(f'Reccal : {recall_score(y_test2,y_pred)}')

Accuracy : 0.7447916666666666
F1_score : 0.588235294117647
Reccal : 0.5147058823529411


Wyniki zwłaszcza reccal nie są zbyt dobre.

In [12]:
param_distribution = {
    
    'gamma': uniform(),
    
    'C':uniform(0,10000),
    
}
rps_c = RandomizedSearchCV(SVC(),param_distributions=param_distribution,cv=5,n_jobs=-1,n_iter=1000,verbose=6)

In [13]:
rps_c.fit(X_train2,y_train2)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 832 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 2832 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 3324 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 4072 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 4924 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   39.1s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=1000, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f5502abd8d0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f55031a5810>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=6)

In [14]:
print(rps_c.best_score_)
# Najlepszy wynik nie różni się znacznie od parametrów domyślnych
rps_c.best_params_
# Może to oznaczać zły kernel albo duży błąd treningowy sprawdżmy

0.7656071964017991


{'C': 2.6180354960303465, 'gamma': 0.1912181329001693}

In [12]:
svm_c_best = rps_c.best_estimator_

svm_c_best.fit(X_train2,y_train2)



y_pred = svm_c_best.predict(X_train2)





print(f'Accuracy train : {accuracy_score(y_train2,y_pred)}')
# Zgodnie z przewidywaniami algorytm po prostu nie dopasowuję wystarczająco się do danych sprawdzę jescze inne jądra 


Accuracy train : 0.8038194444444444


In [10]:
svm_c = SVC(kernel='linear')

svm_c.fit(X_train2,y_train2)
y_pred = svm_c.predict(X_test2)

print(f'Accuracy linear : {accuracy_score(y_test2,y_pred)}')
#Wynik podobny do jądra gausowskiego 

Accuracy linear : 0.7552083333333334


In [11]:
svm_c = SVC(kernel='poly')

svm_c.fit(X_train2,y_train2)
y_pred = svm_c.predict(X_test2)

print(f'Accuracy poly : {accuracy_score(y_test2,y_pred)}')
# Nieco niższy niż w pozostałych przypadkach 

Accuracy poly : 0.7239583333333334


In [15]:
svm_c_best = rps_c.best_estimator_

svm_c_best.fit(X_train2,y_train2)



y_pred = svm_c_best.predict(X_test2)





print(f'Accuracy best : {accuracy_score(y_test2,y_pred)}')
print(f'F1_score best : {f1_score(y_test2,y_pred)}')
print(f'Reccal best : {recall_score(y_test2,y_pred)}')
# Zbiór okazał się dośc trudny dla SVM 

Accuracy best : 0.7604166666666666
F1_score best : 0.6101694915254238
Reccal best : 0.5625
