In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline

np.random.seed(123)

Wczytanie danych

In [2]:
df = pd.read_csv('../../Grupa1/apartments.csv')
df_test = pd.read_csv('../../Grupa1/apartments_test.csv')
df = df.append(df_test)

In [3]:
df.head()

Unnamed: 0,m2.price,construction.year,surface,floor,no.rooms,district
0,5897,1953,25,3,1,Srodmiescie
1,1818,1992,143,9,5,Bielany
2,3643,1937,56,1,2,Praga
3,3517,1995,93,7,3,Ochota
4,3013,1992,144,6,5,Mokotow


In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['district'], axis = 1),
                                                    df['district'], test_size = 0.3)

In [5]:
model = SVC()

In [7]:
model.fit(X_train, y_train)
y = model.predict(X_test)
accuracy_score(y_test, y)

0.237

Słabo. Spóbujmy dostroić parametry:

In [8]:
skl = StandardScaler()

X_train_skl = skl.fit_transform(X_train)
X_test_skl = skl.transform(X_test)

In [9]:
model = SVC()
model.fit(X_train_skl, y_train)
y = model.predict(X_test_skl)
accuracy_score(y_test, y)

0.30566666666666664

Przeskalowanie pomogło, teraz dostroimy parametry:

In [10]:
model = SVC()
params = {
    'C': np.arange(0.1, 10, 0.5),
    'degree': np.arange(1, 6, 1),
    'gamma': ["scale", "auto"] + np.arange(0.01, 0.5, 0.1).tolist()
}

grid = RandomizedSearchCV(model, params, cv=5)
grid.fit(X_train_skl, y_train)

print("Score: {}".format(grid.best_score_))
grid.best_params_

Score: 0.3008571428571428


{'gamma': 0.01, 'degree': 1, 'C': 9.1}

Minimalnie gorzej.

Zobaczmy jeszcze inne jądra:
* ‘linear’
* 'poly'
* 'sigmoid'

In [12]:
model = SVC(kernel='linear')
params = {
    'C': np.arange(0.1, 10, 0.5),
    'degree': np.arange(1, 6, 1),
    'gamma': ["scale", "auto"] + np.arange(0.01, 0.5, 0.1).tolist()
}

grid = RandomizedSearchCV(model, params, cv=5)
grid.fit(X_train_skl, y_train)

print("Score: {}".format(grid.best_score_))
grid.best_params_

Score: 0.292


{'gamma': 'scale', 'degree': 3, 'C': 3.1}

In [13]:
model = SVC(kernel = 'poly')
params = {
    'C': np.arange(0.1, 10, 0.5),
    'degree': np.arange(1, 6, 1),
    'gamma': ["scale", "auto"] + np.arange(0.01, 0.5, 0.1).tolist()
}

grid = RandomizedSearchCV(model, params, cv=5)
grid.fit(X_train_skl, y_train)

print("Score: {}".format(grid.best_score_))
grid.best_params_

Score: 0.2997142857142857


{'gamma': 0.21000000000000002, 'degree': 3, 'C': 3.1}

In [14]:
model = SVC(kernel = 'sigmoid')
params = {
    'C': np.arange(0.1, 10, 0.5),
    'degree': np.arange(1, 6, 1),
    'gamma': ["scale", "auto"] + np.arange(0.01, 0.5, 0.1).tolist()
}

grid = RandomizedSearchCV(model, params, cv=5)
grid.fit(X_train_skl, y_train)

print("Score: {}".format(grid.best_score_))
grid.best_params_

Score: 0.26399999999999996


{'gamma': 0.01, 'degree': 5, 'C': 1.6}

Wygrywa minimalnie kernel rbf. Pomimo strojenia parametrów dla innych kerneli nie przewyższyły one automatycznych ustawień SVM().

## Drugi zbiorek
Chciałam znaleźć mecze z Overwatch, ale niestety zbiorki są ubogie z tej dziedziny :(

Dlatego użyjemy zbiorku z openml id = 54 'vehicles'

In [15]:
import openml
df, y, categorical_indicator, attribute_names = openml.datasets.get_dataset(54).get_data()

In [16]:
df.head()

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class
0,95.0,48.0,83.0,178.0,72.0,10.0,162.0,42.0,20.0,159.0,176.0,379.0,184.0,70.0,6.0,16.0,187.0,197.0,van
1,91.0,41.0,84.0,141.0,57.0,9.0,149.0,45.0,19.0,143.0,170.0,330.0,158.0,72.0,9.0,14.0,189.0,199.0,van
2,104.0,50.0,106.0,209.0,66.0,10.0,207.0,32.0,23.0,158.0,223.0,635.0,220.0,73.0,14.0,9.0,188.0,196.0,saab
3,93.0,41.0,82.0,159.0,63.0,9.0,144.0,46.0,19.0,143.0,160.0,309.0,127.0,63.0,6.0,10.0,199.0,207.0,van
4,85.0,44.0,70.0,205.0,103.0,52.0,149.0,45.0,19.0,144.0,241.0,325.0,188.0,127.0,9.0,11.0,180.0,183.0,bus


In [17]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Class'], axis = 1),
                                                    df['Class'], test_size = 0.3)

Zobaczmy jak ma się bez skalowania:

In [18]:
model = SVC()
model.fit(X_train, y_train)
y = model.predict(X_test)
accuracy_score(y_test, y)

0.4881889763779528

I ze skalowaniem:

In [19]:
skl = StandardScaler()

X_train_skl = skl.fit_transform(X_train)
X_test_skl = skl.transform(X_test)

model = SVC()
model.fit(X_train_skl, y_train)
y = model.predict(X_test_skl)
accuracy_score(y_test, y)

0.7913385826771654

Ze skalowaniem widać znaczną różnicę. Zobaczmy jak z grid searchem:

In [25]:
model = SVC()
params = {
    'C': np.arange(0.1, 10, 0.5),
    'degree': np.arange(1, 6, 1),
    'gamma': ["scale", "auto"] + np.arange(0.01, 0.5, 0.1).tolist()
}

grid = RandomizedSearchCV(model, params, cv=5)
grid.fit(X_train_skl, y_train)

print("Score: {}".format(grid.best_score_))
grid.best_params_

Score: 0.7787921948440394


{'gamma': 'scale', 'degree': 2, 'C': 8.1}

Ponownie sprawdzimy inne jądra:

In [21]:
model = SVC(kernel='linear')
params = {
    'C': np.arange(0.1, 10, 0.5),
    'degree': np.arange(1, 6, 1),
    'gamma': ["scale", "auto"] + np.arange(0.01, 0.5, 0.1).tolist()
}

grid = RandomizedSearchCV(model, params, cv=5)
grid.fit(X_train_skl, y_train)

print("Score: {}".format(grid.best_score_))
grid.best_params_

Score: 0.7855148839196696


{'gamma': 0.21000000000000002, 'degree': 2, 'C': 5.6}

In [22]:
model = SVC(kernel = 'poly')
params = {
    'C': np.arange(0.1, 10, 0.5),
    'degree': np.arange(1, 6, 1),
    'gamma': ["scale", "auto"] + np.arange(0.01, 0.5, 0.1).tolist()
}

grid = RandomizedSearchCV(model, params, cv=5)
grid.fit(X_train_skl, y_train)

print("Score: {}".format(grid.best_score_))
grid.best_params_

Score: 0.7854294260076913


{'gamma': 0.21000000000000002, 'degree': 2, 'C': 3.1}

In [23]:
model = SVC(kernel = 'sigmoid')
params = {
    'C': np.arange(0.1, 10, 0.5),
    'degree': np.arange(1, 6, 1),
    'gamma': ["scale", "auto"] + np.arange(0.01, 0.5, 0.1).tolist()
}

grid = RandomizedSearchCV(model, params, cv=5)
grid.fit(X_train_skl, y_train)

print("Score: {}".format(grid.best_score_))
grid.best_params_

Score: 0.6672838626976214


{'gamma': 0.01, 'degree': 3, 'C': 3.6}

Najlepszy okazał się ponownie kernel rbf.