In [3]:
import pandas as pd
import numpy as np

### Apartments

#### Import danych

In [4]:
apartments = pd.read_csv('apartments.csv')
y = apartments['district']
X = apartments.drop(['Unnamed: 0', 'district'], axis=1)

#### Test modelu bez ustalenia hiperparametrów i skalowania

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.33, random_state=420)

from sklearn import svm

clf = svm.SVC()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.21212121212121213

#### Skalowanie danych

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.33, random_state=420)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.30303030303030304

Jak widzimy, accuracy wzrosło o około 50% dzięki skalowaniu, więc uwaga na ten temat w artykule była jak najbardziej słuszna.

#### Ustawianie hiperparametrów

In [8]:
from sklearn.model_selection import RandomizedSearchCV

cost = list(pd.array(range(10))/10)
gamma = ['scale', 'auto']
degree = list(range(10))

param_grid = {'C' : cost,'gamma': gamma, 'degree': degree}


random = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, cv = 10, n_jobs=-1)

random_result = random.fit(X_train, y_train)
# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))


Best: 0.314925 using {'gamma': 'auto', 'degree': 3, 'C': 0.9}


Ustalenie hiperparametrów nieznacznie poprawiło skuteczność modelu

### Australia

#### Import danych

In [14]:
australia = pd.read_csv('australia.csv')
australia = australia.iloc[1:100000,]

Tworzę dodatkową kolumnę zawierającą podział średniej między najwyższą a najniższą temperaturą dobową na kwantyle, aby przetestować model na zmiennej celu która nie jest binarna.

In [15]:
avg_temp = australia[['MinTemp','MaxTemp']].mean(axis=1)
australia = australia.drop(['MinTemp','MaxTemp'], axis = 1)
q1, q2, q3 = np.quantile(avg_temp, q = [0.25, 0.5, 0.75])
print(avg_temp.describe())
y = avg_temp.apply(lambda x: 1 if x < q1
                                                         else  (2 if x < q2
                                                               else (3 if x < q3
                                                                    else 4)))
X = australia
print(q1, q2, q3)

count    56419.000000
mean        18.841851
std          6.261545
min          0.950000
25%         13.800000
50%         18.550000
75%         23.800000
max         38.800000
dtype: float64
13.8 18.55 23.799999999999997


#### Test modelu bez ustalenia hiperparametrów i skalowania

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.33, random_state=420)

from sklearn import svm

clf = svm.SVC()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6834416456308072

#### Skalowanie danych

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.33, random_state=420)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8828615929964015

Skalowanie danych znacząco poprawiło działanie modelu.

#### Ustawianie hiperparametrów

In [18]:
from sklearn.model_selection import RandomizedSearchCV

cost = list(pd.array(range(10))/10)
gamma = ['scale', 'auto']
degree = list(range(10))

param_grid = {'C' : cost,'gamma': gamma, 'degree': degree}


random = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, cv = 10, n_jobs=-1)

random_result = random.fit(X_train, y_train)
# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))


Best: 0.884974 using {'gamma': 'scale', 'degree': 9, 'C': 0.9}


Ustalenie hiperparametrów minimalnie poprawiło skuteczność modelu