# Australian weather

## Przygotowanie

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

np.random.seed(123)

In [12]:
data = pd.read_csv('../../australia.csv')

In [13]:
data.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,0,0
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,0,0
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,0,0
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,0,0
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,0,0


In [14]:
y = np.array(data['RainTomorrow'])
X = data.drop(['RainTomorrow'], axis=1)

## Podział zbioru

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

## Trening klasyfikatorów

### MLPClassifier

In [24]:
from sklearn.neural_network import MLPClassifier

In [25]:
mlp = MLPClassifier(learning_rate = 'adaptive')

In [26]:
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

### Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
rfc = RandomForestClassifier(max_depth=15, n_estimators=20, max_features=2)

In [33]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### KNN

In [32]:
from sklearn.neighbors import KNeighborsClassifier

In [34]:
knn = KNeighborsClassifier(10)

In [35]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

## Przewidywanie

In [36]:
y_mlp = mlp.predict(X_test)

In [41]:
y_knn = knn.predict(X_test)

In [42]:
y_rfc = rfc.predict(X_test)

## Ocena

In [58]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

#### Accuracy

In [48]:
accuracy_score(y_test, y_mlp)

0.8169088975540588

In [49]:
accuracy_score(y_test, y_knn)

0.8497873094647288

In [51]:
accuracy_score(y_test, y_rfc)

0.8520028358738037

#### F1 score

In [53]:
f1_score(y_test, y_mlp)

0.32922077922077925

In [54]:
f1_score(y_test, y_knn)

0.5803416687298837

In [56]:
f1_score(y_test, y_rfc)

0.5966183574879228

#### ROC AUC

In [59]:
roc_auc_score(y_test, y_mlp)

0.5972130310667838

In [61]:
roc_auc_score(y_test, y_knn)

0.7135883530407299

In [63]:
roc_auc_score(y_test, y_rfc)

0.7240359098113911

### Wnioski

Jak widać sprawdzając każdą z powyższych miar, najlepiej spisał się klasyfikator 'Random Forest'. Sprawdzanie 'accuracy' w tym przypadku nie jest do końca miarodajne, ponieważ zmienna celu jest dość niezbalansowana. Według mnie algorytm najlepiej ocenia w tym wypadku 'F1'. Z wynikiem około 0.6 nie jest to może klasyfikator idealny, ale wciąż lepszy od palca na wietrze ;)