In [1]:
import pandas as pd
import numpy as np

# Load the weather dataset; the path is relative to the notebook's directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column that looks like
# a saved row index. It is kept as an ordinary column here - confirm whether
# the file should be read with index_col=0 instead.
dane = pd.read_csv(filepath_or_buffer='../../australia.csv')
dane.head(n=5)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,0,0
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,0,0
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,0,0
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,0,0
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,0,0


In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

def scoring(y_test, y_predicted, y_score=None):
    """Print accuracy, precision, recall, F1 and ROC AUC for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_predicted : array-like
        Predicted class labels; used for accuracy, precision, recall and F1.
    y_score : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. When given, AUC is computed from
        these scores. When omitted, AUC falls back to ``y_predicted``, which
        evaluates the ROC curve at a single operating point only.

    Returns
    -------
    None
        The metrics are written to standard output.
    """
    # AUC is a ranking metric, so prefer real scores whenever they are supplied.
    auc_input = y_predicted if y_score is None else y_score

    print("ACC = ", accuracy_score(y_test, y_predicted))
    print("PREC = ", precision_score(y_test, y_predicted))
    print("RECALL = ", recall_score(y_test, y_predicted))
    print("F1 = ", f1_score(y_test, y_predicted))
    print("AUC = ", roc_auc_score(y_test, auc_input))

def extract_y(data, target="RainTomorrow"):
    """Split a DataFrame into features and the target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the ``target`` column. It is not modified.
    target : str, default "RainTomorrow"
        Name of the column to separate out as the label.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without the target column and ``y``
        is a single-column DataFrame holding the target.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    # Double brackets keep y as a one-column DataFrame, not a Series.
    y = data[[target]]
    return data.drop(columns=[target]), y

In [3]:
from sklearn.model_selection import train_test_split

# Separate the features from the target column.
X, y = extract_y(dane)

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
split = train_test_split(X, y, test_size=0.2, random_state=2137)
X_train, X_test, y_train, y_test = split

In [10]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

# max_iter - upper bound on the number of solver iterations
# solver - optimisation algorithm used to fit the model
# random_state - 'saga' shuffles the data, so a fixed seed makes the fit
#                repeatable when the cell is re-run
# NOTE(review): the features are passed in unscaled; 'saga' is documented to
# converge quickly only on features with similar scale - consider
# standardising them first.
lr = LogisticRegression(max_iter=1000, solver='saga', random_state=2137)

lr.fit(X_train, y_train.values.ravel())
y_predicted = lr.predict(X_test)
scoring(y_test, y_predicted)

ACC =  0.8524459411556186
PREC =  0.7268184342032205
RECALL =  0.5273972602739726
F1 =  0.6112537940695775
AUC =  0.7357504365446209




In [12]:
# Gaussian naive Bayes
from sklearn.naive_bayes import GaussianNB

# var_smoothing - the variances are increased by this fraction of the largest
# variance
nb = GaussianNB(var_smoothing=0.1).fit(X_train, y_train.values.ravel())

y_predicted = nb.predict(X_test)
scoring(y_test, y_predicted)

ACC =  0.8383551931939028
PREC =  0.6717118997912317
RECALL =  0.5185334407735697
F1 =  0.5852660300136425
AUC =  0.7235362045949194


In [13]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

# splitter - strategy used to choose the split at each node
# random_state - splitter="random" makes the tree depend on random draws, so a
#                fixed seed is needed for the metrics to be repeatable
dc = DecisionTreeClassifier(splitter="random", random_state=2137)

# Flatten y to 1-D, matching how the other models in this notebook are fitted.
dc.fit(X_train, y_train.values.ravel())
y_predicted = dc.predict(X_test)
scoring(y_test, y_predicted)

ACC =  0.7925381070542361
PREC =  0.5272516428295323
RECALL =  0.5495568090249798
F1 =  0.5381732097060564
AUC =  0.7053055574322808


Niemal we wszystkich miarach przoduje regresja logistyczna (wyjątkiem jest RECALL, w którym lepsze okazało się drzewo decyzyjne), więc w tym przypadku jest to najlepszy klasyfikator. Drzewo decyzyjne poradziło sobie najgorzej pod względem wszystkich miar poza RECALL.