In [20]:
import pandas as pd 
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import random
import warnings 
warnings.filterwarnings('ignore')

In [9]:
dane = pd.read_csv('australia.csv')
dane = pd.DataFrame(dane)

In [10]:
dane.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,0,0
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,0,0
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,0,0
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,0,0
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,0,0


In [11]:
dane["RainTomorrow"].value_counts()

0    43993
1    12427
Name: RainTomorrow, dtype: int64

W przypadku prognozy pogody ważne będzie, żeby przewidywać te dni, w których będzie padać. Należy znaleźć je i poprawnie zidentyfikować, dlatego dobrymi miarami byłyby precision i recall.

# Podział zbioru

In [16]:
X = dane.drop(["RainTomorrow"], axis=1)
y = dane[["RainTomorrow"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2282283)
print(X.shape, X_train.shape, X_test.shape)

(56420, 17) (50778, 17) (5642, 17)


# Las losowy

In [23]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=1000,
                                  max_depth=10,
                                  max_features = 5,
                                  random_state=22,
                                  n_jobs = -1)
model_rf.fit(X_train, y_train)
y_predicted = model_rf.predict(X_test)
print("Precision: ", precision_score(y_test, y_predicted))
print("Recall: ", recall_score(y_test, y_predicted))
print("ROC AUC: ", roc_auc_score(y_test, y_predicted))

Precision:  0.7642857142857142
Recall:  0.5019546520719312
ROC AUC:  0.728286516959642


# Nearest Centroid Classifier

In [24]:
from sklearn.neighbors import NearestCentroid
model_nc = NearestCentroid(metric = 'cosine')
model_nc.fit(X_train, y_train)
y_predicted = model_nc.predict(X_test)
print("Precision: ", precision_score(y_test, y_predicted))
print("Recall: ", recall_score(y_test, y_predicted))
print("ROC AUC: ", roc_auc_score(y_test, y_predicted))

Precision:  0.45723684210526316
Recall:  0.7607505863956215
ROC AUC:  0.7480122402525896


# Gaussian Naive Bayes

In [26]:
from sklearn.svm import SVC
model_svm = SVC(kernel = 'poly', degree = 8)
model_svm.fit(X_train, y_train)
y_predicted = model_svm.predict(X_test)
print("Precision: ", precision_score(y_test, y_predicted))
print("Recall: ", recall_score(y_test, y_predicted))
print("ROC AUC: ", roc_auc_score(y_test, y_predicted))

Precision:  0.7644171779141105
Recall:  0.4870992963252541
ROC AUC:  0.7215464393613436


# Podsumowanie

Wszystkie modele miały podobną marę ROC AUC, NC miał najlepszy recall, a Random Forest najlepsze Precision. Trudnojednoznacznie określić, któy jest lepszy, ale osobiście wybrałbym NC, ponieważ (wg mnie) lepiej jest wziąć parasolkę bądź kurtkę niepotrzebnie, niż dać się zaskoczyć deszczową pogodą, a NC poprawnie zidentyfikował około 76% deszczowych dni ze zbioru testowego, w porównaniu do około 50% dla GNB i RF.