In [25]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pycaret.classification import setup, compare_models, create_model, predict_model

In [30]:
df = pd.read_csv("weatherAUS.csv")

In [32]:
df.shape

(145460, 23)

In [33]:
df = df.dropna(how="any")

In [34]:
df.shape

(56420, 23)

In [35]:
df["RainToday"] = pd.get_dummies(df["RainToday"], drop_first=True)
df["RainTomorrow"] = pd.get_dummies(df["RainTomorrow"], drop_first=True)

In [37]:
def label_encoder(column):
    le = LabelEncoder().fit(column)
    print(column.name, le.classes_)
    return le.transform(column)

In [38]:
df.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [39]:
cols = ["Location", "WindGustDir", "WindDir9am", "WindDir3pm"]

In [40]:
for col in cols:
    df[col] = label_encoder(df[col])

Location ['AliceSprings' 'Brisbane' 'Cairns' 'Canberra' 'Cobar' 'CoffsHarbour'
 'Darwin' 'Hobart' 'Melbourne' 'MelbourneAirport' 'Mildura' 'Moree'
 'MountGambier' 'NorfolkIsland' 'Nuriootpa' 'Perth' 'PerthAirport'
 'Portland' 'Sale' 'Sydney' 'SydneyAirport' 'Townsville' 'WaggaWagga'
 'Watsonia' 'Williamtown' 'Woomera']
WindGustDir ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']
WindDir9am ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']
WindDir3pm ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']


In [41]:
df = df.drop(["Date"], axis=1)

In [42]:
X = df.drop(["RainTomorrow"], axis=1)
y = df["RainTomorrow"]

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4242)

In [44]:
model = setup(data=df, target="RainTomorrow")

Unnamed: 0,Description,Value
0,Session id,701
1,Target,RainTomorrow
2,Target type,Binary
3,Original data shape,"(56420, 22)"
4,Transformed data shape,"(56420, 22)"
5,Transformed train set shape,"(39494, 22)"
6,Transformed test set shape,"(16926, 22)"
7,Numeric features,21
8,Preprocess,True
9,Imputation type,simple


In [45]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8662,0.9036,0.5698,0.7626,0.6522,0.5715,0.5808,2.455
lightgbm,Light Gradient Boosting Machine,0.8648,0.8998,0.5685,0.7572,0.6492,0.5676,0.5766,0.114
xgboost,Extreme Gradient Boosting,0.864,0.8986,0.5802,0.746,0.6526,0.5697,0.5767,0.799
rf,Random Forest Classifier,0.862,0.8938,0.5247,0.7767,0.6261,0.5454,0.5612,0.877
et,Extra Trees Classifier,0.86,0.8956,0.5127,0.7756,0.6171,0.5358,0.5531,0.514
gbc,Gradient Boosting Classifier,0.8566,0.8885,0.5293,0.7462,0.6192,0.534,0.546,1.544
ridge,Ridge Classifier,0.8525,0.0,0.4868,0.7572,0.5924,0.5076,0.5261,0.029
lr,Logistic Regression,0.8522,0.8799,0.5248,0.7283,0.6099,0.5217,0.5324,0.419
lda,Linear Discriminant Analysis,0.8519,0.8815,0.5439,0.7154,0.6178,0.528,0.5357,0.04
ada,Ada Boost Classifier,0.8501,0.8783,0.5179,0.723,0.6033,0.514,0.525,0.343


<catboost.core.CatBoostClassifier at 0x29112ebe0>

In [46]:
catb_model = create_model("catboost")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8681,0.904,0.5839,0.7616,0.661,0.5809,0.5888
1,0.8635,0.9009,0.5655,0.7534,0.6461,0.5637,0.5726
2,0.8671,0.903,0.5586,0.7751,0.6493,0.57,0.5816
3,0.8635,0.8993,0.5782,0.7452,0.6511,0.568,0.5751
4,0.8615,0.8973,0.5535,0.7516,0.6375,0.5543,0.5642
5,0.8696,0.9008,0.5839,0.7685,0.6636,0.5846,0.5931
6,0.865,0.9094,0.5506,0.7713,0.6425,0.5622,0.5743
7,0.8676,0.9083,0.5793,0.7625,0.6584,0.5781,0.5865
8,0.8668,0.9074,0.5621,0.7713,0.6503,0.5705,0.5814
9,0.8688,0.9054,0.5828,0.7659,0.6619,0.5824,0.5907


In [47]:
y_pred = predict_model(catb_model, data=X_test)

In [48]:
y_pred

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,prediction_label,prediction_score
66323,9,12.000000,23.000000,0.0,8.0,9.0,8,52.0,12,11,...,40.0,1012.000000,1011.400024,7.0,7.0,16.600000,22.000000,0,0,0.9443
95395,21,22.200001,32.400002,0.0,8.6,12.1,1,46.0,11,1,...,59.0,1008.000000,1005.099976,1.0,3.0,30.000000,31.400000,0,0,0.7843
105904,25,14.100000,27.700001,0.0,14.6,5.0,15,43.0,0,14,...,17.0,1012.599976,1011.000000,3.0,7.0,19.700001,26.299999,0,0,0.9322
13812,11,20.299999,30.799999,0.0,14.2,2.3,1,52.0,1,5,...,31.0,1019.400024,1016.599976,7.0,7.0,22.100000,29.400000,0,0,0.8157
118259,16,7.000000,28.400000,0.0,5.6,10.6,3,35.0,5,15,...,16.0,1026.000000,1023.099976,0.0,1.0,20.700001,26.600000,0,0,0.9983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36957,22,18.799999,22.600000,4.0,6.0,0.8,4,33.0,1,1,...,95.0,1009.000000,1006.799988,8.0,8.0,18.900000,20.000000,1,1,0.9857
79225,23,0.500000,14.500000,0.0,2.0,3.2,3,59.0,4,5,...,63.0,1018.299988,1010.400024,7.0,7.0,6.600000,12.300000,0,1,0.7931
62018,18,6.300000,17.799999,0.0,1.6,4.9,14,28.0,6,15,...,58.0,1031.099976,1028.400024,3.0,5.0,10.500000,17.700001,0,0,0.9671
13304,11,2.100000,20.299999,0.0,2.2,10.2,3,30.0,4,3,...,26.0,1020.500000,1015.299988,0.0,1.0,9.900000,19.900000,0,0,0.9714
