In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

from data_preprocessing import preprocess_data

# Never truncate wide DataFrames when they are rendered: show every column.
pd.options.display.max_columns = None

In [2]:
# Read the raw weather CSV and preview its first five rows.
raw_df = pd.read_csv(filepath_or_buffer='./data/weatherAUS.csv')
raw_df.head(n=5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [3]:
# Preprocess the data: unpack the nine values returned by preprocess_data,
# one name per line so the return order is easy to audit.
(
    data,
    imputer,
    scaler,
    encoder,
    label_encoder,
    numeric_cols,
    categorical_cols,
    input_cols,
    target_col,
) = preprocess_data(raw_df)

In [4]:
# Pull the inputs and targets of each split out of the `data` mapping.
X_train, train_targets = data['train_X'], data['train_y']
X_val, val_targets = data['val_X'], data['val_y']
X_test, test_targets = data['test_X'], data['test_y']

In [5]:
# Candidate Random Forest hyper-parameters for the randomized search
params_rf = dict(
    max_depth=[10, 15, 20],
    n_estimators=[100, 200],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 3-fold stratified cross-validation; shuffling is seeded so the folds are reproducible.
cv_strategy = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=3,
)

In [6]:
# Define the hyper-parameter search.
# NOTE: params_rf spans 3 * 2 * 2 * 2 = 24 combinations, so n_iter=24 makes this
# "randomized" search visit every candidate exactly once (list-valued parameters
# are sampled without replacement). That is 24 candidates x 3 folds = 72 fits.
rf_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    params_rf,
    n_iter=24,
    cv=cv_strategy,
    scoring="f1",
    random_state=42,
    refit=True,  # best_estimator_ is retrained on the full training set after the search
    n_jobs=-1,   # the fits are independent and seeded: run them on all cores, same results
)

In [7]:
# Run the cross-validated search on the training data
rf_search.fit(X=X_train, y=train_targets)

In [8]:
# Hyper-parameter combination that achieved the best mean cross-validated F1 score
rf_search.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_depth': 20}

In [9]:
# Get the best model: the estimator built with best_params_ which, because the
# search was created with refit=True, is already fitted on the full training data
rf = rf_search.best_estimator_

In [10]:
def calculate_metrics(model, inputs, targets):
    """Print the F1 and accuracy scores of ``model`` on one dataset.

    Args:
        model: Fitted estimator exposing ``predict(inputs)``.
        inputs: Feature matrix passed to ``model.predict``.
        targets: True labels compared against the predictions.

    Returns:
        None. Both scores are printed, rounded to two decimals.
    """
    predictions = model.predict(inputs)
    # (output template, score) pairs, evaluated and printed in this order.
    report = (
        ("F1 score: {:.2f}", f1_score(targets, predictions)),
        ("Accuracy score: {:.2f}", accuracy_score(targets, predictions)),
    )
    for template, score in report:
        print(template.format(score))

In [11]:
# rf is already fitted on the full training data: the search was created with
# refit=True, so calling rf.fit(...) again would only repeat the same seeded
# (hence identical) training run. Go straight to the metrics, labelling each
# split so the printed scores can be told apart.
print("Training set:")
calculate_metrics(rf, X_train, train_targets)
print("Validation set:")
calculate_metrics(rf, X_val, val_targets)

F1 score: 0.82
Accuracy score: 0.93
F1 score: 0.55
Accuracy score: 0.86


In [12]:
# Bundle the fitted model with the preprocessing objects and column lists so
# everything required to prepare new inputs is stored in a single artefact.
aussie_rain = {
    'model': rf,
    'imputer': imputer,
    'scaler': scaler,
    'encoder': encoder,
    # Returned by preprocess_data but previously left out of the bundle.
    # NOTE(review): presumably the encoder fitted on the target column, needed
    # to map predictions back to the original labels - confirm.
    'label_encoder': label_encoder,
    'input_cols': input_cols,
    'target_col': target_col,
    'numeric_cols': numeric_cols,
    'categorical_cols': categorical_cols
}

In [None]:
# Save the random forest model together with its preprocessing artefacts.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (otherwise a fresh checkout fails here).
model_path = Path('models/aussie_rain.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(aussie_rain, model_path)