## Import Libraries

In [43]:
import warnings
# Install a blanket "ignore" filter so no Python warnings are shown in cell output.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# later in the notebook — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [44]:
import pandas as pd
import numpy as np
import datetime as dt

from sqlalchemy import create_engine
import psycopg2

from config import db_password

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Dataset Import from Postgres

In [45]:
# Build the connection URL for the local Postgres database.
# The password is percent-encoded first: reserved URL characters in it
# (for example "@", ":", "/" or "%") would otherwise be parsed as part of the
# URL structure and the connection would fail or target the wrong host.
from urllib.parse import quote

db_password_encoded = quote(str(db_password), safe="")
db_string = f"postgresql://postgres:{db_password_encoded}@127.0.0.1:5432/Rainfall_In_Australia"
engine = create_engine(db_string)

# Read the full "weather_aus_subset" table into a DataFrame and preview it
df = pd.read_sql_query('select * from "weather_aus_subset"', con=engine)
df.head()

Unnamed: 0,date,location,mintemp,maxtemp,rainfall,humidity9am,humidity3pm,pressure9am,pressure3pm,temp9am,temp3pm,raintoday,raintomorrow
0,2009-01-01,Cobar,17.9,35.2,0.0,20.0,13.0,1006.3,1004.4,26.6,33.4,No,No
1,2009-01-02,Cobar,18.4,28.9,0.0,30.0,8.0,1012.9,1012.1,20.3,27.0,No,No
2,2009-01-04,Cobar,19.4,37.6,0.0,42.0,22.0,1012.3,1009.2,28.7,34.9,No,No
3,2009-01-05,Cobar,21.9,38.4,0.0,37.0,22.0,1012.7,1009.1,29.1,35.6,No,No
4,2009-01-06,Cobar,24.2,41.0,0.0,19.0,15.0,1010.7,1007.4,33.6,37.6,No,No


## Split the "date" column into year, month and day to prepare it for use in the machine learning models

In [46]:
# Date column handling:
# parse 'date' as datetimes, derive numeric Year / Month / Day columns from it,
# then drop the original 'date' column.
df = (
    df
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        Year=lambda frame: frame['date'].dt.year,
        Month=lambda frame: frame['date'].dt.month,
        Day=lambda frame: frame['date'].dt.day,
    )
    .drop(columns='date')
)
df.head()

Unnamed: 0,location,mintemp,maxtemp,rainfall,humidity9am,humidity3pm,pressure9am,pressure3pm,temp9am,temp3pm,raintoday,raintomorrow,Year,Month,Day
0,Cobar,17.9,35.2,0.0,20.0,13.0,1006.3,1004.4,26.6,33.4,No,No,2009,1,1
1,Cobar,18.4,28.9,0.0,30.0,8.0,1012.9,1012.1,20.3,27.0,No,No,2009,1,2
2,Cobar,19.4,37.6,0.0,42.0,22.0,1012.3,1009.2,28.7,34.9,No,No,2009,1,4
3,Cobar,21.9,38.4,0.0,37.0,22.0,1012.7,1009.1,29.1,35.6,No,No,2009,1,5
4,Cobar,24.2,41.0,0.0,19.0,15.0,1010.7,1007.4,33.6,37.6,No,No,2009,1,6


## Update "RainTomorrow" and "RainToday" Columns to Binary Values

In [47]:
# Count the rows for each distinct 'raintomorrow' label
df['raintomorrow'].value_counts()

No     43993
Yes    12427
Name: raintomorrow, dtype: int64

In [48]:
# Count the rows for each distinct 'raintoday' label
df['raintoday'].value_counts()

No     43958
Yes    12462
Name: raintoday, dtype: int64

In [49]:
# Recode the label as an integer flag: 'Yes' -> 1, any other value -> 0
df['raintomorrow'] = (df['raintomorrow'] == 'Yes') * 1
df['raintomorrow'].value_counts()

0    43993
1    12427
Name: raintomorrow, dtype: int64

In [50]:
# Recode the column as an integer flag: 'Yes' -> 1, any other value -> 0
df['raintoday'] = (df['raintoday'] == 'Yes') * 1
df['raintoday'].value_counts()

0    43958
1    12462
Name: raintoday, dtype: int64

In [51]:
# Show the dtype of every column to see which ones are still non-numeric
df.dtypes

location         object
mintemp         float64
maxtemp         float64
rainfall        float64
humidity9am     float64
humidity3pm     float64
pressure9am     float64
pressure3pm     float64
temp9am         float64
temp3pm         float64
raintoday         int32
raintomorrow      int32
Year              int64
Month             int64
Day               int64
dtype: object

In [52]:
# Print the (rows, columns) shape, then display the first rows of the frame
print(df.shape)
df.head()

(56420, 15)


Unnamed: 0,location,mintemp,maxtemp,rainfall,humidity9am,humidity3pm,pressure9am,pressure3pm,temp9am,temp3pm,raintoday,raintomorrow,Year,Month,Day
0,Cobar,17.9,35.2,0.0,20.0,13.0,1006.3,1004.4,26.6,33.4,0,0,2009,1,1
1,Cobar,18.4,28.9,0.0,30.0,8.0,1012.9,1012.1,20.3,27.0,0,0,2009,1,2
2,Cobar,19.4,37.6,0.0,42.0,22.0,1012.3,1009.2,28.7,34.9,0,0,2009,1,4
3,Cobar,21.9,38.4,0.0,37.0,22.0,1012.7,1009.1,29.1,35.6,0,0,2009,1,5
4,Cobar,24.2,41.0,0.0,19.0,15.0,1010.7,1007.4,33.6,37.6,0,0,2009,1,6


## Encode Columns - Use OneHotEncoder

In [53]:
# Collect the names of the object-dtype columns; these are treated as the
# categorical variables to encode
df_cat = [column for column, dtype in df.dtypes.items() if dtype == "object"]
df_cat

['location']

In [54]:
# Number of distinct values in each categorical column
df[df_cat].nunique()

location    26
dtype: int64

In [55]:
# Create the OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder
# Use the encoder's default (sparse) output and densify it below. The `sparse`
# keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so passing neither keeps this cell working on old and new versions alike.
enc = OneHotEncoder()

# Fit the encoder and produce the encoded DataFrame. Reusing df's index keeps
# the encoded rows aligned with df for the index-based merge that follows.
encode_df = pd.DataFrame(
    enc.fit_transform(df[df_cat]).toarray(),
    index=df.index,
)

# Rename encoded columns. `get_feature_names` was removed in scikit-learn 1.2;
# prefer `get_feature_names_out` and fall back only on versions that lack it.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(df_cat)
else:
    encode_df.columns = enc.get_feature_names(df_cat)
encode_df.head()

Unnamed: 0,location_AliceSprings,location_Brisbane,location_Cairns,location_Canberra,location_Cobar,location_CoffsHarbour,location_Darwin,location_Hobart,location_Melbourne,location_MelbourneAirport,...,location_PerthAirport,location_Portland,location_Sale,location_Sydney,location_SydneyAirport,location_Townsville,location_WaggaWagga,location_Watsonia,location_Williamtown,location_Woomera
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# Attach the one-hot columns to df by row index, then remove the original
# categorical columns they were derived from
df = pd.merge(df, encode_df, left_index=True, right_index=True).drop(columns=df_cat)
df.head()

Unnamed: 0,mintemp,maxtemp,rainfall,humidity9am,humidity3pm,pressure9am,pressure3pm,temp9am,temp3pm,raintoday,...,location_PerthAirport,location_Portland,location_Sale,location_Sydney,location_SydneyAirport,location_Townsville,location_WaggaWagga,location_Watsonia,location_Williamtown,location_Woomera
0,17.9,35.2,0.0,20.0,13.0,1006.3,1004.4,26.6,33.4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,18.4,28.9,0.0,30.0,8.0,1012.9,1012.1,20.3,27.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19.4,37.6,0.0,42.0,22.0,1012.3,1009.2,28.7,34.9,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,21.9,38.4,0.0,37.0,22.0,1012.7,1009.1,29.1,35.6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,24.2,41.0,0.0,19.0,15.0,1010.7,1007.4,33.6,37.6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# List the column labels after encoding
df.columns

Index(['mintemp', 'maxtemp', 'rainfall', 'humidity9am', 'humidity3pm',
       'pressure9am', 'pressure3pm', 'temp9am', 'temp3pm', 'raintoday',
       'raintomorrow', 'Year', 'Month', 'Day', 'location_AliceSprings',
       'location_Brisbane', 'location_Cairns', 'location_Canberra',
       'location_Cobar', 'location_CoffsHarbour', 'location_Darwin',
       'location_Hobart', 'location_Melbourne', 'location_MelbourneAirport',
       'location_Mildura', 'location_Moree', 'location_MountGambier',
       'location_NorfolkIsland', 'location_Nuriootpa', 'location_Perth',
       'location_PerthAirport', 'location_Portland', 'location_Sale',
       'location_Sydney', 'location_SydneyAirport', 'location_Townsville',
       'location_WaggaWagga', 'location_Watsonia', 'location_Williamtown',
       'location_Woomera'],
      dtype='object')

## Variable Declaration and Split into Test/Train

In [58]:
# Target is 'raintomorrow'; every other column is used as a feature
y = df["raintomorrow"]
X = df.drop(columns="raintomorrow")

# Split into training/test sets, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

## Scale the Data

In [59]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the training features only. fit() returns the scaler
# itself, so `scaler` and `X_scaler` name the same fitted object.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Transform both splits with the statistics learned from the training split
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

## Define Machine Learning Model
### We want to evaluate the following three models to find the best fit:
- Random forest
- Gradient boosting machine (GBM)
- Logistic regression

## Model Validation
- Accuracy Report
- Confusion Matrix
- Classification Report