In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
import numpy as np
import pandas as pd
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Data Loading / Preprocessing - For the mock up, the model will use a subset of the dataset choosen

In [9]:
# Load the data

file_path = 'Resources/weatherAUS_Sample_AF.csv'
raw_df = pd.read_csv(file_path)
raw_df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,1/1/2015,Albury,11.4,33.5,0.0,,,WSW,30.0,ESE,...,45.0,14.0,1013.5,1011.0,,,21.0,32.7,No,No
1,1/2/2015,Albury,15.5,39.6,0.0,,,NE,56.0,ESE,...,45.0,12.0,1016.0,1012.4,,,25.6,38.2,No,No
2,1/3/2015,Albury,17.1,38.3,0.0,,,NNE,48.0,NE,...,35.0,19.0,1017.9,1012.3,,,29.2,37.0,No,No
3,1/4/2015,Albury,26.0,33.1,0.0,,,NNE,41.0,ESE,...,46.0,37.0,1013.6,1012.1,8.0,5.0,27.4,30.9,No,No
4,1/5/2015,Albury,19.0,35.2,0.0,,,E,33.0,SSE,...,60.0,34.0,1017.4,1014.7,8.0,,25.6,32.5,No,No


In [10]:
# Check Datatypes

raw_df.dtypes

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object

## Split "Date" column to year, month and day to prepare for use in regression model

In [11]:
# Date column handling

df = raw_df.copy()

df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df.head()


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
0,2015-01-01,Albury,11.4,33.5,0.0,,,WSW,30.0,ESE,...,1011.0,,,21.0,32.7,No,No,2015,1,1
1,2015-01-02,Albury,15.5,39.6,0.0,,,NE,56.0,ESE,...,1012.4,,,25.6,38.2,No,No,2015,1,2
2,2015-01-03,Albury,17.1,38.3,0.0,,,NNE,48.0,NE,...,1012.3,,,29.2,37.0,No,No,2015,1,3
3,2015-01-04,Albury,26.0,33.1,0.0,,,NNE,41.0,ESE,...,1012.1,8.0,5.0,27.4,30.9,No,No,2015,1,4
4,2015-01-05,Albury,19.0,35.2,0.0,,,E,33.0,SSE,...,1014.7,8.0,,25.6,32.5,No,No,2015,1,5


## Update "RainTomorrow" and "RainToday" Columns to Binary Values

In [12]:
df['RainTomorrow'].value_counts()

No     33611
Yes     9594
Name: RainTomorrow, dtype: int64

In [13]:
df['RainToday'].value_counts()

No     33618
Yes     9594
Name: RainToday, dtype: int64

In [14]:
df['RainTomorrow'] = df.RainTomorrow.eq('Yes').mul(1)
df['RainTomorrow'].value_counts()

0    34848
1     9594
Name: RainTomorrow, dtype: int64

In [15]:
df['RainToday'] = df.RainToday.eq('Yes').mul(1)
df['RainToday'].value_counts()

0    34848
1     9594
Name: RainToday, dtype: int64

In [16]:
df.dtypes

Date             datetime64[ns]
Location                 object
MinTemp                 float64
MaxTemp                 float64
Rainfall                float64
Evaporation             float64
Sunshine                float64
WindGustDir              object
WindGustSpeed           float64
WindDir9am               object
WindDir3pm               object
WindSpeed9am            float64
WindSpeed3pm            float64
Humidity9am             float64
Humidity3pm             float64
Pressure9am             float64
Pressure3pm             float64
Cloud9am                float64
Cloud3pm                float64
Temp9am                 float64
Temp3pm                 float64
RainToday                 int32
RainTomorrow              int32
Year                      int64
Month                     int64
Day                       int64
dtype: object

## Drop Unnecessary Columns / Handle Missing Data

In [17]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

columns = ['Date', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm'
        , 'Cloud9am', 'Cloud3pm']
df = df.drop(columns, axis=1)

In [18]:
print(df.shape)
df.head()

(12610, 17)


Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
3632,Moree,22.1,29.4,44.6,43.6,0.8,67.0,46.0,1015.9,1013.6,24.0,28.9,1,0,2015,1,5
3633,Moree,20.7,33.4,0.0,6.0,11.3,54.0,32.0,1017.4,1015.3,25.8,31.8,0,0,2015,1,6
3634,Moree,20.0,33.7,0.0,9.8,13.5,50.0,29.0,1018.1,1014.0,25.0,32.6,0,0,2015,1,7
3638,Moree,25.3,31.8,0.0,35.0,0.0,56.0,37.0,1011.2,1007.6,26.1,31.0,0,1,2015,1,11
3639,Moree,20.7,29.6,2.4,5.8,0.5,89.0,54.0,1010.0,1007.3,21.7,28.7,1,1,2015,1,12


## Encode Columns - Use OneHotEncoder

In [19]:
# Generate our categorical variable list
df_cat = df.dtypes[df.dtypes == "object"].index.tolist()
df_cat

['Location']

In [20]:
df[df_cat].nunique()

Location    22
dtype: int64

In [21]:
# Create the OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(df[df_cat]))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(df_cat)
encode_df.head()


Unnamed: 0,Location_AliceSprings,Location_Brisbane,Location_Cairns,Location_Darwin,Location_Hobart,Location_Melbourne,Location_MelbourneAirport,Location_Mildura,Location_Moree,Location_MountGambier,...,Location_Perth,Location_PerthAirport,Location_Portland,Location_Sydney,Location_SydneyAirport,Location_Townsville,Location_WaggaWagga,Location_Watsonia,Location_Williamtown,Location_Woomera
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(df_cat, axis=1)
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,...,Location_Perth,Location_PerthAirport,Location_Portland,Location_Sydney,Location_SydneyAirport,Location_Townsville,Location_WaggaWagga,Location_Watsonia,Location_Williamtown,Location_Woomera
3632,22.1,29.4,44.6,43.6,0.8,67.0,46.0,1015.9,1013.6,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3633,20.7,33.4,0.0,6.0,11.3,54.0,32.0,1017.4,1015.3,25.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3634,20.0,33.7,0.0,9.8,13.5,50.0,29.0,1018.1,1014.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3638,25.3,31.8,0.0,35.0,0.0,56.0,37.0,1011.2,1007.6,26.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3639,20.7,29.6,2.4,5.8,0.5,89.0,54.0,1010.0,1007.3,21.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow', 'Year', 'Month', 'Day',
       'Location_AliceSprings', 'Location_Brisbane', 'Location_Cairns',
       'Location_Darwin', 'Location_Hobart', 'Location_Melbourne',
       'Location_MelbourneAirport', 'Location_Mildura', 'Location_Moree',
       'Location_MountGambier', 'Location_NorfolkIsland', 'Location_Nuriootpa',
       'Location_Perth', 'Location_PerthAirport', 'Location_Portland',
       'Location_Sydney', 'Location_SydneyAirport', 'Location_Townsville',
       'Location_WaggaWagga', 'Location_Watsonia', 'Location_Williamtown',
       'Location_Woomera'],
      dtype='object')

## Variable Declaration and Split into Test/Train

In [24]:
# define Target Variable
X = df.copy()
X = X.drop("RainTomorrow", axis=1)
y = df["RainTomorrow"]

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

## Scale the Data

In [25]:
from sklearn.preprocessing import StandardScaler

# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Define Machine Learning Model
### We want to evaluate the following three models to find the best fit:
- Random forest
- GBM (Boosting)
- Logistic
####  Balanced Random Forest mockup below

In [32]:
# Create a random forest classifier.

#from sklearn.ensemble import RandomForestClassifier
#rf_model = RandomForestClassifier(n_estimators=128, random_state=42) 

from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=42) 

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
y_pred = rf_model.predict(X_test_scaled)



## Model Validation
- Accuracy Report
- Confusion Matrix
- Classification Report



In [33]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,462,109
Actual 1,41,143


In [34]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
balanced_accuracy_score(y_test, y_pred)

0.7931403715830352