In [7]:
# Dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

from re import X

In [8]:
#  Import and read the rainfall.csv
rainfall_df = pd.read_csv('data/rainfall.csv')
rainfall_df.head()

Unnamed: 0,Bureau of Meteorology station number,Location,Latitude,Longitude,Product code,Year,Month,Day,Rainfall amount (millimetres),Period over which rainfall was measured (days),Quality,Date,Rainfall category
0,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,1,0.0,0.0,Y,2012-01-01,0
1,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,2,0.0,0.0,Y,2012-01-02,0
2,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,3,0.0,0.0,Y,2012-01-03,0
3,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,4,4.2,1.0,Y,2012-01-04,1
4,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,5,1.0,1.0,Y,2012-01-05,0


In [9]:
# Convert categorical data to numeric with `pd.get_dummies`
rainfall_df = pd.get_dummies(rainfall_df,dtype=float)
rainfall_df.head()

Unnamed: 0,Bureau of Meteorology station number,Latitude,Longitude,Year,Month,Day,Rainfall amount (millimetres),Period over which rainfall was measured (days),Rainfall category,Location_Dandenong,...,Date_2023-09-02,Date_2023-09-03,Date_2023-09-04,Date_2023-09-05,Date_2023-09-06,Date_2023-09-07,Date_2023-09-08,Date_2023-09-09,Date_2023-09-10,Date_2023-09-11
0,86072,-37.88,145.42,2012,1,1,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,86072,-37.88,145.42,2012,1,2,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,86072,-37.88,145.42,2012,1,3,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,86072,-37.88,145.42,2012,1,4,4.2,1.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,86072,-37.88,145.42,2012,1,5,1.0,1.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Split our preprocessed data into our features and target arrays
y = rainfall_df['Rainfall category']
X = rainfall_df.drop(columns='Rainfall category')

In [11]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [12]:
# Create a StandardScaler instances
sc = StandardScaler()

X_train2 = pd.DataFrame(sc.fit_transform(X_train))
X_test2 = pd.DataFrame(sc.transform(X_test))
X_train2.columns = X_train.columns.values
X_test2.columns = X_test.columns.values
X_train2.index = X_train.index.values
X_test2.index = X_test.index.values
X_train = X_train2
X_test = X_test2

In [13]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(random_state = 42)
lr_classifier.fit(X_train, y_train)

# Predicting Test Set
y_pred = lr_classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

results = pd.DataFrame([['Logistic Regression', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results)
print('Logistic Regression Training Accuracy : ',
      metrics.accuracy_score(y_train,
                             lr_classifier.predict(X_train))*100)
print('Logistics Regression Validation Accuracy : ',
      metrics.accuracy_score(y_test,
                             lr_classifier.predict(X_test))*100)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.933792   0.907019  0.828476   0.86597
Logistic Regression Training Accuracy :  99.98157021747144
Logistics Regression Validation Accuracy :  93.37919174548581


In [14]:

## SVM (Support Vector Machines) using Support Vector Classifier
from sklearn.svm import SVC
svc_classifier = SVC(random_state = 7, kernel = 'linear')
svc_classifier.fit(X_train, y_train)

# Predicting Test Set
y_pred = svc_classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

model_results = pd.DataFrame([['SVM (Linear)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

results = results.append(model_results, ignore_index = True)
print(results)
print('SVM Training Accuracy : ',
      metrics.accuracy_score(y_train,
                             svc_classifier.predict(X_train))*100)
print('SVM Validation Accuracy : ',
      metrics.accuracy_score(y_test,
                             svc_classifier.predict(X_test))*100)

  results = results.append(model_results, ignore_index = True)


                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.933792   0.907019  0.828476   0.86597
1         SVM (Linear)  0.992906   0.989933  0.982515   0.98621
SVM Training Accuracy :  99.90785108735717
SVM Validation Accuracy :  99.29062768701634


In [15]:
## Randomforest
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state = 10, n_estimators = 100,
                                    criterion = 'entropy')
rf_classifier.fit(X_train, y_train)

# Predicting Test Set
y_pred = rf_classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

rf_classifier_results = pd.DataFrame([['Random Forest (n=100)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

results = results.append(rf_classifier_results, ignore_index = True)
print(results)
print('Random Forest Training Accuracy : ',
      metrics.accuracy_score(y_train,
                             rf_classifier.predict(X_train))*100)
print('Random Forest Validation Accuracy : ',
      metrics.accuracy_score(y_test,
                             rf_classifier.predict(X_test))*100)

  results = results.append(rf_classifier_results, ignore_index = True)


                   Model  Accuracy  Precision    Recall  F1 Score
0    Logistic Regression  0.933792   0.907019  0.828476   0.86597
1           SVM (Linear)  0.992906   0.989933  0.982515   0.98621
2  Random Forest (n=100)  1.000000   1.000000  1.000000   1.00000
Random Forest Training Accuracy :  100.0
Random Forest Validation Accuracy :  100.0
