# **Import Libraries**

In [1]:
import time
import os
import zipfile
import math
import warnings

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error
from plotly.subplots import make_subplots

# **Functions**

In [3]:
# Install a blanket 'ignore' filter: every warning raised from this cell onward is suppressed.
# NOTE(review): this also hides deprecation notices from the libraries used below;
# consider narrowing the filter to a specific category or module.
warnings.filterwarnings('ignore')

### **Outlier Remover**

In [4]:
def remove_outliers(data, columns):
    """Drop rows whose value in any of ``columns`` lies outside the Tukey fences.

    Columns are processed in the order given. For each column the quartiles
    and the IQR are computed on the rows that survived the previous columns,
    and rows with a value strictly below ``Q1 - 1.5 * IQR`` or strictly above
    ``Q3 + 1.5 * IQR`` are dropped. Values lying exactly on a fence are kept,
    so a column whose IQR is 0 keeps the rows equal to its quartile value.
    Rows with a missing value in a checked column are not outliers and are
    kept; handle missing data separately.

    The IQR of every column and the total number of dropped rows are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is never modified in place.
    columns : iterable
        Labels of the numeric columns to check.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``data``; when ``columns`` is empty, ``data``
        itself is returned unchanged.
    """
    removed_total = 0
    for column in columns:
        values = data[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1

        print("IQR value for column %s is: %s" % (column, IQR))

        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR

        # Strict comparisons: a value sitting on a fence is still inside it.
        # NaN compares False on both sides, so missing values are not flagged.
        is_outlier = (values < lower_limit) | (values > upper_limit)
        removed_total += int(is_outlier.sum())

        # Rows are removed (not clipped), so the next column's quartiles are
        # computed on the already-filtered frame.
        data = data.loc[~is_outlier].copy()

    print("Total outliers removed across all columns:", removed_total)
    return data

# **Data Preparation**

### **Load Data**

In [5]:
# Banjarmasin comparison file (path under /content).
data_bjm = "/content/comparebjm.csv"
df_bjm = pd.read_csv(data_bjm)

# Banjarbaru comparison file (path under /content).
data_bjb = "/content/comparebjb.csv"
df_bjb = pd.read_csv(data_bjb)

### **Data Description**

In [6]:
# Summary statistics of the raw Banjarmasin data, one row per numeric column.
df_bjm.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rainfall (BMKG),197.0,10.652792,16.820589,0.0,0.1,3.5,14.2,110.2
Rainfall (GSMap),243.0,9.249136,13.389517,0.0,0.0,4.23,13.86,86.14


In [7]:
# Summary statistics of the raw Banjarbaru data, one row per numeric column.
df_bjb.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rainfall (BMKG),216.0,9.450463,15.403023,0.0,0.0,2.7,12.55,89.4
Rainfall (GSMap),243.0,8.717325,14.230762,0.0,0.105,2.64,12.775,117.23


### **Check Missing Values**

In [8]:
# Number of missing entries in each Banjarmasin column.
df_bjm.isna().sum()

Date                 0
Rainfall (BMKG)     46
Rainfall (GSMap)     0
dtype: int64

In [9]:
# Number of missing entries in each Banjarbaru column.
df_bjb.isna().sum()

Date                 0
Rainfall (BMKG)     27
Rainfall (GSMap)     0
dtype: int64

### **Handling Missing Values**

In [10]:
# Replace missing BMKG readings with the mean of the observed ones.
# NOTE(review): the statistics and RMSE computed later on this column include
# these imputed values — confirm that is acceptable for the comparison.
bjm_bmkg_mean = df_bjm["Rainfall (BMKG)"].mean()
df_bjm = df_bjm.fillna({"Rainfall (BMKG)": bjm_bmkg_mean})

In [11]:
# Replace missing BMKG readings with the mean of the observed ones.
# NOTE(review): the statistics and RMSE computed later on this column include
# these imputed values — confirm that is acceptable for the comparison.
bjb_bmkg_mean = df_bjb["Rainfall (BMKG)"].mean()
df_bjb = df_bjb.fillna({"Rainfall (BMKG)": bjb_bmkg_mean})

### **Handling Outliers**

In [12]:
# Columns that the outlier check is meant to inspect.
feature_to_check = [
    "Rainfall (BMKG)",
    "Rainfall (GSMap)",
]

In [13]:
# Outlier removal is disabled here (the call is commented out), so the statistics
# below still include the extreme rainfall values.
# NOTE(review): confirm this is deliberate, or delete the dead call.
# df_bjm = remove_outliers(df_bjm, feature_to_check)

In [14]:
# Outlier removal is disabled here (the call is commented out), so the statistics
# below still include the extreme rainfall values.
# NOTE(review): confirm this is deliberate, or delete the dead call.
# df_bjb = remove_outliers(df_bjb, feature_to_check)

### **Data Re-Description (After Filling Missing Values)**

In [15]:
# Summary statistics for Banjarmasin after the missing BMKG values were filled.
df_bjm.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rainfall (BMKG),243.0,10.652792,15.137757,0.0,0.3,7.4,10.876396,110.2
Rainfall (GSMap),243.0,9.249136,13.389517,0.0,0.0,4.23,13.86,86.14


In [16]:
# Summary statistics for Banjarbaru after the missing BMKG values were filled.
df_bjb.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rainfall (BMKG),243.0,9.450463,14.518358,0.0,0.0,3.6,11.1,89.4
Rainfall (GSMap),243.0,8.717325,14.230762,0.0,0.105,2.64,12.775,117.23


### **Change Date Format**

In [17]:
# Parse the Date column into datetime values so it can be grouped by month later.
df_bjm = df_bjm.assign(Date=pd.to_datetime(df_bjm["Date"]))

In [18]:
# Parse the Date column into datetime values so it can be grouped by month later.
df_bjb = df_bjb.assign(Date=pd.to_datetime(df_bjb["Date"]))

# **Data Visualization**

## **Initialization**

### **Table**

In [19]:
# Summary table for Banjarmasin: one row per rainfall source.
bjm_sources = ["Rainfall (BMKG)", "Rainfall (GSMap)"]

# RMSE between the two sources. It is symmetric, so it is computed once and
# repeated on both rows. The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
bjm_rmse = np.sqrt(mean_squared_error(df_bjm["Rainfall (BMKG)"], df_bjm["Rainfall (GSMap)"]))

bjm_table = {
    "Source": bjm_sources,
    "Mean": [df_bjm[source].mean() for source in bjm_sources],
    "Min": [df_bjm[source].min() for source in bjm_sources],
    "Max": [df_bjm[source].max() for source in bjm_sources],
    "StDev": [df_bjm[source].std() for source in bjm_sources],
    "RMSE": [bjm_rmse, bjm_rmse],
}
# Dict insertion order already fixes the column order.
table_bjm = pd.DataFrame(bjm_table)

In [20]:
# Summary table for Banjarbaru: one row per rainfall source.
bjb_sources = ["Rainfall (BMKG)", "Rainfall (GSMap)"]

# RMSE between the two sources. It is symmetric, so it is computed once and
# repeated on both rows. The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
bjb_rmse = np.sqrt(mean_squared_error(df_bjb["Rainfall (BMKG)"], df_bjb["Rainfall (GSMap)"]))

bjb_table = {
    "Source": bjb_sources,
    "Mean": [df_bjb[source].mean() for source in bjb_sources],
    "Min": [df_bjb[source].min() for source in bjb_sources],
    "Max": [df_bjb[source].max() for source in bjb_sources],
    "StDev": [df_bjb[source].std() for source in bjb_sources],
    "RMSE": [bjb_rmse, bjb_rmse],
}
# Dict insertion order already fixes the column order.
table_bjb = pd.DataFrame(bjb_table)

### **Scatter**

In [21]:
# Both scatter plots share the same axes and an OLS trendline; only the frame
# and the title differ.
scatter_options = {
    "x": "Rainfall (BMKG)",
    "y": "Rainfall (GSMap)",
    "trendline": "ols",
}
scatter_bjm = px.scatter(df_bjm, title="BMKG vs GSMap Banjarmasin", **scatter_options)
scatter_bjb = px.scatter(df_bjb, title="BMKG vs GSMap Banjarbaru", **scatter_options)

### **Bar**

In [22]:
# Group the daily Banjarmasin records by month (bins labelled by month start)
# and average each rainfall column per month.
bjm_month_grouper = pd.Grouper(key="Date", freq="MS")
df_bjm_monthly = df_bjm.groupby(bjm_month_grouper)
df_bjm_monthly_mean = df_bjm_monthly.mean()
bjm_months = list(df_bjm_monthly.groups)

# One bar trace per source. Each trace is bound to its own y-axis (y1 on the
# left, y2 overlaid on the right), so the two axes are scaled independently.
bjm_bar_traces = [
    go.Bar(
        name="Rainfall (BMKG)",
        x=bjm_months,
        y=df_bjm_monthly_mean["Rainfall (BMKG)"],
        yaxis="y1",
        offsetgroup=1,
    ),
    go.Bar(
        name="Rainfall (GSMap)",
        x=bjm_months,
        y=df_bjm_monthly_mean["Rainfall (GSMap)"],
        yaxis="y2",
        offsetgroup=2,
    ),
]
bjm_bar_layout = {
    "yaxis": {"title": "Rainfall (BMKG)"},
    "yaxis2": {"title": "Rainfall (GSMap)", "overlaying": "y1", "side": "right"},
}
bar_bjm = go.Figure(data=bjm_bar_traces, layout=bjm_bar_layout)

In [23]:
# Group the daily Banjarbaru records by month (bins labelled by month start)
# and average each rainfall column per month.
bjb_month_grouper = pd.Grouper(key="Date", freq="MS")
df_bjb_monthly = df_bjb.groupby(bjb_month_grouper)
df_bjb_monthly_mean = df_bjb_monthly.mean()
bjb_months = list(df_bjb_monthly.groups)

In [24]:
# One bar trace per source. Each trace is bound to its own y-axis (y1 on the
# left, y2 overlaid on the right), so the two axes are scaled independently.
bjb_bar_traces = [
    go.Bar(
        name="Rainfall (BMKG)",
        x=bjb_months,
        y=df_bjb_monthly_mean["Rainfall (BMKG)"],
        yaxis="y1",
        offsetgroup=1,
    ),
    go.Bar(
        name="Rainfall (GSMap)",
        x=bjb_months,
        y=df_bjb_monthly_mean["Rainfall (GSMap)"],
        yaxis="y2",
        offsetgroup=2,
    ),
]
bjb_bar_layout = {
    "yaxis": {"title": "Rainfall (BMKG)"},
    "yaxis2": {"title": "Rainfall (GSMap)", "overlaying": "y1", "side": "right"},
}
bar_bjb = go.Figure(data=bjb_bar_traces, layout=bjb_bar_layout)

## **Visualization**

### **Table**

In [25]:
# Display the Banjarmasin summary table (mean, min, max, standard deviation and RMSE per source).
table_bjm

Unnamed: 0,Source,Mean,Min,Max,StDev,RMSE
0,Rainfall (BMKG),10.652792,0.0,110.2,15.137757,20.12575
1,Rainfall (GSMap),9.249136,0.0,86.14,13.389517,20.12575


In [26]:
# Display the Banjarbaru summary table (mean, min, max, standard deviation and RMSE per source).
table_bjb

Unnamed: 0,Source,Mean,Min,Max,StDev,RMSE
0,Rainfall (BMKG),9.450463,0.0,89.4,14.518358,20.54938
1,Rainfall (GSMap),8.717325,0.0,117.23,14.230762,20.54938


In [27]:
# Display the monthly mean rainfall per source for Banjarmasin (rows are labelled by month start).
df_bjm_monthly_mean

Unnamed: 0_level_0,Rainfall (BMKG),Rainfall (GSMap)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-10-01,6.38916,7.463871
2021-11-01,10.173519,15.105
2021-12-01,12.285934,9.991613
2022-01-01,13.960488,6.862581
2022-02-01,10.594942,8.015
2022-03-01,16.257082,10.801613
2022-04-01,8.457597,8.071
2022-05-01,7.011741,7.713871


In [28]:
# Display the monthly mean rainfall per source for Banjarbaru (rows are labelled by month start).
df_bjb_monthly_mean

Unnamed: 0_level_0,Rainfall (BMKG),Rainfall (GSMap)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-10-01,5.151673,5.774839
2021-11-01,10.776698,15.009667
2021-12-01,12.577419,8.778387
2022-01-01,9.364636,6.506129
2022-02-01,9.969659,8.8725
2022-03-01,14.053271,11.035161
2022-04-01,5.991775,7.261
2022-05-01,7.70003,6.671935


### **Scatter**

In [29]:
# Render the Banjarmasin BMKG-vs-GSMap scatter plot with its OLS trendline.
scatter_bjm.show()

In [30]:
# Render the Banjarbaru BMKG-vs-GSMap scatter plot with its OLS trendline.
scatter_bjb.show()

### **Bar**

In [31]:
# Put the two sources side by side for each month and title the figure;
# as the last expression of the cell, the updated figure is displayed.
bar_bjm.update_layout(
    title="BMKG vs GSMap Banjarmasin Monthly Average",
    barmode="group",
)

In [32]:
# Put the two sources side by side for each month and title the figure;
# as the last expression of the cell, the updated figure is displayed.
# The title names Banjarbaru, the city this figure's data (df_bjb) belongs to.
bar_bjb.update_layout(barmode = "group", title = "BMKG vs GSMap Banjarbaru Monthly Average")