In [1]:
# import libraries

import os
import pandas as pd
import numpy as np
import plotly.express as px

# pandas display settings: show every column and row, and use very wide
# limits for cell width and total line width when frames are printed
DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_colwidth': 10000,
    'display.max_rows': None,
    'display.width': 10000,
}

for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [2]:
# load the historical records from disk

ordered_df = pd.read_csv("data/historical_data.csv")

# keep only the years that have a full year of recorded data:
# strictly after 1981 and strictly before 2024
in_full_years = (ordered_df['year'] > 1981) & (ordered_df['year'] < 2024)
df = ordered_df[in_full_years]

# the first rows should be from 1982 ...
print(df.head())
print("-"*50)

# ... and the last rows should be from 2023
print(df.tail())

    station compiled_date  year  month  date  mean_temp  min_temp  max_temp  tot_daily_rf  highest_30_min_rf  highest_60_min_rf  highest_120_min_rf  mean_wind_speed  max_wind_speed
731  Changi    1982-01-01  1982      1     1       25.3      23.0      29.4          22.7                NaN                NaN                 NaN              NaN             NaN
732  Changi    1982-01-02  1982      1     2       24.7      23.5      26.2          31.4                NaN                NaN                 NaN              NaN             NaN
733  Changi    1982-01-03  1982      1     3       25.7      24.0      27.2           0.3                NaN                NaN                 NaN              NaN             NaN
734  Changi    1982-01-04  1982      1     4       26.3      24.1      29.8           0.0                NaN                NaN                 NaN              NaN             NaN
735  Changi    1982-01-05  1982      1     5       25.8      23.5      28.8           3.1      

In [3]:
# preview the opening rows of df (head() with its default row count)
first_rows = df.head()
print(first_rows)

    station compiled_date  year  month  date  mean_temp  min_temp  max_temp  tot_daily_rf  highest_30_min_rf  highest_60_min_rf  highest_120_min_rf  mean_wind_speed  max_wind_speed
731  Changi    1982-01-01  1982      1     1       25.3      23.0      29.4          22.7                NaN                NaN                 NaN              NaN             NaN
732  Changi    1982-01-02  1982      1     2       24.7      23.5      26.2          31.4                NaN                NaN                 NaN              NaN             NaN
733  Changi    1982-01-03  1982      1     3       25.7      24.0      27.2           0.3                NaN                NaN                 NaN              NaN             NaN
734  Changi    1982-01-04  1982      1     4       26.3      24.1      29.8           0.0                NaN                NaN                 NaN              NaN             NaN
735  Changi    1982-01-05  1982      1     5       25.8      23.5      28.8           3.1      

In [4]:
# list the column labels available in df
column_labels = df.columns
print(column_labels)

Index(['station', 'compiled_date', 'year', 'month', 'date', 'mean_temp', 'min_temp', 'max_temp', 'tot_daily_rf', 'highest_30_min_rf', 'highest_60_min_rf', 'highest_120_min_rf', 'mean_wind_speed', 'max_wind_speed'], dtype='object')


## Feature Engineering


In [5]:
# build the feature frame as a new object so df itself is left untouched;
# each new column is a per-row difference between two temperature columns
fe_df = df.assign(
    # spread between the recorded maximum and minimum
    intraday_temp_difference = df["max_temp"] - df["min_temp"],
    # how far the maximum sits above the mean
    intraday_temp_increase = df["max_temp"] - df["mean_temp"],
    # minimum minus mean (negative whenever the minimum is below the mean)
    intraday_temp_decrease = df["min_temp"] - df["mean_temp"],
)

print(fe_df.tail())

      station compiled_date  year  month  date  mean_temp  min_temp  max_temp  tot_daily_rf  highest_30_min_rf  highest_60_min_rf  highest_120_min_rf  mean_wind_speed  max_wind_speed  intraday_temp_difference  intraday_temp_increase  intraday_temp_decrease
16066  Changi    2023-12-27  2023     12    27       26.3      24.4      28.6          22.2                9.0               11.0                16.0             10.3            42.6                       4.2                     2.3                    -1.9
16067  Changi    2023-12-28  2023     12    28       27.1      25.0      30.8          32.4               21.8               24.6                26.0              8.4            40.7                       5.8                     3.7                    -2.1
16068  Changi    2023-12-29  2023     12    29       27.6      24.9      31.2           8.4                6.2                9.6                10.6             11.6            31.5                       6.3                     

# Questions to answer

1. What date(s) had the greatest & least:
    - rainfall
    - temperature
    - temperature difference from the average temperature
    - wind speed
    - downpour intensity (highest 30-, 60- and 120-minute rainfall)

2. What is the trend for:
    - rainfall
    - temperature
    - wind speed

## Temperature

### Mean Temperature from 1982 to 2023

In [6]:
# Summary statistics for the temperature columns of df

# extremes come from the min/max columns, not from the daily mean
lowest_temp = df["min_temp"].min()
highest_temp = df["max_temp"].max()

# central tendency of the daily mean temperature
daily_mean_temp = df["mean_temp"]
mean_temp = daily_mean_temp.mean()
median_temp = daily_mean_temp.median()

# use np.nanpercentile to ignore nan values; both quartiles in a single call
percentile_25, percentile_75 = np.nanpercentile(daily_mean_temp, [25, 75])

temperature_summary = f"""

The lowest recorded temp: {lowest_temp} deg Celcius
The highest recorded temp: {highest_temp} deg Celcius

The mean temperature: {mean_temp:.2f} deg Celcius
The median temperature: {median_temp} deg Celcius


--- not sure how to use it yet ---
The 25th percentile of mean temperature: {percentile_25} deg Celcius
The 75th percentile of mean temperature: {percentile_75} deg Celcius


"""

print(temperature_summary)



The lowest recorded temp: 20.2 deg Celcius
The highest recorded temp: 36.0 deg Celcius

The mean temperature: 27.69 deg Celcius
The median temperature: 27.7 deg Celcius


--- not sure how to use it yet ---
The 25th percentile of mean temperature: 26.9 deg Celcius
The 75th percentile of mean temperature: 28.6 deg Celcius





In [7]:
# Line chart of the daily mean temperature against the compiled_date column.
#
# The x-axis limits are written as ISO date strings (YYYY-MM-DD), the same
# year-first layout as the compiled_date values. The previous
# "01-01-1982" / "01-01-2025" form is not year-first, so it cannot be read
# unambiguously as a date on a plotly date axis.
fig = px.line(data_frame = df, x = "compiled_date", y = "mean_temp",
                title = "Mean Temp (1982 - 2023)", range_x = ["1982-01-01", "2025-01-01"],
                labels = {"compiled_date": "Years",
                            "mean_temp": "Mean Temperature"})

fig.show()


In [8]:
# mean temperature per year as a scatter plot, with an OLS trendline in red

fig = px.scatter(
    data_frame = df,
    x = "year",
    y = "mean_temp",
    title = "Mean Temperature from 1982 to 2023",
    labels = {
        "mean_temp": "Mean Temperature",
        "year": "Year",
    },
    trendline = "ols",
    trendline_color_override = "red",
)

fig.show()

In [9]:
# recorded maximum temperature per year, with an OLS trendline in red

fig = px.scatter(
    data_frame = df,
    x = "year",
    y = "max_temp",
    title = "Recorded Max Temperature (1982 - 2023)",
    labels = {
        "year": "Year",
        "max_temp": "Max Temp",
    },
    trendline = "ols",
    trendline_color_override = "red",
)

fig.show()

In [10]:
# recorded minimum temperature per year, with an OLS trendline in red

fig = px.scatter(
    data_frame = df,
    x = "year",
    y = "min_temp",
    title = "Recorded min Temperature (1982 - 2023)",
    labels = {
        "year": "Year",
        "min_temp": "min Temp",
    },
    trendline = "ols",
    trendline_color_override = "red",
)

fig.show()

## Checking for violations of the linear regression (OLS) assumptions

- The true relationship is linear
- Errors are normally distributed
- Homoscedasticity of errors (or, equal variance around the line).
- Independence of the observations

In [17]:
# Normality check of the daily mean temperature for the years before 2023:
# a Kolmogorov-Smirnov test plus a histogram of the same values
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import kstest

# split the records into the years before 2023 and 2023 itself
data_before_2023 = df[df['year'] < 2023]
data_2023 = df[df["year"] == 2023]

# compare the sample with a normal distribution whose mean and standard
# deviation are taken from that same sample
# NOTE(review): the reference parameters are estimated from the data being
# tested; confirm that the plain KS p-value is appropriate for that case
mean_temp_before_2023 = data_before_2023["mean_temp"]
normal_params = (mean_temp_before_2023.mean(), mean_temp_before_2023.std())
ks_stat, p_val = kstest(mean_temp_before_2023, "norm", args = normal_params)

# visual check of the distribution's shape
fig = px.histogram(mean_temp_before_2023)
fig.show()

print(f"Kolmogorov-Smirnov Test: \nStatistic: {ks_stat}\np-value: {p_val}")

Kolmogorov-Smirnov Test: 
Statistic: 0.03845472440831654
p-value: 1.0222003393279843e-19


In [None]:
from scipy.stats import kstest

# Kolmogorov-Smirnov test of the pre-2023 daily mean temperatures against a
# normal distribution that uses the sample's own mean and standard deviation
pre_2023_mean_temp = data_before_2023["mean_temp"]
ks_stat, p_value = kstest(
    pre_2023_mean_temp,
    "norm",
    args=(pre_2023_mean_temp.mean(), pre_2023_mean_temp.std()),
)
print(f"Kolmogorov-Smirnov Test: Statistic={ks_stat}, p-value={p_value}")

In [12]:
# collect the rows of df whose mean_temp is missing, then report how many
# there are and show the last few of them
missing_mean_temp = df["mean_temp"].isna()
nan_values = df[missing_mean_temp]

print(len(nan_values))
print(nan_values.tail())

0
Empty DataFrame
Columns: [station, compiled_date, year, month, date, mean_temp, min_temp, max_temp, tot_daily_rf, highest_30_min_rf, highest_60_min_rf, highest_120_min_rf, mean_wind_speed, max_wind_speed]
Index: []
