In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from pmdarima.arima import auto_arima, ADFTest
from statsmodels.tsa import stattools
from statsmodels.tsa.statespace.sarimax import SARIMAX 
from sklearn.model_selection import train_test_split

In [2]:
root_dir = os.getcwd()
datafile = os.path.join(root_dir, 'weatherdata_final.csv')

In [3]:
df = pd.read_csv(datafile)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1916800 entries, 0 to 1916799
Data columns (total 7 columns):
 #   Column  Dtype  
---  ------  -----  
 0   lat     float64
 1   lon     float64
 2   date    object 
 3   temp    float64
 4   prec    float64
 5   sph     float64
 6   sp      float64
dtypes: float64(6), object(1)
memory usage: 102.4+ MB


In [4]:
# Specify the latitude and longitude pair you want to select
selected_lat = 26.75
selected_lon = 80.25

# Filter the DataFrame to select data for the specified lat and lon
selected_data = df[(df['lat'] == selected_lat) & (df['lon'] == selected_lon)]
selected_data['date'] = pd.to_datetime(selected_data["date"])
selected_data.set_index("date", inplace=True)
selected_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 14975 entries, 1982-01-01 to 2022-12-31
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   lat     14975 non-null  float64
 1   lon     14975 non-null  float64
 2   temp    14975 non-null  float64
 3   prec    14975 non-null  float64
 4   sph     14975 non-null  float64
 5   sp      14975 non-null  float64
dtypes: float64(6)
memory usage: 818.9 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['date'] = pd.to_datetime(selected_data["date"])


In [6]:

daily_data = selected_data.resample('1D').mean()

In [8]:
correlations = daily_data.corr()
correlations

Unnamed: 0,lat,lon,temp,prec,sph,sp
lat,,,,,,
lon,,,,,,
temp,,,1.0,0.100946,0.42411,-0.593301
prec,,,0.100946,1.0,0.441818,-0.204365
sph,,,0.42411,0.441818,1.0,-0.461113
sp,,,-0.593301,-0.204365,-0.461113,1.0


In [14]:
autocorrelation = stattools.acf(daily_data['temp'], nlags=87)
autocorrelation

array([ 1.        ,  0.98700059,  0.97052343,  0.95821788,  0.95046175,
        0.94489924,  0.9401569 ,  0.93558191,  0.93097714,  0.92617907,
        0.92101587,  0.91598761,  0.91100824,  0.9060406 ,  0.90079426,
        0.89501612,  0.88903357,  0.88284537,  0.87588936,  0.86862441,
        0.86130794,  0.85422005,  0.84676467,  0.83897906,  0.83061947,
        0.82197024,  0.8131473 ,  0.80442318,  0.79568581,  0.78660735,
        0.77718617,  0.76724861,  0.75653564,  0.74566594,  0.73471176,
        0.7237127 ,  0.71267028,  0.70141075,  0.6894745 ,  0.6772658 ,
        0.66484483,  0.65249093,  0.63975411,  0.62692148,  0.61398523,
        0.6012331 ,  0.58825692,  0.57495564,  0.56133373,  0.54740033,
        0.53370423,  0.52032779,  0.50700656,  0.49311139,  0.47871502,
        0.46432891,  0.45000074,  0.43576979,  0.42148624,  0.40684639,
        0.39195177,  0.37709384,  0.36271308,  0.3483334 ,  0.33386409,
        0.3189868 ,  0.3039978 ,  0.28884916,  0.27354385,  0.25

In [15]:
adf_test = ADFTest(daily_data['temp'])
adf_test.should_diff(daily_data['temp'])

(0.01,
 date
 1982-01-01    False
 1982-01-02    False
 1982-01-03    False
 1982-01-04    False
 1982-01-05    False
               ...  
 2022-12-27    False
 2022-12-28    False
 2022-12-29    False
 2022-12-30    False
 2022-12-31    False
 Freq: D, Name: temp, Length: 14975, dtype: bool)

In [28]:
differenced_data = daily_data['temp'].diff(periods=7).dropna()

In [26]:
pacf = stattools.pacf(daily_data['temp'], nlags=10)
pacf

array([ 1.        ,  0.98706651, -0.1419168 ,  0.17558948,  0.127181  ,
        0.07005452,  0.06241822,  0.04358866,  0.03203797,  0.01778778,
        0.00434299])

In [29]:
kpss_stat, pvalue, critical_value, critical_values = stattools.kpss(differenced_data, regression="c")
print(f'KPSS statistic: {kpss_stat}')
print(f'p-value: {pvalue}')
print(f'Critial Values: {critical_value}')
print(f'Critial Values: {critical_values}')

KPSS statistic: 0.022469968958192733
p-value: 0.1
Critial Values: 11
Critial Values: {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739}


look-up table. The actual p-value is greater than the p-value returned.



In [32]:
model  = auto_arima(differenced_data, order=(1, 1, 1), seasonal_order=(0, 1, 1, 365), max_order=5, trace=True)

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=54443.584, Time=6.37 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=71128.235, Time=0.19 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=55972.117, Time=0.43 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=60663.181, Time=1.17 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=71126.238, Time=0.11 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=54881.882, Time=1.98 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=54885.399, Time=1.94 sec
 ARIMA(3,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=14.28 sec
 ARIMA(2,0,3)(0,0,0)[0] intercept   : AIC=54750.348, Time=5.77 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=54931.378, Time=1.13 sec
 ARIMA(1,0,3)(0,0,0)[0] intercept   : AIC=54883.311, Time=2.90 sec
 ARIMA(3,0,1)(0,0,0)[0] intercept   : AIC=54887.371, Time=4.14 sec
 ARIMA(3,0,3)(0,0,0)[0] intercept   : AIC=54434.518, Time=10.84 sec
 ARIMA(4,0,3)(0,0,0)[0] intercept   : AIC=inf, Time=17.94 sec
 ARIMA(3,0,4)(0,0,0)[0] inte

In [31]:
model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,14968.0
Model:,"SARIMAX(3, 0, 3)",Log Likelihood,-27209.259
Date:,"Tue, 05 Sep 2023",AIC,54432.519
Time:,22:28:50,BIC,54485.815
Sample:,01-08-1982,HQIC,54450.204
,- 12-31-2022,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.5513,0.051,10.899,0.000,0.452,0.650
ar.L2,0.7682,0.018,41.534,0.000,0.732,0.804
ar.L3,-0.4772,0.028,-16.835,0.000,-0.533,-0.422
ma.L1,0.5326,0.053,10.025,0.000,0.428,0.637
ma.L2,-0.6046,0.074,-8.219,0.000,-0.749,-0.460
ma.L3,-0.3531,0.027,-13.307,0.000,-0.405,-0.301
sigma2,2.2204,0.010,230.632,0.000,2.202,2.239

0,1,2,3
Ljung-Box (L1) (Q):,19.97,Jarque-Bera (JB):,239217.03
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,1.23,Skew:,-0.16
Prob(H) (two-sided):,0.0,Kurtosis:,22.58


In [None]:


final_model = SARIMAX(differenced_data, order=(3,1,3), seasonal_order=(1, 2, 0, 365)).fit()

In [None]:
prediction_from_final_model = final_model.predict(start="2020-12-08", end="2022-12-31")
prediction_from_final_model = pd.DataFrame(prediction_from_final_model)

In [None]:
prediction_from_final_model.info()

In [None]:
testing_data = total_data["temp"]["2020-12-08":]
testing_data = pd.DataFrame(testing_data)
testing_data.info()

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(testing_data.index, testing_data, label='Observed')
plt.plot(prediction_from_final_model.index, prediction_from_final_model, label='Predicted')
plt.legend(loc='upper left')
plt.show()