In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

In [2]:
train_data = pd.read_csv("/kaggle/input/fog-daa/train_set.csv")
test_data = pd.read_csv("/kaggle/input/fog-daa/test_set.csv")
val_data = pd.read_csv("/kaggle/input/fog-daa/val_set.csv")

In [3]:
columns_to_drop = ['30min_vsby_km', '1hr_vsby_km', '1.5hr_vsby_km', '2hr_vsby_km', '2.5hr_vsby_km',
                   '6hr_vsby_km', '30min_fog', '1hr_fog', '1.5hr_fog', '2hr_fog', '2.5hr_fog',
                   '3hr_fog', '6hr_fog', '30min_fog_type', '1hr_fog_type', '1.5hr_fog_type',
                   '2hr_fog_type', '2.5hr_fog_type', '3hr_fog_type', '6hr_fog_type', '3hr_onset_cond', 'date_time_IST']


In [4]:
train_data = train_data.drop(columns=columns_to_drop)
val_data = val_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)

In [5]:
combined_data = pd.concat([train_data, val_data])

In [6]:
y_train_val = combined_data['3hr_vsby_km']
y_test = test_data['3hr_vsby_km']

In [7]:
sarima_order = (2, 0, 2)  
seasonal_order = (2, 0, 2, 24) 

In [8]:
sarima_model = SARIMAX(y_train_val, order=sarima_order, seasonal_order=seasonal_order)
sarima_fit = sarima_model.fit()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            9     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  5.65481D-01    |proj g|=  4.82356D-01


 This problem is unconstrained.



At iterate    5    f=  4.90763D-01    |proj g|=  8.32234D-02

At iterate   10    f=  4.70415D-01    |proj g|=  1.97922D-02

At iterate   15    f=  4.70234D-01    |proj g|=  1.43948D-03

At iterate   20    f=  4.67092D-01    |proj g|=  9.49707D-02

At iterate   25    f=  4.64891D-01    |proj g|=  1.98336D-02

At iterate   30    f=  4.64645D-01    |proj g|=  2.42368D-02

At iterate   35    f=  4.64527D-01    |proj g|=  7.55149D-03

At iterate   40    f=  4.63605D-01    |proj g|=  7.76648D-03
  ys=-5.043E-02  -gs= 3.570E-03 BFGS update SKIPPED
  ys=-9.410E-03  -gs= 2.758E-03 BFGS update SKIPPED

At iterate   45    f=  4.59912D-01    |proj g|=  6.97186D-02



 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.



At iterate   50    f=  4.59131D-01    |proj g|=  1.03774D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    9     50    108      2     2     0   1.038D-01   4.591D-01
  F =  0.45913068474533475     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 




In [9]:
print(sarima_fit.summary())

                                     SARIMAX Results                                      
Dep. Variable:                        3hr_vsby_km   No. Observations:                95279
Model:             SARIMAX(2, 0, 2)x(2, 0, 2, 24)   Log Likelihood              -43745.513
Date:                            Thu, 18 Jul 2024   AIC                          87509.025
Time:                                    07:43:30   BIC                          87594.206
Sample:                                         0   HQIC                         87534.932
                                          - 95279                                         
Covariance Type:                              opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.8364      0.003    726.402      0.000       1.831       1.841
ar.L2         -0.8548      0.002   

In [None]:
y_pred = sarima_fit.forecast(steps=len(y_test))

In [None]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

In [None]:
plt.figure(figsize=(12, 8))
plt.plot(y_test.reset_index(drop=True), label='Actual Visibility')
plt.plot(y_pred, label='Predicted Visibility')
plt.title('Actual vs Predicted Visibility using SARIMA')
plt.xlabel('Time')
plt.ylabel('Visibility (km)')
plt.legend()
plt.show()

In [None]:
current_visibility = y_test.iloc[0]  # Assuming first value in the test set is current visibility
predicted_next_3hr_visibility = y_pred[-1]  # Last prediction
actual_next_3hr_visibility = test_data['3hr_vsby_km'].iloc[-1]  # Last actual visibility

print(f'Current Visibility: {current_visibility} km')
print(f'Predicted Visibility for next 3 hours: {predicted_next_3hr_visibility} km')
print(f'Actual Visibility for next 3 hours: {actual_next_3hr_visibility} km')