In [1]:
import os
import random
import requests
import warnings
from datetime import datetime

import holidays
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import itertools
import pickle


# Suppress warnings
# The filter is given no category or module, so it applies to every warning
# raised from here on, including any emitted while the libraries imported
# below are being loaded.
# NOTE(review): this also hides deprecation/convergence warnings that may
# matter when fitting models — consider narrowing the filter; confirm intent.
warnings.filterwarnings('ignore')

# Statistical and ML libraries
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif

# Time series specific libraries
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from scipy import stats 

# Deep learning libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Two candidate locations for the same CSV; the first one that exists wins.
# NOTE(review): both are absolute, user-specific Windows paths — presumably one
# per workstation; confirm, and consider a path relative to a configurable
# data directory so the notebook runs elsewhere.
path1 = r'C:\Users\LENOVO\MachineLearningProhects\TimeSeriesForecasting_End_To_End\data\timeseries_model_data.csv'
path2 = r'C:\Users\TPWODL\New folder_Content\TimeSeriesForecasting_End_To_End\data\timeseries_model_data.csv'

# Fail fast when neither candidate is present on this machine.
if not (os.path.exists(path1) or os.path.exists(path2)):
    raise FileNotFoundError("❌ No valid data file found in either specified path.")

# path1 takes precedence; path2 is only read when path1 is missing.
new_df = pd.read_csv(path1 if os.path.exists(path1) else path2)

In [3]:
# Preview the first two rows to check which columns were loaded from the CSV.
new_df.head(2)

Unnamed: 0,time,temp_max_C,precip_mm,day_of_week,month,wind_speed_max_m_s,Complaint_Count,year,relative_humidity_2m_mean,weather_label,Complaint_Count_diff1
0,2022-06-10,41.0,0.1,4,6,16.6,10,2022,47,1,
1,2022-06-11,41.0,0.2,5,6,25.3,12,2022,42,1,2.0


In [9]:
# Convert the "time" column to pandas datetimes, overwriting it in place on new_df.
new_df["time"] = new_df["time"].pipe(pd.to_datetime)


In [11]:
# Build the working frame `df` from new_df without these four columns;
# new_df itself is left untouched.
df = new_df.drop(
    labels=[
        'relative_humidity_2m_mean',
        'weather_label',
        'year',
        'Complaint_Count_diff1',
    ],
    axis="columns",
)

In [14]:
# Print the column dtypes, non-null counts and memory usage of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1183 entries, 0 to 1182
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   time                1183 non-null   datetime64[ns]
 1   temp_max_C          1183 non-null   float64       
 2   precip_mm           1183 non-null   float64       
 3   day_of_week         1183 non-null   int64         
 4   month               1183 non-null   int64         
 5   wind_speed_max_m_s  1183 non-null   float64       
 6   Complaint_Count     1183 non-null   int64         
dtypes: datetime64[ns](1), float64(3), int64(3)
memory usage: 64.8 KB


In [15]:
# Use the 'time' column as the index of the working frame.
# The guard makes this cell safe to re-run: once 'time' has become the index it
# is no longer a column, so an unconditional set_index would raise KeyError on
# a second execution. If the index is not 'time' and the column is missing,
# set_index still raises, so a genuinely malformed frame is not hidden.
if df.index.name != 'time':
    # Rebind instead of inplace=True so the result comes from an explicit assignment.
    df = df.set_index('time')

In [17]:
# Display the working frame; the rendered output abbreviates the middle rows
# with "..." rather than listing every row.
df

Unnamed: 0_level_0,temp_max_C,precip_mm,day_of_week,month,wind_speed_max_m_s,Complaint_Count
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-06-10,41.0,0.1,4,6,16.6,10
2022-06-11,41.0,0.2,5,6,25.3,12
2022-06-12,39.8,0.0,6,6,16.7,5
2022-06-13,40.2,0.0,0,6,13.3,15
2022-06-14,39.2,8.6,1,6,19.0,16
...,...,...,...,...,...,...
2025-10-15,31.4,0.0,2,10,6.1,18
2025-10-16,30.9,0.0,3,10,8.2,18
2025-10-17,31.0,0.0,4,10,7.8,17
2025-10-18,30.5,0.0,5,10,6.3,37
