In [None]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import acf, adfuller, pacf
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential


In [None]:
# Load the data
# NOTE(review): this is an absolute, machine-specific path; consider reading it
# from a configurable data directory so the notebook runs elsewhere.
train_df = pd.read_csv('/home/enat/Downloads/rossmann-store-sales/train.csv')

# Display the first few rows of the dataset
print("Train Data:")
print(train_df.head())

# Isolate the 'Date' and 'Sales' columns, parse the dates and index by them.
# The frame is built with assign()/set_index() rather than by writing into a
# column slice of train_df: that avoids pandas' SettingWithCopyWarning and
# keeps the cell free of in-place mutation, so it can be re-run safely.
time_series_data = (
    train_df[['Date', 'Sales']]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Resample the data to daily sales (sum of all rows sharing a calendar day)
daily_sales = time_series_data['Sales'].resample('D').sum()

# Display the resampled data
print("Daily Sales Data:")
print(daily_sales.head())


In [None]:
# Check if time series data is stationary
def check_stationarity(timeseries):
    """Run the Augmented Dickey-Fuller test on a series and print the outcome.

    Prints the ADF statistic, the p-value, and then the critical values
    reported by ``adfuller`` (one line per significance level).

    Parameters
    ----------
    timeseries : array-like
        The observations to test; passed unchanged to ``adfuller``.

    Returns
    -------
    None
        Results are only printed.
    """
    # Perform Dickey-Fuller test
    result = adfuller(timeseries)
    print('ADF Statistic:', result[0])
    print('p-value:', result[1])
    # Print the header once, then one indented line per critical value
    # (previously the header was repeated on every loop iteration).
    print('Critical Values:')
    for key, value in result[4].items():
        print(f'   {key}, {value}')

# Run the stationarity check on the undifferenced daily sales series.
check_stationarity(daily_sales)


In [None]:
# First-order differencing: take the day-over-day change, discard the NaN
# entries (diff() has no previous value for the first observation), and
# repeat the stationarity check on the differenced series.
first_difference = daily_sales.diff()
daily_sales_diff = first_difference.dropna()
check_stationarity(daily_sales_diff)


In [None]:
# Check ACF and PACF
# Two side-by-side panels: autocorrelation on the left, partial
# autocorrelation on the right, both computed on the differenced series
# with nlags=50.
fig, (acf_ax, pacf_ax) = plt.subplots(1, 2, figsize=(12, 6))

acf_ax.plot(acf(daily_sales_diff, nlags=50))
acf_ax.set_title('Autocorrelation Function')

pacf_ax.plot(pacf(daily_sales_diff, nlags=50))
pacf_ax.set_title('Partial Autocorrelation Function')

plt.show()


In [None]:
# Create supervised learning data
def create_supervised_data(data, window_size):
    """Turn a sequence into sliding-window (input, target) pairs.

    Sample ``i`` uses ``data[i : i + window_size]`` as its input and
    ``data[i + window_size]`` as its target.

    Parameters
    ----------
    data : array-like
        Sequence of observations along the first axis (list, NumPy array or
        pandas Series). It is converted with ``np.asarray`` so that indexing
        is always positional, regardless of any pandas index labels.
    window_size : int
        Number of consecutive observations in each input window; must be >= 1.

    Returns
    -------
    X : np.ndarray
        Shape ``(n_samples, window_size, *data.shape[1:])``.
    y : np.ndarray
        Shape ``(n_samples, *data.shape[1:])``.
        ``n_samples`` is ``max(len(data) - window_size, 0)``.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    values = np.asarray(data)
    n_samples = len(values) - window_size
    trailing_shape = values.shape[1:]

    if n_samples <= 0:
        # Not enough observations for a single window: return empty arrays
        # that still carry the expected trailing dimensions.
        empty_X = np.empty((0, window_size) + trailing_shape, dtype=values.dtype)
        empty_y = np.empty((0,) + trailing_shape, dtype=values.dtype)
        return empty_X, empty_y

    X = np.stack([values[start:start + window_size] for start in range(n_samples)])
    # Copy so the targets do not alias the caller's array.
    y = values[window_size:].copy()
    return X, y

window_size = 30  # Example window size

# Scale the differenced series into [-1, 1] as a single-feature column,
# then cut it into sliding windows.
# NOTE(review): the scaler is fit on the whole series, including the part
# that later becomes the test split — consider fitting on training data only.
diff_column = daily_sales_diff.values.reshape(-1, 1)
scaled_sales = MinMaxScaler(feature_range=(-1, 1)).fit_transform(diff_column)
X, y = create_supervised_data(scaled_sales, window_size)

# Chronological 80/20 split: the first 80% of windows train the model,
# the remaining 20% are held out for testing.
split_index = int(0.8 * len(X))
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]

print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


In [None]:
# Build and train the LSTM model

# Seed NumPy and TensorFlow so that repeated runs of the notebook start from
# the same random state (exact determinism can still depend on the backend).
np.random.seed(42)
tf.random.set_seed(42)

# Two stacked LSTM layers with dropout, followed by a single-unit regression
# head. The input shape is declared with an explicit Input object rather than
# an `input_shape=` argument on the first layer.
model = Sequential([
    tf.keras.Input(shape=(window_size, 1)),
    LSTM(50, return_sequences=True),   # emit the full sequence for the next LSTM
    Dropout(0.2),
    LSTM(50, return_sequences=False),  # emit only the final state
    Dropout(0.2),
    Dense(1),
])

model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
# NOTE(review): the test split doubles as validation data here, so val_loss is
# not independent of the data used for the final evaluation.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Plot training history
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(history.history['loss'], label='Train Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


In [None]:
# Make predictions and evaluate the model
y_pred = model.predict(X_test)

# Inverse transform the predictions and actual values.
# The model inputs were scaled with feature_range=(-1, 1), so the inverse must
# come from a scaler with the same range fitted on the same differenced
# series; a default (0, 1) scaler would map values back to the wrong scale.
# The scaler is fitted once and reused for both arrays.
inverse_scaler = MinMaxScaler(feature_range=(-1, 1)).fit(daily_sales_diff.values.reshape(-1, 1))
y_pred_inverse = inverse_scaler.inverse_transform(y_pred)
y_test_inverse = inverse_scaler.inverse_transform(y_test.reshape(-1, 1))

# Plot predictions vs actual values
# NOTE(review): both curves are in the units of the differenced series
# (day-over-day change in sales), not absolute sales.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test_inverse, label='Actual Sales')
ax.plot(y_pred_inverse, label='Predicted Sales')
ax.set_title('Actual vs Predicted Sales')
ax.set_xlabel('Time')
ax.set_ylabel('Sales')
ax.legend()
plt.show()

# Calculate mean squared error
mse = mean_squared_error(y_test_inverse, y_pred_inverse)
print(f"Mean Squared Error: {mse}")
