In [None]:
# pip install xgboost
# !pip install keras
# !pip install tensorflow
# !pip install sklearn

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import LSTM, Dense
from datetime import datetime
import matplotlib.pyplot as plt

# Input the data

In [None]:
# Location of the raw grocery sales CSV; kept in one place so it is easy to swap.
DATA_URL = "https://filtereddatasets.s3.amazonaws.com/Groceries_retail/Groceriesv2.csv"

# Read the CSV into a DataFrame and preview the first rows.
data = pd.read_csv(DATA_URL)
data.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) using pandas' default column selection.
data.describe()

In [None]:
# Count the missing values in each column.
data.isna().sum()

# EDA

In [None]:
# Number of rows per product name. Printed explicitly: a notebook cell only
# auto-displays its final expression, so a bare value_counts() call placed
# before another statement is computed and then silently discarded.
print(data["Prod_name"].value_counts())

# Spot-check every row recorded for one specific product.
data.loc[data["Prod_name"] == "Pepper - Gypsy Pepper"]

In [None]:
# Convert the sale date to datetime64 and the sale time to timedelta so they
# can be grouped and compared as real temporal values.
for column, parser in (("Sale_date", pd.to_datetime), ("Sale_time", pd.to_timedelta)):
    data[column] = parser(data[column])

In [None]:
# Treat the product number as a categorical label rather than a quantity.
data["Prod_no"] = pd.Categorical(data["Prod_no"])

# Extract the numeric part of the textual Total_amt column into a float column.
# The dot is escaped so it only matches a literal decimal point (an unescaped
# "." matches any character), and the fractional part is optional so whole
# amounts such as "7" or "15" are parsed instead of silently becoming NaN.
data["Total"] = (
    data["Total_amt"]
    .str.extract(r"(\d+(?:\.\d+)?)", expand=False)
    .astype("float")
)

In [None]:
# Build the list of columns to keep: every column except those named in list_2.
list_1 = data.columns.tolist()
list_2 = ["No_of_units", "Unit_price", "Sale_time", "Total_amt"]
list_new = list(filter(lambda name: name not in list_2, list_1))
list_new

In [None]:
# Restrict the frame to the retained columns and inspect the resulting schema.
data_new = data.loc[:, list_new]
data_new.info()

In [None]:
# Time series: summed Total per sale date, drawn as a line chart.
data_new.groupby("Sale_date")["Total"].sum().plot(kind="line")

In [None]:
# One row per sale date holding the summed Total; Sale_date stays a regular
# column (not the index) so later cells can select it by name.
data_total = data_new.groupby("Sale_date", as_index=False)["Total"].sum()
data_total.info()

In [None]:
# Discard any rows that still contain missing values before modelling.
data_total = data_total.dropna()

# Preprocess the data

In [None]:
# Ordered 80/20 split: the first 80% of rows train the model and the remaining
# rows are held out for testing (no shuffling, so row order is preserved).
train_size = int(len(data_total) * 0.8)
train_data = data_total.iloc[:train_size]
test_data = data_total.iloc[train_size:]

In [None]:
# Fit the [0, 1] scaler on the training totals only, then apply that same
# fitted scaling to the test totals.
scaler = MinMaxScaler(feature_range=(0, 1))
train_data_scaled = scaler.fit_transform(train_data[["Total"]].to_numpy())
test_data_scaled = scaler.transform(test_data[["Total"]].to_numpy())

In [None]:
def create_sequences(data, seq_length):
    """Slice an ordered series into sliding input windows and next-step targets.

    Window ``i`` is ``data[i : i + seq_length]`` and its target is
    ``data[i + seq_length]``, the observation immediately after the window.

    Parameters
    ----------
    data : array-like, shape (n_steps, ...)
        Observations ordered along the first axis.
    seq_length : int
        Number of consecutive steps in each input window.

    Returns
    -------
    x : np.ndarray, shape (n_samples, seq_length, ...)
        The input windows.
    y : np.ndarray, shape (n_samples, ...)
        The target following each window.

    ``n_samples`` is ``len(data) - seq_length``. If the series is too short to
    form a single window, correctly shaped empty arrays are returned so that
    downstream ``.shape`` lookups keep working.
    """
    data = np.asarray(data)

    # Every start index whose target still lies inside the series is valid.
    # Subtracting an extra 1 here would drop the final window/target pair.
    n_samples = max(len(data) - seq_length, 0)
    trailing_shape = data.shape[1:]

    if n_samples == 0:
        empty_x = np.empty((0, seq_length) + trailing_shape, dtype=data.dtype)
        empty_y = np.empty((0,) + trailing_shape, dtype=data.dtype)
        return empty_x, empty_y

    x = np.stack([data[start : start + seq_length] for start in range(n_samples)])
    # Targets are simply the series shifted by seq_length; copy so the result
    # does not alias the caller's array.
    y = np.array(data[seq_length : seq_length + n_samples])

    return x, y

# Window length in time steps; both splits are windowed with the same length.
seq_length = 7
(X_train, y_train), (X_test, y_test) = (
    create_sequences(scaled_split, seq_length)
    for scaled_split in (train_data_scaled, test_data_scaled)
)

# Build LSTM Model

In [None]:
# Build the LSTM model: one 50-unit LSTM layer reading
# (time steps, features) windows, followed by a single-output dense layer.
model = Sequential(
    [
        LSTM(50, input_shape=X_train.shape[1:]),
        Dense(1),
    ]
)
model.compile(optimizer="adam", loss="mean_squared_error")

# Train the model
model.fit(X_train, y_train, epochs=25, batch_size=1, verbose=1)

In [None]:
# Validate the model on the held-out test windows
y_pred = model.predict(X_test)

# Map both the predictions and the targets back from the scaled range to
# the original sales values
y_pred_1 = scaler.inverse_transform(y_pred)
y_validation = scaler.inverse_transform(y_test)

# Error metrics computed in original units
mse = mean_squared_error(y_true=y_validation, y_pred=y_pred_1)
rmse = np.sqrt(mse)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")

In [None]:
def make_predictions(model, last_seq, num_predictions, seq_length):
    """Forecast several steps ahead by feeding each prediction back as input.

    Parameters
    ----------
    model : object exposing ``predict(window)``
        Called with a ``(1, seq_length, 1)`` array; must return a single value.
    last_seq : np.ndarray
        The most recent ``seq_length`` observations (any shape holding exactly
        ``seq_length`` elements).
    num_predictions : int
        How many consecutive steps to forecast.
    seq_length : int
        Length of the input window the model consumes.

    Returns
    -------
    np.ndarray
        The raw ``predict`` outputs stacked along a new first axis.
    """
    window = last_seq.reshape(1, seq_length, 1)
    forecasts = []

    for _ in range(num_predictions):
        next_value = model.predict(window)
        forecasts.append(next_value)

        # Slide the window: drop the oldest step and append the new forecast
        # as the most recent step along the time axis.
        window = np.concatenate(
            (window[:, 1:, :], np.reshape(next_value, (1, 1, 1))),
            axis=1,
        )

    return np.array(forecasts)

# Make predictions for the specified period
num_predictions = 7

# Scale the full series with the scaler exactly as it was fitted on the
# training split. Using transform() rather than fit_transform() matters:
# refitting here would change the min/max behind the scaling, so the model
# would receive inputs on a different scale from the one it was trained on,
# and the shared `scaler` object would be silently altered for other cells.
data_normalized = scaler.transform(data_total['Total'].values.reshape(-1, 1))

# Seed the forecast with the most recent seq_length observations.
last_seq = data_normalized[-seq_length:]
predictions = make_predictions(model, last_seq, num_predictions, seq_length)

In [None]:
# Collapse the stacked predictions to 2-D, keeping the size of the last axis
# as the number of columns.
array_2d = np.reshape(predictions, (-1, predictions.shape[2]))

In [None]:
# Invert scaling to get the actual sales values.
# NOTE: this rebinds `predictions` from the stacked model outputs to the
# inverse-transformed array, so any cell that reads `predictions` sees a
# different object depending on whether this cell has already run.
predictions = scaler.inverse_transform(array_2d)

# Display the predictions, pairing each forecast with a calendar date.
# NOTE(review): the date range is hard-coded — presumably the data ends on
# 2017-12-31; confirm. zip() stops at the shorter input, so a forecast count
# other than seven would be truncated or leave dates unused without warning.
target_dates = pd.date_range(start='2018-01-01', end='2018-01-07')
for date, prediction in zip(target_dates, predictions):
    print(f"Predicted sales for {date.strftime('%Y-%m-%d')}: {prediction[0]:.2f}")

In [None]:
# Flatten the forecasts to a plain list holding the first value of each row.
next_week_sales_lstm = [prediction[0] for prediction in predictions]

# Result plot

In [None]:
# Plot the actual sales history together with the forecast.
# Drawn with matplotlib (already imported as `plt`): the `go` plotting
# namespace is never imported in this notebook, so relying on it raises a
# NameError when the notebook is run from a fresh kernel.

# Dates that label the forecast values
next_week_dates = pd.date_range(start='2018-01-01', end='2018-01-07')

fig, ax = plt.subplots(figsize=(10, 6))

# Actual sales revenue per date
ax.plot(data_total['Sale_date'], data_total['Total'], label='Actual')

# Predicted sales revenue for the forecast dates
ax.plot(next_week_dates, next_week_sales_lstm, label='Predicted')

# Title, axis labels and legend so the figure stands on its own
ax.set_title('Actual vs. Predicted Sales Revenue')
ax.set_xlabel('Date')
ax.set_ylabel('Sales Revenue')
ax.legend()

# Slant the date tick labels so they do not overlap
fig.autofmt_xdate()

# Show the plot
plt.show()