<a href="https://colab.research.google.com/github/wendirad/weekFour/blob/task-3/notebooks/task_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler
import datetime as datetime

In [22]:
clean_data = pd.read_csv('/content/clean_data.csv')

In [23]:
clean_data.head()

Unnamed: 0,Date,Store,Store_Type,Store_Status,Promo,Promo2,School_Holiday,Customers,Sales,DayOfWeek,is_holiday
0,2021-01-01,1,supermarket,1,0,0,0,124,0,4,1
1,2021-01-02,1,pharmacy,1,0,0,0,87,0,5,0
2,2021-01-03,1,supermarket,1,1,0,0,74,778,6,0
3,2021-01-04,1,supermarket,0,1,0,0,0,0,0,0
4,2021-01-05,1,pharmacy,1,1,0,0,76,1002,1,0


In [4]:
X = clean_data.drop("Sales", axis=1)
y = clean_data["Sales"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(n_estimators=50, random_state=42)),
    ]
)

pipeline.fit(X_train, y_train)

cross_val_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
print(f"Cross-validated MAE: {-cross_val_scores.mean()}")

Cross-validated MAE: 50.11510273972603


In [24]:
clean_data['Date'] = pd.to_datetime(clean_data['Date'])
clean_data['Year'] = clean_data['Date'].dt.year
clean_data['Month'] = clean_data['Date'].dt.month
clean_data['WeekOfYear'] = clean_data['Date'].dt.isocalendar().week
clean_data['DayOfMonth'] = clean_data['Date'].dt.day
clean_data['IsWeekend'] = clean_data['DayOfWeek'].isin([6, 7]).astype(int)  # 1 for Saturday and Sunday
clean_data['IsMonthStart'] = clean_data['Date'].dt.is_month_start.astype(int)
clean_data['IsMonthEnd'] = clean_data['Date'].dt.is_month_end.astype(int)
clean_data['PromoDuration'] = clean_data.groupby('Store')['Promo'].cumsum()  # Count consecutive promo days
clean_data['PromoOverlap'] = ((clean_data['Promo'] == 1) & (clean_data['Promo2'] == 1)).astype(int)

In [4]:
clean_data = clean_data.sort_values("Date")
clean_data = clean_data[["Date", "Sales"]].set_index("Date")

In [5]:
clean_data["Sales"] = pd.to_numeric(clean_data["Sales"], errors="coerce")
clean_data = clean_data.dropna()

In [6]:
result = adfuller(clean_data["Sales"])
print(f"ADF Statistic: {result[0]}")
print(f"p-value: {result[1]}")
if result[1] > 0.05:
    print("clean_data is not stationary. Differencing the clean_data.")
    clean_data["Sales_diff"] = clean_data["Sales"].diff().dropna()
else:
    print("clean_data is stationary.")


ADF Statistic: -105.63828297796654
p-value: 0.0
clean_data is stationary.


In [25]:
data = pd.DataFrame(clean_data)

In [26]:
data["Date"] = pd.to_datetime(clean_data["Date"])
data["is_weekend"] = clean_data["DayOfWeek"].isin([5, 6]).astype(int)
holidays = ["2021-01-01", "2021-01-03"]
holidays = [pd.Timestamp(h) for h in holidays]
holidays = pd.to_datetime(["2021-01-01", "2021-01-03"])
data["days_to_holiday"] = pd.to_datetime(clean_data["Date"]).apply(
    lambda x: min([(h - x).days for h in holidays if h >= x], default=0)
)
data["days_after_holiday"] = pd.to_datetime(clean_data["Date"]).apply(
    lambda x: min([(x - h).days for h in holidays if h <= x], default=0)
)

# Normalize numerical columns
scaler = MinMaxScaler(feature_range=(-1, 1))
data[["Customers", "Sales", "days_to_holiday", "days_after_holiday"]] = scaler.fit_transform(
    data[["Customers", "Sales", "days_to_holiday", "days_after_holiday"]]
)

data.head()

Unnamed: 0,Date,Store,Store_Type,Store_Status,Promo,Promo2,School_Holiday,Customers,Sales,DayOfWeek,...,WeekOfYear,DayOfMonth,IsWeekend,IsMonthStart,IsMonthEnd,PromoDuration,PromoOverlap,is_weekend,days_to_holiday,days_after_holiday
0,2021-01-01,1,supermarket,1,0,0,0,0.675676,-1.0,4,...,53,1,0,1,0,0,0,0,-1.0,-1.0
1,2021-01-02,1,pharmacy,1,0,0,0,0.175676,-1.0,5,...,53,2,0,0,0,0,0,1,1.0,-0.998168
2,2021-01-03,1,supermarket,1,1,0,0,0.0,-0.132181,6,...,53,3,1,0,0,1,0,1,-1.0,-1.0
3,2021-01-04,1,supermarket,0,1,0,0,-1.0,-1.0,0,...,1,4,0,0,0,2,0,0,-1.0,-0.998168
4,2021-01-05,1,pharmacy,1,1,0,0,0.027027,0.11768,1,...,1,5,0,0,0,3,0,0,-1.0,-0.996337


In [28]:
def create_supervised_data(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        features = data.iloc[i : i + window_size][
            ["Customers", "is_weekend", "days_to_holiday", "days_after_holiday"]
        ].values
        target = data.iloc[i + window_size]["Sales"]
        X.append(features)
        y.append(target)
    return np.array(X), np.array(y)
window_size = 3
X, y = create_supervised_data(data, window_size)


In [29]:
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)

In [30]:
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [31]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h_0 = torch.zeros(2, x.size(0), 50)
        c_0 = torch.zeros(2, x.size(0), 50)
        out, _ = self.lstm(x, (h_0, c_0))
        out = self.fc(out[:, -1, :])
        return out

In [32]:
model = LSTMModel(input_size=4, hidden_size=50, output_size=1)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    for batch_X, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

Epoch 1/10, Loss: 0.4432474672794342
Epoch 2/10, Loss: 0.09630876034498215
Epoch 3/10, Loss: 0.038426365703344345
Epoch 4/10, Loss: 1.1183528900146484
Epoch 5/10, Loss: 0.14064650237560272
Epoch 6/10, Loss: 0.10686271637678146
Epoch 7/10, Loss: 0.9463254809379578
Epoch 8/10, Loss: 0.02347857877612114
Epoch 9/10, Loss: 0.056759122759103775
Epoch 10/10, Loss: 0.10933373868465424


In [33]:
model.eval()

LSTMModel(
  (lstm): LSTM(4, 50, num_layers=2, batch_first=True)
  (fc): Linear(in_features=50, out_features=1, bias=True)
)

In [34]:
new_data = [[124, 1, 1, 1]]
scaled_data = scaler.transform(new_data)
input_tensor = torch.tensor(scaled_data, dtype=torch.float32).unsqueeze(1)
with torch.no_grad():
    prediction = model(input_tensor)
predicted_sales = prediction.item()
predicted_sales
original_sales = scaler.inverse_transform([[predicted_sales, 0, 0, 0]])
print(f"Predicted Sales in original scale: {original_sales[0][0]:.2f}")

Predicted Sales in original scale: 29.14




In [35]:
torch.save(model.state_dict(), 'sales_prediction_model.pth')