In [1]:
import pandas as pd
import numpy as np
import random

# Create a list of unique SKUs
sku_list = ["SKU001", "SKU002", "SKU003", "SKU004", "SKU005"]

# Create a list of dates for the entire year
dates = pd.date_range(start='2023-01-01', periods=365, freq='D')

# Generate random sales data for each SKU
data = {
    'Date': [random.choice(dates) for _ in range(365 * len(sku_list))],
    'SKU': [random.choice(sku_list) for _ in range(365 * len(sku_list))],
    'Sales': [random.randint(1000, 10000) for _ in range(365 * len(sku_list))],
    'Price': [round(random.uniform(5, 50), 2) for _ in range(365 * len(sku_list))],
    'Inventory': [random.randint(100, 1000) for _ in range(365 * len(sku_list))],
    'Category': [random.choice(['Electronics', 'Clothing', 'Home Decor', 'Toys', 'Books']) for _ in range(365 * len(sku_list))]
}

# Create a DataFrame
sales_data = pd.DataFrame(data)

# Display the first few rows of the sales data with attributes
print(sales_data.tail())


           Date     SKU  Sales  Price  Inventory    Category
1820 2023-04-24  SKU005   5612   8.48        366  Home Decor
1821 2023-04-10  SKU004   3678  12.14        804    Clothing
1822 2023-01-23  SKU003   4210  43.13        113    Clothing
1823 2023-02-02  SKU004   4303  43.30        235  Home Decor
1824 2023-10-16  SKU002   5754  37.25        707    Clothing


In [2]:
# One-Class SVM
from sklearn.svm import OneClassSVM

# Create a One-Class SVM model
clf = OneClassSVM(nu=0.05, kernel="rbf", gamma=0.1)

# Fit the model to the sales data
clf.fit(sales_data[['Sales']])

# Predict anomalies
anomalies = clf.predict(sales_data[['Sales']])



In [3]:
anomalies

array([-1,  1,  1, ...,  1,  1, -1])

In [4]:
sum(anomalies == -1)

728

In [5]:
len(sales_data)

1825

In [6]:
# DBSCAN

from sklearn.cluster import DBSCAN

# Create a DBSCAN model
dbscan = DBSCAN(eps=1000, min_samples=5)

# Fit the model to the sales data
labels = dbscan.fit_predict(sales_data[['Sales']])

# Label -1 represents outliers/anomalies
anomalies = labels == -1


In [7]:
sum(anomalies)

0

In [8]:
# K-Means Clustering

from sklearn.cluster import KMeans

# Create a K-Means model with a suitable number of clusters
kmeans = KMeans(n_clusters=5)

# Fit the model to the sales data
sales_data['Cluster'] = kmeans.fit_predict(sales_data[['Sales']])

# Anomalies are data points that don't belong to any cluster (cluster -1)
anomalies = sales_data['Cluster'] == -1


  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
sum(anomalies)

0

In [10]:
# RCF

from sklearn.ensemble import IsolationForest

# Create an Isolation Forest model
clf_iforest = IsolationForest(contamination=0.05, random_state=42)

# Fit the model to the sales data
X = np.array(sales_data['Sales']).reshape(-1, 1)
clf_iforest.fit(X)

# Predict anomalies
outliers_iforest = clf_iforest.predict(X)

# Convert the Isolation Forest output to Boolean values
anomalies_iforest = outliers_iforest == -1


In [11]:
anomalies_iforest

array([False, False, False, ..., False, False, False])

In [12]:
sum(anomalies_iforest)

92

In [13]:
# LSTM
# For LSTM-based anomaly detection, you would typically need a time series dataset with sequences. 
# Here's a basic example of using LSTM for sequence-based anomaly detection:
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

# Create sequences of sales data
seq_length = 10
sequences = [np.array(sales_data['Sales'][i:i+seq_length]) for i in range(len(sales_data) - seq_length + 1)]

# Reshape data for LSTM
X = np.array(sequences)

# Create an LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X, X, epochs=50, batch_size=32, verbose=0)

# Calculate reconstruction errors
predictions = model.predict(X)
# Calculate reconstruction errors
reconstruction_errors = np.mean(np.square(X - predictions), axis=1)

# Define a threshold for anomaly detection
threshold = np.percentile(reconstruction_errors, 95)
anomalies_lstm = reconstruction_errors > threshold


2023-11-09 13:21:24.552161: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




In [14]:
sequences

[array([3472, 2517, 9309, 8984, 6709, 1177, 3460, 9149, 2407, 2862]),
 array([2517, 9309, 8984, 6709, 1177, 3460, 9149, 2407, 2862, 1611]),
 array([9309, 8984, 6709, 1177, 3460, 9149, 2407, 2862, 1611, 2547]),
 array([8984, 6709, 1177, 3460, 9149, 2407, 2862, 1611, 2547, 4576]),
 array([6709, 1177, 3460, 9149, 2407, 2862, 1611, 2547, 4576, 2961]),
 array([1177, 3460, 9149, 2407, 2862, 1611, 2547, 4576, 2961, 9108]),
 array([3460, 9149, 2407, 2862, 1611, 2547, 4576, 2961, 9108, 8451]),
 array([9149, 2407, 2862, 1611, 2547, 4576, 2961, 9108, 8451, 3137]),
 array([2407, 2862, 1611, 2547, 4576, 2961, 9108, 8451, 3137, 3379]),
 array([2862, 1611, 2547, 4576, 2961, 9108, 8451, 3137, 3379, 9989]),
 array([1611, 2547, 4576, 2961, 9108, 8451, 3137, 3379, 9989, 6551]),
 array([2547, 4576, 2961, 9108, 8451, 3137, 3379, 9989, 6551, 4810]),
 array([4576, 2961, 9108, 8451, 3137, 3379, 9989, 6551, 4810, 2586]),
 array([2961, 9108, 8451, 3137, 3379, 9989, 6551, 4810, 2586, 9100]),
 array([9108, 8451, 

In [15]:
anomalies_lstm

array([False,  True,  True, ..., False, False, False])

In [16]:
sum(anomalies_lstm)

91

In [17]:
# GMM-PCA:

from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

# Apply PCA for dimensionality reduction
pca = PCA(n_components=1)
X_pca = pca.fit_transform(sales_data[['Sales']])

# Create a Gaussian Mixture Model
gmm = GaussianMixture(n_components=2, covariance_type='full')

# Fit the model to the reduced data
gmm.fit(X_pca)

# Predict anomalies using Mahalanobis distance
mahal = gmm.score_samples(X_pca)
threshold = np.percentile(mahal, 5)
anomalies_gmm_pca = mahal < threshold


In [18]:
anomalies_gmm_pca

array([False, False, False, ..., False, False, False])

In [19]:
sum(anomalies_gmm_pca)

91

In [20]:
from keras.models import Model
from keras.layers import Input, Dense

# Create sequences of sales data
seq_length = 10
sequences = [np.array(sales_data['Sales'][i:i+seq_length]) for i in range(len(sales_data) - seq_length + 1)]

# Reshape data for Autoencoder
X = np.array(sequences)  # Shape should be (number of sequences, seq_length)

# Create an Autoencoder model
input_layer = Input(shape=(seq_length,))
encoded = Dense(10, activation='relu')(input_layer)
decoded = Dense(seq_length, activation='linear')(encoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Fit the model to the sales data
autoencoder.fit(X, X, epochs=50, batch_size=32, verbose=0)

# Calculate reconstruction errors
predictions = autoencoder.predict(X)
reconstruction_errors_autoencoder = np.mean(np.square(X - predictions), axis=1)

# Define a threshold for anomaly detection
threshold = np.percentile(reconstruction_errors_autoencoder, 95)
anomalies_autoencoder = reconstruction_errors_autoencoder > threshold




In [21]:
anomalies_autoencoder

array([False, False, False, ..., False, False, False])

In [22]:
sum(anomalies_autoencoder)

91

### More on LSTM


In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense


In [24]:
# Define the sequence length for LSTM
seq_length = 10

# Standardize numerical features (Price, Inventory)
scaler = StandardScaler()
sales_data['Price'] = scaler.fit_transform(sales_data['Price'].values.reshape(-1, 1))
sales_data['Inventory'] = scaler.fit_transform(sales_data['Inventory'].values.reshape(-1, 1))



sales_data

Unnamed: 0,Date,SKU,Sales,Price,Inventory,Category,Cluster
0,2023-09-27,SKU005,3472,-1.507435,0.138980,Electronics,2
1,2023-06-15,SKU003,2517,-1.229992,0.107489,Toys,0
2,2023-12-15,SKU005,9309,1.707991,1.485229,Books,3
3,2023-01-12,SKU004,8984,-0.650650,1.107335,Clothing,3
4,2023-12-03,SKU002,6709,-1.148976,0.890833,Electronics,4
...,...,...,...,...,...,...,...
1820,2023-04-24,SKU005,5612,-1.422597,-0.683728,Home Decor,4
1821,2023-04-10,SKU004,3678,-1.142862,1.040416,Clothing,2
1822,2023-01-23,SKU003,4210,1.225715,-1.679638,Clothing,2
1823,2023-02-02,SKU004,4303,1.238708,-1.199397,Home Decor,2


In [25]:
# Create sequences of data
sequences = [sales_data[['Sales', 'Price', 'Inventory']].iloc[i:i+seq_length].values for i in range(len(sales_data) - seq_length + 1)]

# Reshape data for LSTM
X = np.array(sequences)

from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

# Create an LSTM model with matching input and output shapes
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, 3), return_sequences=True))  # Updated input shape to accommodate 3 features and return sequences
model.add(Dense(3))  # Updated output shape to match input shape
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X, X, epochs=50, batch_size=32, verbose=0)

# Calculate reconstruction errors
predictions = model.predict(X)
# Calculate reconstruction errors
reconstruction_errors = np.mean(np.square(X - predictions), axis=1)

# Define a threshold for anomaly detection
threshold = np.percentile(reconstruction_errors, 95)
anomalies_lstm = reconstruction_errors > threshold

# Display anomalies
anomalies_df = sales_data.iloc[seq_length - 1:][anomalies_lstm]
print(anomalies_df)


           Date     SKU  Sales     Price  Inventory     Category  Cluster
11   2023-03-09  SKU004   2547 -0.747717  -1.041941        Books        0
12   2023-05-04  SKU001   4576  0.061681  -1.041941        Books        2
13   2023-03-04  SKU001   2961  1.320489   1.217554     Clothing        0
23   2023-10-21  SKU004   7797 -1.519664  -0.467226     Clothing        1
31   2023-02-09  SKU003   6228 -0.153853  -0.502654  Electronics        4
...         ...     ...    ...       ...        ...          ...      ...
1807 2023-06-01  SKU005   5636  1.585702   1.760777  Electronics        4
1812 2023-08-11  SKU005   5958 -0.623135  -0.593191  Electronics        4
1813 2023-03-19  SKU005   9536  0.342180   1.371074   Home Decor        3
1820 2023-04-24  SKU005   5612 -1.422597  -0.683728   Home Decor        4
1822 2023-01-23  SKU003   4210  1.225715  -1.679638     Clothing        2

[273 rows x 7 columns]


The output you provided seems to be a DataFrame containing records for which the LSTM model detected anomalies. Let's break down what each column represents:

Date: The date of the sales record.

SKU: The Stock Keeping Unit, which identifies the product associated with the sales record.

Sales: The sales quantity for the specific SKU on the given date. These are the sales values where the anomalies were detected.

Price: The standardized price of the SKU for the corresponding date. It appears that the values have been scaled, and higher values may indicate a higher price compared to the SKU's historical data.

Inventory: The standardized inventory level of the SKU for the corresponding date. Similarly, the values have been scaled, and higher values may indicate higher inventory levels relative to historical data.

Category: The category of the SKU, which classifies the product into a particular group.

In the context of anomaly detection, these records are flagged as anomalies because they exhibit unusual behavior compared to the historical data for the specific SKU. The anomalies could be due to various factors such as unusually high or low sales, pricing changes, or inventory fluctuations compared to the SKU's historical patterns.

Analyzing these anomalies can help you identify specific products or categories that require further investigation or action. For example, you may want to examine why there was a significant spike or drop in sales for certain SKUs or whether there are any specific patterns in the anomalies related to SKU category.

These anomalies could potentially indicate areas where you should focus your attention to optimize sales and inventory management.

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

import pandas as pd
import numpy as np
import random

# Create a list of unique SKUs
sku_list = ["SKU001", "SKU002", "SKU003", "SKU004", "SKU005"]

# Create a list of dates for the entire year
dates = pd.date_range(start='2023-01-01', periods=365, freq='D')

# Generate random sales data for each SKU
data = {
    'Date': [random.choice(dates) for _ in range(365 * len(sku_list))],
    'SKU': [random.choice(sku_list) for _ in range(365 * len(sku_list))],
    'Sales': [random.randint(1000, 10000) for _ in range(365 * len(sku_list))],
    'Price': [round(random.uniform(5, 50), 2) for _ in range(365 * len(sku_list))],
    'Inventory': [random.randint(100, 1000) for _ in range(365 * len(sku_list))],
    'Category': [random.choice(['Electronics', 'Clothing', 'Home Decor', 'Toys', 'Books']) for _ in range(365 * len(sku_list))]
}

# Create a DataFrame
sales_data = pd.DataFrame(data)

# Display the first few rows of the sales data with attributes
print(sales_data.tail())



# Define the sequence length for LSTM
seq_length = 10

# Standardize numerical features (Price, Inventory)
scaler = StandardScaler()
sales_data['Price'] = scaler.fit_transform(sales_data['Price'].values.reshape(-1, 1))
sales_data['Inventory'] = scaler.fit_transform(sales_data['Inventory'].values.reshape(-1, 1))



# Create sequences of data
sequences = [sales_data[['Sales', 'Price', 'Inventory']].iloc[i:i+seq_length].values for i in range(len(sales_data) - seq_length + 1)]

# Reshape data for LSTM
X = np.array(sequences)

from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

# Create an LSTM model with matching input and output shapes
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, 3), return_sequences=True))  # Updated input shape to accommodate 3 features and return sequences
model.add(Dense(3))  # Updated output shape to match input shape
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X, X, epochs=50, batch_size=32, verbose=0)

# Calculate reconstruction errors
predictions = model.predict(X)
# Calculate reconstruction errors
reconstruction_errors = np.mean(np.square(X - predictions), axis=1)

# Define a threshold for anomaly detection
threshold = np.percentile(reconstruction_errors, 95)
anomalies_lstm = reconstruction_errors > threshold

# Display anomalies
anomalies_df = sales_data.iloc[seq_length - 1:][anomalies_lstm]
print(anomalies_df)




           Date     SKU  Sales  Price  Inventory     Category
1820 2023-05-04  SKU005   7074  48.16        747         Toys
1821 2023-04-28  SKU003   7995  44.09        313         Toys
1822 2023-12-29  SKU001   7905  19.82        726   Home Decor
1823 2023-12-12  SKU005   8886  16.52        591   Home Decor
1824 2023-02-24  SKU004   4721   9.03        711  Electronics
           Date     SKU  Sales     Price  Inventory     Category
13   2023-01-05  SKU005   1107 -0.007654  -0.566819   Home Decor
18   2023-07-15  SKU003   7509  0.161974   1.228395  Electronics
22   2023-06-12  SKU001   5956  1.560441  -1.408446   Home Decor
28   2023-06-04  SKU002   4548  1.687854  -0.999215  Electronics
39   2023-03-28  SKU005   3293 -0.571032   1.653069   Home Decor
...         ...     ...    ...       ...        ...          ...
1783 2023-06-05  SKU001   7922  1.034673   0.209177        Books
1784 2023-03-28  SKU005   1687 -0.453598  -1.694136        Books
1786 2023-06-13  SKU001   1190 -0.727612   