In [5]:
import pandas as pd
import numpy as np
import random

# Create a list of unique SKUs
sku_list = ["SKU001", "SKU002", "SKU003", "SKU004", "SKU005"]

# Create a list of dates for the entire year
dates = pd.date_range(start='2023-01-01', periods=365, freq='D')

# Generate random sales data for each SKU
data = {
    'Date': [random.choice(dates) for _ in range(365 * len(sku_list))],
    'SKU': [random.choice(sku_list) for _ in range(365 * len(sku_list))],
    'Sales': [random.randint(1000, 10000) for _ in range(365 * len(sku_list))],
    'Price': [round(random.uniform(5, 50), 2) for _ in range(365 * len(sku_list))],
    'Inventory': [random.randint(100, 1000) for _ in range(365 * len(sku_list))],
    'Category': [random.choice(['Electronics', 'Clothing', 'Home Decor', 'Toys', 'Books']) for _ in range(365 * len(sku_list))]
}

# Create a DataFrame
sales_data = pd.DataFrame(data)

# Display the first few rows of the sales data with attributes
print(sales_data.tail())


           Date     SKU  Sales  Price  Inventory    Category
1820 2023-12-28  SKU005   3255  15.96        274  Home Decor
1821 2023-06-20  SKU002   6281   9.71        170       Books
1822 2023-01-14  SKU003   2782   9.29        769  Home Decor
1823 2023-12-12  SKU002   3399  28.44        759       Books
1824 2023-12-21  SKU004   5385  34.95        782    Clothing


In [16]:
# One-Class SVM
from sklearn.svm import OneClassSVM

# Create a One-Class SVM model
clf = OneClassSVM(nu=0.05, kernel="rbf", gamma=0.1)

# Fit the model to the sales data
clf.fit(sales_data[['Sales']])

# Predict anomalies
anomalies = clf.predict(sales_data[['Sales']])



In [17]:
anomalies

array([-1,  1,  1, ...,  1,  1, -1])

In [18]:
sum(anomalies == -1)

896

In [19]:
len(sales_data)

1825

In [20]:
# DBSCAN

from sklearn.cluster import DBSCAN

# Create a DBSCAN model
dbscan = DBSCAN(eps=1000, min_samples=5)

# Fit the model to the sales data
labels = dbscan.fit_predict(sales_data[['Sales']])

# Label -1 represents outliers/anomalies
anomalies = labels == -1


In [23]:
sum(anomalies)

0

In [24]:
# K-Means Clustering

from sklearn.cluster import KMeans

# Create a K-Means model with a suitable number of clusters
kmeans = KMeans(n_clusters=5)

# Fit the model to the sales data
sales_data['Cluster'] = kmeans.fit_predict(sales_data[['Sales']])

# Anomalies are data points that don't belong to any cluster (cluster -1)
anomalies = sales_data['Cluster'] == -1


  super()._check_params_vs_input(X, default_n_init=10)


In [25]:
sum(anomalies)

0

In [11]:
# RCF

from sklearn.ensemble import IsolationForest

# Create an Isolation Forest model
clf_iforest = IsolationForest(contamination=0.05, random_state=42)

# Fit the model to the sales data
X = np.array(sales_data['Sales']).reshape(-1, 1)
clf_iforest.fit(X)

# Predict anomalies
outliers_iforest = clf_iforest.predict(X)

# Convert the Isolation Forest output to Boolean values
anomalies_iforest = outliers_iforest == -1


In [12]:
anomalies_iforest

array([False, False,  True, ..., False, False, False])

In [13]:
sum(anomalies_iforest)

92

In [15]:
# LSTM
# For LSTM-based anomaly detection, you would typically need a time series dataset with sequences. 
# Here's a basic example of using LSTM for sequence-based anomaly detection:
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

# Create sequences of sales data
seq_length = 10
sequences = [np.array(sales_data['Sales'][i:i+seq_length]) for i in range(len(sales_data) - seq_length + 1)]

# Reshape data for LSTM
X = np.array(sequences)

# Create an LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X, X, epochs=50, batch_size=32, verbose=0)

# Calculate reconstruction errors
predictions = model.predict(X)
# Calculate reconstruction errors
reconstruction_errors = np.mean(np.square(X - predictions), axis=1)

# Define a threshold for anomaly detection
threshold = np.percentile(reconstruction_errors, 95)
anomalies_lstm = reconstruction_errors > threshold




In [16]:
anomalies_lstm

array([False, False, False, ..., False, False, False])

In [17]:
sum(anomalies_lstm)

91

In [7]:
# GMM-PCA:

from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

# Apply PCA for dimensionality reduction
pca = PCA(n_components=1)
X_pca = pca.fit_transform(sales_data[['Sales']])

# Create a Gaussian Mixture Model
gmm = GaussianMixture(n_components=2, covariance_type='full')

# Fit the model to the reduced data
gmm.fit(X_pca)

# Predict anomalies using Mahalanobis distance
mahal = gmm.score_samples(X_pca)
threshold = np.percentile(mahal, 5)
anomalies_gmm_pca = mahal < threshold


In [8]:
anomalies_gmm_pca

array([False, False,  True, ..., False, False, False])

In [9]:
sum(anomalies_gmm_pca)

92

In [18]:
from keras.models import Model
from keras.layers import Input, Dense

# Create sequences of sales data
seq_length = 10
sequences = [np.array(sales_data['Sales'][i:i+seq_length]) for i in range(len(sales_data) - seq_length + 1)]

# Reshape data for Autoencoder
X = np.array(sequences)  # Shape should be (number of sequences, seq_length)

# Create an Autoencoder model
input_layer = Input(shape=(seq_length,))
encoded = Dense(10, activation='relu')(input_layer)
decoded = Dense(seq_length, activation='linear')(encoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Fit the model to the sales data
autoencoder.fit(X, X, epochs=50, batch_size=32, verbose=0)

# Calculate reconstruction errors
predictions = autoencoder.predict(X)
reconstruction_errors_autoencoder = np.mean(np.square(X - predictions), axis=1)

# Define a threshold for anomaly detection
threshold = np.percentile(reconstruction_errors_autoencoder, 95)
anomalies_autoencoder = reconstruction_errors_autoencoder > threshold




In [19]:
anomalies_autoencoder

array([False,  True, False, ..., False, False, False])

In [20]:
sum(anomalies_autoencoder)

91