#### Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#### Exploring and Understanding the Data

In [2]:
# Load the sales records into a DataFrame.
# The path is relative, so "Amazon Sales data.csv" must be in the notebook's
# working directory. The cells below all read from this frame.
dataframe_Amazon_Sales_data = pd.read_csv("Amazon Sales data.csv")

In [None]:
# Preview the first five rows (the default for head()) to confirm the file
# parsed into the expected columns.
dataframe_Amazon_Sales_data.head()

In [4]:
# Dimensions of the dataset as a (rows, columns) tuple

dataframe_Amazon_Sales_data.shape

# The recorded output for this cell is (100, 14): 100 rows and 14 columns.

(100, 14)

In [5]:
# Column dtypes as inferred by pandas when the CSV was read
data_types = dataframe_Amazon_Sales_data.dtypes

# Count of null entries in each column
missing_values = dataframe_Amazon_Sales_data.isna().sum()

# Show both summaries together as the cell output
missing_values, data_types


(Region            0
 Country           0
 Item Type         0
 Sales Channel     0
 Order Priority    0
 Order Date        0
 Order ID          0
 Ship Date         0
 Units Sold        0
 Unit Price        0
 Unit Cost         0
 Total Revenue     0
 Total Cost        0
 Total Profit      0
 dtype: int64,
 Region             object
 Country            object
 Item Type          object
 Sales Channel      object
 Order Priority     object
 Order Date         object
 Order ID            int64
 Ship Date          object
 Units Sold          int64
 Unit Price        float64
 Unit Cost         float64
 Total Revenue     float64
 Total Cost        float64
 Total Profit      float64
 dtype: object)

#### Sales by Product Category

In [None]:
# Total revenue per item type; as_index=False keeps 'Item Type' as a regular
# column so the result can be passed straight to seaborn.
sales_by_category = dataframe_Amazon_Sales_data.groupby('Item Type', as_index=False)['Total Revenue'].sum()

# Horizontal bar chart: one bar per item type, bar length = summed revenue.
# NOTE(review): seaborn >= 0.13 warns when `palette` is given without `hue`;
# the call is left as-is so it keeps working on older seaborn releases.
plt.figure(figsize=(14, 8))
sns.barplot(x='Total Revenue', y='Item Type', data=sales_by_category, palette='plasma')
plt.gca().set(
    title='Sales by Product Category',
    xlabel='Total Revenue',
    ylabel='Category',
)
plt.show()

#### Advanced Feature Engineering with Visualizations

In [None]:
# Rolling statistics and period-over-period growth are only meaningful on
# time-ordered data, and the CSV's row order is not guaranteed to be
# chronological. Work on the revenue series sorted by order date; a stable
# sort keeps same-day orders in their original relative order.
order_dates = pd.to_datetime(dataframe_Amazon_Sales_data['Order Date'])
chronological_rows = order_dates.sort_values(kind='stable').index
revenue_by_date = dataframe_Amazon_Sales_data.loc[chronological_rows, 'Total Revenue']

# Moving averages over the 7 and 30 most recent orders. These are row-count
# windows, not calendar-day windows. Assigning a Series aligns on the index,
# so every row receives the value computed at its chronological position.
dataframe_Amazon_Sales_data['MA_7'] = revenue_by_date.rolling(window=7).mean()
dataframe_Amazon_Sales_data['MA_30'] = revenue_by_date.rolling(window=30).mean()

# Growth rate: percentage change in 'Total Revenue' relative to the
# chronologically previous order.
dataframe_Amazon_Sales_data['Growth Rate'] = revenue_by_date.pct_change() * 100

# Date-indexed, time-ordered view used only for plotting, so the x-axis really
# is the order date that the axis labels claim.
revenue_trend = (
    dataframe_Amazon_Sales_data
    .loc[chronological_rows, ['Total Revenue', 'MA_7', 'MA_30', 'Growth Rate']]
    .set_index(order_dates.loc[chronological_rows])
)

# Create subplots: revenue with its moving averages on top, growth rate below
fig, ax = plt.subplots(2, 1, figsize=(14, 10))

# Plot the original total revenue and moving averages
revenue_trend[['Total Revenue', 'MA_7', 'MA_30']].plot(ax=ax[0])
ax[0].set_title('Total Revenue and Moving Averages')
ax[0].set_ylabel('Total Revenue')
ax[0].set_xlabel('Date')

# Plot the growth rate
revenue_trend['Growth Rate'].plot(ax=ax[1])
ax[1].set_title('Growth Rate of Total Revenue')
ax[1].set_ylabel('Growth Rate')
ax[1].set_xlabel('Date')

plt.tight_layout()
plt.show()

#### Anomaly Detection with Visualizations

In [None]:
from sklearn.ensemble import IsolationForest

# Numeric columns the detector is fitted on
anomaly_features = ['Units Sold', 'Unit Price', 'Total Revenue']

# Explicit copy: a column is added below, and writing into the result of a
# column selection can raise SettingWithCopyWarning on some pandas versions.
anomaly_data = dataframe_Amazon_Sales_data[anomaly_features].dropna().copy()

# Fit the Isolation Forest model. contamination=0.01 asks the model to flag
# roughly 1% of rows; fit_predict returns -1 for outliers and 1 for inliers.
iso_forest = IsolationForest(contamination=0.01, random_state=42)
anomaly_data['Anomaly'] = iso_forest.fit_predict(anomaly_data[anomaly_features])

# Order dates of the scored rows, so the x-axis is the date its label claims
# instead of the positional row index.
anomaly_dates = pd.to_datetime(dataframe_Amazon_Sales_data.loc[anomaly_data.index, 'Order Date'])

# Plot the anomalies. With the 'coolwarm' colormap the -1 (outlier) points are
# drawn at the blue end and the 1 (inlier) points at the red end.
plt.figure(figsize=(14, 7))
plt.scatter(anomaly_dates, anomaly_data['Total Revenue'], c=anomaly_data['Anomaly'], cmap='coolwarm', alpha=0.5)
plt.title('Anomaly Detection in Total Revenue')
plt.xlabel('Date')
plt.ylabel('Total Revenue')
plt.show()

#### Clustering Analysis with Visualizations

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Rows with complete values in the three numeric columns used for clustering
clustering_data = dataframe_Amazon_Sales_data[['Units Sold', 'Unit Price', 'Total Revenue']].dropna()

# Rescale every feature to zero mean and unit variance so that no single
# column dominates the Euclidean distances K-means relies on
scaler = StandardScaler().fit(clustering_data)
clustering_data_scaled = scaler.transform(clustering_data)

# Three-cluster K-means with a fixed seed; fit() returns the fitted estimator
kmeans = KMeans(n_clusters=3, random_state=42).fit(clustering_data_scaled)

# Write each row's cluster label back onto the main dataframe, matched by index
dataframe_Amazon_Sales_data.loc[clustering_data.index, 'Cluster'] = kmeans.labels_

# Scatter of the first two standardized features, coloured by cluster label
plt.figure(figsize=(14, 7))
plt.scatter(
    x=clustering_data_scaled[:, 0],
    y=clustering_data_scaled[:, 1],
    c=kmeans.labels_,
    cmap='viridis',
    alpha=0.5,
)
plt.gca().set(
    title='Clusters of Products',
    xlabel='Standardized Units Sold',
    ylabel='Standardized Unit Price',
)
plt.show()

#### Predictive Modeling with Visualizations

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Requires the 'MA_7', 'MA_30' and 'Growth Rate' columns created in the
# feature-engineering cell.

# Define features and target variable. Rows with any missing feature are
# dropped (the rolling and pct_change columns start with NaNs), and the target
# is selected with the same index so X and y stay aligned.
# NOTE(review): MA_7, MA_30 and Growth Rate are derived from 'Total Revenue'
# itself, with the current row inside each window, so they leak the target
# into the features. Consider shifting them by one row before modeling.
features = dataframe_Amazon_Sales_data[['Units Sold', 'Unit Price', 'MA_7', 'MA_30', 'Growth Rate']].dropna()
target = dataframe_Amazon_Sales_data.loc[features.index, 'Total Revenue']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit the XGBoost model; seeded like the other estimators in this notebook
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
predictions = xgb_model.predict(X_test)

# Evaluate the model. RMSE is computed as the square root of the MSE because
# the `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6; this form works on every version.
rmse = mean_squared_error(y_test, predictions) ** 0.5
print(f'RMSE: {rmse}')

# Plot actual vs predicted; the dashed diagonal marks perfect predictions
plt.figure(figsize=(14, 7))
plt.scatter(y_test, predictions, alpha=0.3)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=3)
plt.title('Actual vs Predicted Sales')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.show()

#### Seasonal Decomposition of Time Series

In [None]:
import statsmodels.api as sm

# Parse 'Order Date' and make it the index of the main dataframe.
# The guard keeps the cell re-runnable: after the first run 'Order Date' is the
# index and no longer a column, so repeating these steps would raise KeyError.
if 'Order Date' in dataframe_Amazon_Sales_data.columns:
    dataframe_Amazon_Sales_data['Order Date'] = pd.to_datetime(dataframe_Amazon_Sales_data['Order Date'])
    dataframe_Amazon_Sales_data.set_index('Order Date', inplace=True)

# Sum 'Total Revenue' per calendar month. 'MS' labels each month by its first
# day; it is used instead of 'M', which is deprecated since pandas 2.2, and the
# monthly totals are the same. Months without orders appear with a sum of 0.
monthly_revenue = dataframe_Amazon_Sales_data['Total Revenue'].resample('MS').sum()

# Decompose the monthly series into trend, seasonal and residual components.
# NOTE(review): the seasonal period is inferred from the monthly index
# (presumably 12) and needs at least two full cycles of data. Confirm the
# date range covers that.
decomposition = sm.tsa.seasonal_decompose(monthly_revenue, model='additive')
fig = decomposition.plot()
plt.show()