# İREM

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

In [None]:
# 2. Data Loading
# Prefer the cleaned dataset; fall back to the original file when the cleaned
# one is missing. If neither file exists, report it and stop.
try:
    df = pd.read_csv('cleaned_retail_data.csv')
    print("‚úÖ Cleaned dataset loaded successfully.")
except FileNotFoundError:
    try:
        df = pd.read_csv('retail_sales_dataset.csv')
        print("‚ö†Ô∏è Warning: Cleaned data not found. Loaded original dataset.")
    except FileNotFoundError:
        print("‚ùå Error: No dataset found!")
        # Re-raise instead of continuing: otherwise `df` stays undefined (or
        # stale from a previous run) and the problem only shows up later as
        # an unrelated-looking NameError or as results on the wrong data.
        raise

In [None]:
# 3. Data Preprocessing
# Map the Gender labels to integer codes and keep the fitted encoder around.
le_gender = LabelEncoder()
df['Gender_Code'] = le_gender.fit_transform(df['Gender'])

# When a Date column is present, parse it and derive the calendar month.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df['Month'] = df['Date'].dt.month

# Feature matrix (X) and target (y).
# Month is used as an extra feature only when that column is available.
features = ['Age', 'Gender_Code', 'Total Amount', 'Price per Unit', 'Quantity']
features += ['Month'] if 'Month' in df.columns else []

X, y = df[features], df['Product Category']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(
    f"\nTraining Set Size: {X_train.shape}",
    f"Testing Set Size: {X_test.shape}",
    sep="\n",
)

In [None]:
# 4. Model Training (Random Forest)
print("\nTraining the model...")
# 200 trees with a fixed seed; fit() hands back the fitted estimator itself,
# so construction and training can be expressed as a single statement.
rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)
print("‚úÖ Model training completed.")

In [None]:
# 5. Evaluation
# Score the trained model on the held-out test split.
y_pred = rf_model.predict(X_test)

# Calculate Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"\nüéØ Model Accuracy: {acc*100:.2f}%")

# Detailed Classification Report
print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix Visualization
plt.figure(figsize=(8, 6))
# Pin the matrix rows/columns to the model's own class order. Without
# `labels`, confusion_matrix only covers classes that occur in y_test or
# y_pred, so a class missing from both would make the matrix smaller than
# (and misaligned with) the rf_model.classes_ tick labels used below.
cm = confusion_matrix(y_test, y_pred, labels=rf_model.classes_)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=rf_model.classes_, yticklabels=rf_model.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.show()

In [None]:
# 6. Feature Importance Visualization
# Rank the features from most to least important according to the forest.
importances = rf_model.feature_importances_
indices = np.flip(np.argsort(importances))

# Horizontal bar chart: one bar per feature, highest importance at the top.
plt.figure(figsize=(10, 6))
sns.barplot(
    x=importances[indices],
    y=X.columns[indices],
    palette="viridis",
)
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.title("Feature Importance")
plt.show()