# Wholesale customers Dataset

In [None]:
import numpy as np
import pandas as pd 
import seaborn as snb
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import xgboost 
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from xgboost import cv
from sklearn.preprocessing import  StandardScaler , MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA


: 

In [None]:
import warnings
# Silence FutureWarning noise coming from the libraries used below.
warnings.filterwarnings("ignore", category=FutureWarning)
# UserWarning stays visible. This cell used to register an "ignore" filter for
# UserWarning and then a "default" one right after it; filterwarnings() puts
# each new filter in front of the older ones, so the "default" entry always
# won and the "ignore" entry had no effect. Only the effective filter is kept.
warnings.filterwarnings("default", category=UserWarning)

## Exploratory Data Analysis 

In [None]:
# Read the wholesale-customers workbook from the notebook's working directory.
data_path = "Wholesale customers data.xlsx"
df = pd.read_excel(data_path)
print("data is read")

In [None]:
# (rows, columns) of the loaded DataFrame
df.shape

In [None]:
# Summary statistics for the numeric columns
df.describe()

In [None]:
# Column labels of the loaded DataFrame
df.columns

In [None]:
# First five rows, as a quick look at the raw values
df.head()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# How many rows fall under each Region value
df.Region.value_counts()

In [None]:
# How many rows fall under each Channel value (Channel is used as the
# classification target further down)
df.Channel.value_counts()

In [None]:
# First three describe() rows of the raw data, kept as a baseline to compare
# against the scaled versions computed below.
df.describe().iloc[:3]

## Feature Scaling to Normalize the data

### Implementing MinMax scaling

In [None]:
# Min-max scale every column of df. Despite its name, scaler_MinMax holds the
# transformed array, not the scaler object.
# NOTE(review): df is passed whole, so the Channel column is scaled as well —
# confirm that is intended, since Channel is later used as the target.
minmax_scaler = MinMaxScaler()
scaler_MinMax = minmax_scaler.fit_transform(df)
scaler_MinMax[:5]

In [None]:
# Re-wrap the min-max scaled array as a DataFrame, reusing df's column labels
scaled_df = pd.DataFrame(data=scaler_MinMax, columns=df.columns)

# Preview the first rows; scaled_df has the same column layout as df
print(scaled_df.head(5))

In [None]:
# First three describe() rows of the min-max scaled array
pd.DataFrame(scaler_MinMax).describe().head(3)

### Implementing StandardScaler

In [None]:
# Standardise every column of df. As with the min-max cell, scaler_Standard
# holds the transformed array rather than the scaler object.
standard_scaler = StandardScaler()
scaler_Standard = standard_scaler.fit_transform(df)
scaler_Standard[:5]

In [None]:
# First three describe() rows of the standardised array
pd.DataFrame(scaler_Standard).describe().head(3)

In [None]:
# Re-wrap the standardised array as a DataFrame, reusing df's column labels
scaled_Standard_df = pd.DataFrame(data=scaler_Standard, columns=df.columns)

# Preview the first rows; scaled_Standard_df has the same column layout as df
print(scaled_Standard_df.head(5))

In [None]:
# Record the scikit-learn version this notebook ran with
print(f'The scikit-learn version is {sklearn.__version__}.')

In [None]:
# Target: the Channel column of the min-max scaled frame.
# Features: every other scaled column.
y = scaled_df['Channel']
X = scaled_df.drop('Channel', axis=1)

## Elbow method for KMeans Clustering

In [None]:
# Candidate cluster counts to evaluate
K_values = range(2, 16)

# Inertia (within-cluster sum of squares) recorded for each candidate K
inertia = []

# Fit one KMeans model per candidate K and keep its inertia
for k in K_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

# Plot inertia against K so the elbow can also be judged by eye
plt.figure(figsize=(8, 6))
plt.plot(K_values, inertia, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.grid(True)
plt.show()

# Locate the elbow numerically.
# The previous rule, argmin of the first difference, selects the single
# largest drop in inertia. Inertia falls fastest at the start of the curve, so
# that rule nearly always returned the second candidate regardless of the
# data. The elbow is where the curve bends most sharply, i.e. where the second
# difference is largest.
diff = np.diff(inertia)       # change in inertia from one K to the next
curvature = np.diff(diff)     # change of that change (second difference)
# curvature[i] is centred on inertia[i + 1], hence the +1 when indexing K_values
k_optimal = K_values[int(np.argmax(curvature)) + 1]

print(f"The optimal number of clusters (K) is: {k_optimal}")

## Implementing PCA

In [None]:
# Project X onto 7 principal components.
# NOTE(review): this retains every dimension only if X has exactly 7 columns —
# confirm against X.shape; n_components=None would retain all components
# whatever the width of X.
pca = PCA(n_components=7)  # fixed component count (None would retain all components)
X_pca = pca.fit_transform(X)

In [None]:
# Dimensions of the feature matrix before PCA
X.shape

In [None]:
# Dimensions of the data after projection onto the principal components
X_pca.shape

In [None]:
# Compare how much variance 2 and 4 principal components capture.
# Both models are fitted on X (the features without the Channel label), like
# every other PCA in this notebook. Fitting on scaled_df, as this cell did
# before, mixed the classification target into the components.

# PCA with 2 components
pca_2c_model = PCA(n_components=2)
x_pca_2c = pca_2c_model.fit_transform(X)

# Explained variance for 2 components
explained_variance_2c = pca_2c_model.explained_variance_
explained_variance_ratio_2c = pca_2c_model.explained_variance_ratio_

print("Explained variance by the first 2 components:")
print(explained_variance_2c)
print("Explained variance ratio by the first 2 components:")
print(explained_variance_ratio_2c)

# PCA with 4 components
pca_4c_model = PCA(n_components=4)
x_pca_4c = pca_4c_model.fit_transform(X)

# Explained variance for 4 components
explained_variance_4c = pca_4c_model.explained_variance_
explained_variance_ratio_4c = pca_4c_model.explained_variance_ratio_

print("\nExplained variance by the first 4 components:")
print(explained_variance_4c)
print("Explained variance ratio by the first 4 components:")
print(explained_variance_ratio_4c)


In [None]:
# Cluster the PCA-projected data into three groups
optimal_K = 3
kmeans = KMeans(n_clusters=optimal_K, random_state=42)
cluster_ids = kmeans.fit(X_pca).labels_

# Scatter each cluster on the first two principal components
plt.figure(figsize=(10, 8))
for i in range(optimal_K):
    members = X_pca[cluster_ids == i]
    plt.scatter(members[:, 0], members[:, 1], label=f'Cluster {i + 1}')
plt.title('Clustering using the first 2 Principal Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid(True)
plt.show()

<li>The clusters appear to be well separated, suggesting that the clustering algorithm was able to identify distinct groups in the data. The clear separation of clusters indicates that the PCA and clustering were successful in grouping similar data points together.</li>

### Split data into separate training and test set

In [None]:
# Split X and y into training and testing sets: 30% of the rows are held out
# for testing, with a fixed random_state so the split is repeatable.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

## k-fold Cross Validation using XGBoost 

In [None]:
%pip install xgboost 

#### Train the XGBoost Classifier

In [None]:
# Hyper-parameter search for the XGBoost classifier
from sklearn.metrics import classification_report

# Candidate settings: 6 depths x 3 learning rates x 4 ensemble sizes
param_grid = dict(
    max_depth=[3, 4, 5, 6, 8, 10],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150, 200],
)

# Base estimator handed to the search
xgb_clf = XGBClassifier(objective='binary:logistic', alpha=10, random_state=42)

# Shuffled, stratified 5-fold splitter.
# NOTE: this rebinds the name `cv`, which the first cell imported from xgboost.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored by accuracy on the CV folds
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its cross-validated score
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)



#### Fit the XGBoost Classifier 

In [None]:
# Fit the base classifier on the training split.
# NOTE(review): this trains `xgb_clf` with the settings from its constructor,
# not the grid-search winner (`grid_search.best_estimator_`) — confirm that a
# baseline model is what is wanted here.
xgb_clf.fit(X_train, y_train) 

#### Make predictions with XGBoost Classifier 

In [None]:
# Predict labels for the held-out rows with the classifier fitted above

y_pred = xgb_clf.predict(X_test)

#### Check accuracy score and perform evaluation metrics using classification

In [None]:
# Report the test-set accuracy of the predictions, to four decimals

from sklearn.metrics import accuracy_score

print(f'XGBoost model accuracy score: {accuracy_score(y_test, y_pred):0.4f}')



## Evaluation Metrics

In [None]:
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Take the winning estimator from the grid search and score it on the test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Accuracy plus class-weighted precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

for metric_label, metric_value in (
    ("Test Accuracy:", accuracy),
    ("Test Precision:", precision),
    ("Test Recall:", recall),
    ("Test F1-Score:", f1),
):
    print(metric_label, metric_value)

# Persist the model with joblib.
# NOTE: joblib.dump writes a pickle, so the file is not HDF5 despite its
# ".h5" extension. The name is left unchanged because the inference cell
# further down opens the same file name.
model_filename = 'best_model.h5'
joblib.dump(best_model, model_filename)

print(f"Best model saved to {model_filename}")


In [None]:
# Predict through the fitted grid-search object and summarise per-class metrics
grid_predictions = grid_search.predict(X_test)

report = classification_report(y_test, grid_predictions)
print(report)

In [None]:
import h5py
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Define the file path to the saved model
h5_file_path = "best_model.h5"


# Load the model's coefficients and intercept from the .h5 file
def load_model():
    """Read the linear model's parameters from ``h5_file_path``.

    Returns:
        tuple: ``(coef_, intercept_)`` arrays read from the HDF5 datasets
        named ``'coef_'`` and ``'intercept_'``.

    Raises:
        OSError: if the file exists but is not a valid HDF5 file, or (raised
            by h5py) if it cannot be opened.
        KeyError: if either dataset is missing from the file.
    """
    import os

    # Fail early with an actionable message. The evaluation cell earlier in
    # this notebook writes "best_model.h5" with joblib.dump(), which produces
    # a pickle rather than HDF5; without this check h5py rejects such a file
    # with an opaque low-level error.
    if os.path.isfile(h5_file_path) and not h5py.is_hdf5(h5_file_path):
        raise OSError(
            f"{h5_file_path!r} is not an HDF5 file. A model written with "
            "joblib.dump() must be read back with joblib.load(), not h5py."
        )

    with h5py.File(h5_file_path, 'r') as h5f:
        coef_ = h5f['coef_'][:]
        intercept_ = h5f['intercept_'][:]
    return coef_, intercept_

# Function to predict the channel using loaded model parameters
def predict(input_data, coef_, intercept_):
    """Classify one sample with a binary linear model.

    Args:
        input_data: feature values for the sample; callers in this cell pass
            an array of shape (1, n_features). If several rows are given,
            only the first one is classified.
        coef_: model weights, shaped (n_features,) or (1, n_features).
        intercept_: model bias (scalar or one-element array).

    Returns:
        int: 0 (Horeca) or 1 (Retail). Always a plain Python int; previously
        the result was a NumPy scalar or a one-element array depending on the
        shape of ``coef_``.
    """
    # Raw linear score: x . w + b
    linear_combination = np.dot(np.asarray(input_data), np.asarray(coef_).T) + intercept_
    # Thresholding the score at 0 gives the same decision as
    # sigmoid(score) > 0.5, so the logistic function itself is never evaluated.
    return int(np.ravel(linear_combination)[0] > 0)

# Display up to 10 inferences with true and predicted labels
def display_inferences(X_test, y_test, coef_, intercept_):
    """Print true vs. predicted channel for the first samples of a test set.

    Args:
        X_test: 2-D NumPy array of feature rows.
        y_test: labels aligned with ``X_test`` (0 = Horeca, anything else is
            printed as Retail).
        coef_: model weights, forwarded to ``predict``.
        intercept_: model bias, forwarded to ``predict``.
    """
    # Cap at the data actually available: the fixed range(10) used before
    # raised IndexError whenever fewer than ten samples were supplied.
    n_samples = min(10, len(X_test), len(y_test))
    y_pred = [predict(X_test[i].reshape(1, -1), coef_, intercept_) for i in range(n_samples)]
    print("\nSample Predictions (True vs. Predicted):")
    for i in range(n_samples):
        true_label = 'Horeca' if y_test[i] == 0 else 'Retail'
        predicted_label = 'Horeca' if y_pred[i] == 0 else 'Retail'
        print(f"True: {true_label}, Predicted: {predicted_label}")

# Function for user to input spending values and get a prediction
def user_inference(coef_, intercept_):
    """Prompt for six annual spending figures and print the predicted channel.

    Args:
        coef_: model weights, forwarded to ``predict``.
        intercept_: model bias, forwarded to ``predict``.
    """
    print("\nEnter the annual spending on the following categories:")
    # Prompts are asked in this order, which is also the feature order of the
    # row handed to predict().
    prompts = (
        "FRESH (e.g., 30000): ",
        "MILK (e.g., 15000): ",
        "GROCERY (e.g., 20000): ",
        "FROZEN (e.g., 5000): ",
        "DETERGENTS_PAPER (e.g., 3000): ",
        "DELICATESSEN (e.g., 2000): ",
    )
    spending = [float(input(prompt)) for prompt in prompts]

    # Single-row feature matrix for the classifier
    user_data = np.array([spending])
    prediction = predict(user_data, coef_, intercept_)
    if prediction == 0:
        channel = "Horeca"
    else:
        channel = "Retail"
    print("Predicted Channel:", channel)

# Read the stored weights and bias
coef_, intercept_ = load_model()

# Hand-written demonstration samples (replace with real test data if needed)
sample_spending = {
    'Fresh': [30000, 20000, 10000, 5000, 25000, 15000, 30000, 20000, 10000, 5000],
    'Milk': [15000, 10000, 5000, 3000, 2000, 7000, 8000, 15000, 10000, 5000],
    'Grocery': [20000, 25000, 12000, 1000, 30000, 5000, 7000, 20000, 15000, 2000],
    'Frozen': [5000, 3000, 1000, 2000, 4000, 5000, 3000, 2000, 1000, 500],
    'Detergents_Paper': [3000, 5000, 2000, 4000, 3000, 1000, 2500, 2000, 3000, 4000],
    'Delicassen': [2000, 3000, 1000, 2500, 1500, 1000, 500, 2000, 3000, 1000],
}
test_data = pd.DataFrame(sample_spending)

# Demonstration labels for those samples (0: Horeca, 1: Retail)
test_labels = np.array([0, 1, 0, 1, 0, 1, 1, 0, 1, 0])

# Print true vs. predicted labels for the samples
sample_matrix = test_data.to_numpy()
display_inferences(sample_matrix, test_labels, coef_, intercept_)

# Ask the user for spending values and classify them
user_inference(coef_, intercept_)


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib

# Load dataset
# NOTE(review): this is an absolute, machine-specific path, while the first
# load in this notebook reads the same workbook by relative name — confirm
# which location is intended.
file_path = r"C:\Users\DELL\Desktop\iot-project\Wholesale customers data.xlsx"
df = pd.read_excel(file_path)
print("Data is read successfully.")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() emitted a stray "None" line after the report.
df.info()
print(df.head())

# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

# Min-max scale every numeric column and keep the result as a labelled frame
numerical_features = df.select_dtypes(include=[np.number]).columns
numeric_frame = df[numerical_features]
scaler_MinMax = MinMaxScaler().fit_transform(numeric_frame)
scaled_df = pd.DataFrame(data=scaler_MinMax, columns=numerical_features)

# Elbow method: record KMeans inertia for each candidate K on the scaled
# features, with the Channel label left out
K_values = range(2, 16)
cluster_features = scaled_df.drop(columns=["Channel"])
inertia = []
for k in K_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(cluster_features)
    inertia.append(kmeans.inertia_)

# Inertia against K; the bend in the curve suggests a suitable cluster count
plt.figure(figsize=(8, 6))
plt.plot(K_values, inertia, color='b', linestyle='-', marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.grid(True)
plt.show()

# Reduce the non-label features to two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(scaled_df.drop(columns=["Channel"]))

# Group the projected points into three clusters
optimal_K = 3
kmeans = KMeans(n_clusters=optimal_K, random_state=42)
labels = kmeans.fit_predict(X_pca)

# One scatter series per cluster
plt.figure(figsize=(10, 8))
for i in range(optimal_K):
    in_cluster = labels == i
    plt.scatter(X_pca[in_cluster, 0], X_pca[in_cluster, 1], label=f'Cluster {i + 1}')
plt.title('Clustering using the first 2 Principal Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid(True)
plt.show()

# Features: all scaled columns except the label
X = scaled_df.drop(columns=['Channel'])

# Target: recode the raw Channel values as binary classes (1 -> 0, 2 -> 1)
channel_to_class = {1: 0, 2: 1}
y = df['Channel'].map(channel_to_class)

# Verify the unique values in y
print("Unique values in target after mapping:", y.unique())

# 70/30 split, stratified on the target so both parts keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Candidate settings for the grid search
param_grid = dict(
    max_depth=[3, 4, 5, 6, 8, 10],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150, 200],
)

# Binary XGBoost classifier tuned with shuffled, stratified 5-fold CV on accuracy
xgb_clf = XGBClassifier(objective='binary:logistic', random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
)
grid_search.fit(X_train, y_train)

# Best model and parameters found by the grid search
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy from cross-validation:", grid_search.best_score_)

# Score the tuned model on the held-out split
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("XGBoost model accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

# Save the best model as a joblib pickle
# NOTE(review): absolute, machine-specific path — confirm the directory exists
# wherever this notebook is run.
model_path = r"C:\Users\DELL\Desktop\iot-project\best_model.joblib"
joblib.dump(best_model, model_path)
print(f"Best model saved to {model_path}")

# Load the model back from the same file and use that copy for inference
loaded_model = joblib.load(model_path)

# Display sample inferences
def display_inferences(X_test, y_test, model=None, n_samples=10):
    """Print true vs. predicted channel for the first rows of a test set.

    Class codes follow the target mapping used in this notebook
    (Channel 1 -> 0, Channel 2 -> 1), so 0 is printed as Horeca and 1 as
    Retail. The earlier version printed the two names the other way round.

    Args:
        X_test: feature rows; must support ``X_test[:k]`` row slicing.
        y_test: true class codes aligned with ``X_test``.
        model: fitted estimator exposing ``predict``. Defaults to the
            module-level ``loaded_model``.
        n_samples: maximum number of rows to show. Fewer are shown when the
            test set is smaller, instead of raising an index error.
    """
    if model is None:
        model = loaded_model

    # Positional view of the labels, whatever index y_test carries
    y_true = np.asarray(y_test)[:n_samples]
    y_pred_sample = model.predict(X_test[:len(y_true)])

    print("\nSample Predictions (True vs. Predicted):")
    for true_value, predicted_value in zip(y_true, y_pred_sample):
        true_label = 'Horeca' if true_value == 0 else 'Retail'
        predicted_label = 'Horeca' if predicted_value == 0 else 'Retail'
        print(f"True: {true_label}, Predicted: {predicted_label}")

# Print true vs. predicted channel for the first held-out rows
display_inferences(X_test, y_test)

