## EECS4404 Project 2 - Wholesale Customer
#### Taswar Karim 
#### Student ID: 215095078

Importing Required packages

In [None]:
#for data handling
import pandas as pd
import numpy as np

#for data visualization in the EDA step
import matplotlib.pyplot as plt
import seaborn as sns

#for data preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

#for implementing RFECV
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

#for implementing K-means model
from sklearn.cluster import KMeans

#for implementing PCA
from sklearn.decomposition import PCA

#for XGBOOST
import xgboost

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Silence all library warnings so that cell outputs stay readable.
# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by
# scikit-learn / seaborn / xgboost; consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# Apply seaborn's default theme to every matplotlib figure created below.
# set_theme() is the preferred spelling; sns.set() is only an alias for it
# and is documented as a candidate for removal.
sns.set_theme()




In [None]:
# Read the wholesale-customers CSV into a DataFrame and preview the first rows.
csv_path = 'Wholesale customers data .csv'
df = pd.read_csv(csv_path)
df.head()


data loaded to pandas dataframe

### Exploratory Data Analysis (EDA) ###

In [None]:
# Display the (number of rows, number of columns) tuple of the loaded data.
df.shape

Dataset contains 440 rows and 8 columns

### Checking datatypes ###

In [None]:
# Display the dtype of every column to confirm that all of them are numeric.
df.dtypes

All columns are numeric

### Statistical Description of data ###

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

The table shows the statistical description of each column, including the mean, standard deviation, minimum, maximum and quartile values.

### Checking missing data ###

In [None]:
# Count the missing values in each column (isna is the same method as isnull).
df.isna().sum()

No missing datapoint in the dataset

#### Exploring 'Channel' Column with countplot

In [None]:
# Bar chart of how many customers fall into each Channel value.
sns.countplot(data=df, x='Channel');

#### Exploring 'Region' Column with countplot

In [None]:
# Bar chart of how many customers fall into each Region value.
sns.countplot(data=df, x='Region');

#### Exploring 'Fresh' Column with Histogram

In [None]:
# 30-bin histogram of the 'Fresh' column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="Fresh", bins=30, ax=ax)
plt.show()

#### Exploring 'Milk' Column with Histogram

In [None]:
# 30-bin histogram of the 'Milk' column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="Milk", bins=30, ax=ax)
plt.show()

#### Exploring 'Grocery' Column with Histogram

In [None]:
# 30-bin histogram of the 'Grocery' column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="Grocery", bins=30, ax=ax)
plt.show()

#### Exploring 'Frozen' Column with Histogram

In [None]:
# 30-bin histogram of the 'Frozen' column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="Frozen", bins=30, ax=ax)
plt.show()

#### Exploring 'Detergents_Paper' Column with Histogram

In [None]:
# 30-bin histogram of the 'Detergents_Paper' column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="Detergents_Paper", bins=30, ax=ax)
plt.show()

#### Exploring 'Delicassen' Column with Histogram

In [None]:
# 30-bin histogram of the 'Delicassen' column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="Delicassen", bins=30, ax=ax)
plt.show()

In [None]:
# Pairwise correlation between all columns (df.corr() with its default
# settings), drawn as an annotated heatmap with one significant digit per cell.
cor = df.corr()
fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(cor, annot=True, fmt='.1g', ax=ax);

The heatmap shows the correlation of each column with every other column. A value close to 1 indicates a strong positive correlation, a value close to -1 indicates a strong negative correlation, and a value close to 0 indicates little or no linear correlation.

## Feature Scaling

#### Applying MinMaxScaling

In [None]:
# Min-max scale every column of df and wrap the result back into a DataFrame
# that carries the original column labels.
# Note: the whole frame is passed in, so 'Channel' and 'Region' are scaled too.
minmax = MinMaxScaler()
df_minmax = pd.DataFrame(minmax.fit_transform(df), columns=df.columns.tolist())

#### Applying Standard Scaling

In [None]:
# Standardize every column of df and wrap the result back into a DataFrame
# that carries the original column labels.
# Note: the whole frame is passed in, so 'Channel' and 'Region' are scaled too.
standard = StandardScaler()
df_standard = pd.DataFrame(standard.fit_transform(df), columns=df.columns.tolist())

#### Comparing both scaled methods

In [None]:
# Side-by-side histograms of 'Fresh': min-max scaled (left) vs. standardized (right).
f, axes = plt.subplots(1, 2, figsize=(20, 6))

for panel, scaled_df in zip(axes, (df_minmax, df_standard)):
    sns.histplot(data=scaled_df, x="Fresh", bins=30, ax=panel)
plt.show()

The shape of the distribution is the same under both the MinMax and the Standard scaler, because both are linear rescalings of the data; only the range of the axis differs. Choosing either one will not make much difference. However, I have chosen the MinMax scaler because it maps every feature into the range 0–1.

### Separating Dependent and Independent Variables

In [None]:
# Target: the 'Channel' column; features: every remaining column.
# Both come from the min-max scaled frame, so Y holds the scaled labels
# (presumably 0.0 / 1.0 if Channel has two distinct values; verify against the data).
Y = df_minmax['Channel']
X = df_minmax.drop(columns=['Channel'])

## Finding the Optimal Number of Features using RFECV

In [None]:
# Selecting the classifier: a linear-kernel SVM, used by RFECV as the
# estimator whose fitted model ranks the features.
svc = SVC(kernel="linear")

# Applying RFECV: drop one feature per step and score each feature subset
# with 2-fold stratified cross-validation on accuracy.
min_features_to_select = 1  # Minimum number of features to consider
rfecv = RFECV(
    estimator=svc,
    step=1,
    cv=StratifiedKFold(2),
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
)

# Fitting the data
rfecv.fit(X, Y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Mean cross-validation score for each number of selected features.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# read `cv_results_` when it exists and fall back only on older releases.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure(figsize=(15,6))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
plt.plot(
    range(min_features_to_select, len(cv_scores) + min_features_to_select),
    cv_scores,
)
plt.show()

The optimal number of features according to the RFECV method is 4. 

## K-means Clustering

#### KMeans Clustering for K=2 to K=15

In [None]:
# Fit K-means for every k from 2 to 15 inclusive and record [k, inertia]
# pairs for the elbow plot. The upper bound of range() is exclusive, so 16
# is needed to include k=15.
cost = []
for k in range(2, 16):
    # n_init is pinned because its default changed from 10 to 'auto' in
    # scikit-learn 1.4; together with random_state this keeps the inertia
    # values reproducible across library versions.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=0).fit(df_minmax)
    cost.append([k, kmeanModel.inertia_])

#### Elbow Method

In [None]:
# Elbow plot: K-means inertia (cost) against the number of clusters k.
plt.figure(figsize=(15, 6))
# set_context changes global plot scaling, so it also affects later figures.
sns.set_context('poster')
elbow_df = pd.DataFrame(cost)  # column 0 = k, column 1 = inertia
plt.plot(elbow_df[0], elbow_df[1])
plt.title('The Elbow Method showing the optimal k')
plt.xlabel('k')
plt.ylabel('Cost')
plt.show()

The curve bends (the "elbow") at around k=4, so k=4 can be selected as the optimal number of clusters.

## Implementing PCA

#### Applying PCA with Components=2 and components=4

In [None]:
# Project the min-max scaled data onto its first 2 principal components.
# pca_2 is the fitted model; PCA_2 holds the transformed samples.
pca_2 = PCA(n_components=2)
PCA_2 = pca_2.fit_transform(df_minmax)

# Same projection with 4 principal components.
# pca_4 is the fitted model; PCA_4 holds the transformed samples.
pca_4 = PCA(n_components=4)
PCA_4 = pca_4.fit_transform(df_minmax) 

#### Explaining Variance

In [None]:
# Total share of variance captured by each fitted PCA model, rounded to 5 decimals.
for n_kept, fitted_pca in ((2, pca_2), (4, pca_4)):
    captured = fitted_pca.explained_variance_ratio_.sum()
    print('%d Component Variance:' % n_kept, round(captured, 5))

About 0.89 of the total variance is explained by the first 2 components, and about 0.97 by the first 4 components.

#### Visualizing the 2 components extracted

In [None]:
# Scatter plot of the samples in the space of the two extracted components.
principalDf = pd.DataFrame(PCA_2, columns=['principal component 1', 'principal component 2'])
fig, ax = plt.subplots(figsize=(15, 6))
# NOTE(review): component 2 is on the x-axis and component 1 on the y-axis;
# confirm this orientation is intended.
sns.scatterplot(data=principalDf, x="principal component 2", y="principal component 1", ax=ax)
plt.show()

## Implementing XGBoost

In [None]:
# Evaluate an XGBoost classifier on (X, Y) with 5-fold stratified
# cross-validation and report the mean and spread of the fold scores.
kfold = StratifiedKFold(n_splits=5)
model = xgboost.XGBClassifier(eval_metric='logloss')

results = cross_val_score(model, X, Y, cv=kfold)

mean_pct = results.mean() * 100
std_pct = results.std() * 100
print("Accuracy: %.2f%% with Standard Deviation of %.2f%%" % (mean_pct, std_pct))

The cross-validated accuracy of XGBoost is 90.23%, which is very good.