In [None]:
# Importing Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reading csv and inspecting first 5 rows

df = pd.read_csv('mcdonalds.csv')
df.head()

In [None]:
# inspecting column names

df.columns

In [None]:
# inspecting the shape of the dataframe

df.shape

In [None]:
# checking if the dataset contains any null values

df.isna().sum()

In [None]:
# inspecting the like column

df['Like'].value_counts()

In [None]:
# inspecting the gender column

df['Gender'].value_counts()

In [None]:
# inspecting the visit frequency column

df['VisitFrequency'].value_counts()

In [None]:
# Converting Yes/No to binary values (1 for 'Yes', 0 for anything else).
# Columns that are already numeric are skipped, so re-running this cell
# does not overwrite the encoded 1s with 0s.

bin_cols = df.columns[:11]
for col in bin_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = (df[col] == 'Yes').astype(int)

In [None]:
# inspecting after the tranformation from verbal to binary

df.head()

In [None]:
# share of rows equal to 1 for every transformed segmentation variable

for name in bin_cols:
    yes_share = (df[name] == 1).mean()
    print(name + ':', round(yes_share, 2))

In [None]:
# describe the dataframe

df.describe()

### EDA

In [None]:
# count of respondents per gender

fig, ax = plt.subplots()
sns.countplot(x = df['Gender'], ax = ax)
ax.set_title('Gender Distribution')
plt.show()

In [None]:
# histogram of age (25 bins) with a KDE curve on top

fig, ax = plt.subplots(figsize = (12, 6))
sns.histplot(x = df['Age'], bins = 25, kde = True, ax = ax)
ax.set_title('Age distribution')
plt.show()

In [None]:
df['Age'].nunique()

In [None]:
# the number of unique ages is small, so a count plot per age is readable as well

fig, ax = plt.subplots(figsize = (14, 6))
sns.countplot(x = df['Age'], ax = ax)
ax.set_title('Age distribution')
plt.show()

In [None]:
# count of respondents per value of the Like column

fig, ax = plt.subplots(figsize = (12, 6))
sns.countplot(x = df['Like'], ax = ax)
ax.set_title('Like Distribution')
plt.show()

In [None]:
# count of respondents per visit-frequency category

fig, ax = plt.subplots(figsize = (12, 6))
sns.countplot(x = df['VisitFrequency'], ax = ax)
ax.set_title('Visit Frequency Distribution')
plt.show()

In [None]:
# collect the (rounded) share of 1s of every segmentation variable for plotting

temp = [round((df[name] == 1).mean(), 2) for name in bin_cols]

In [None]:
# bar chart of the per-variable averages collected in `temp`
fig, ax = plt.subplots(figsize = (12, 6))
sns.barplot(x = bin_cols, y = temp, ax = ax)
ax.set_title('Average of the segmentation variable')
plt.show()

In [None]:
# turn the Like column into integers: the two verbal end points are first
# reduced to their numeric part, then every value is parsed with int()

like_end_points = {'I hate it!-5': '-5', 'I love it!+5': '+5'}
df['Like'] = df['Like'].replace(like_end_points).map(int)

In [None]:
# lets inspect the correlation between variables
# numeric_only=True restricts the correlation to numeric columns; without it
# pandas >= 2.0 raises on the string columns (Gender, VisitFrequency)

plt.figure(figsize = (14, 8))
sns.heatmap(df.corr(numeric_only = True), cmap = 'viridis', annot = True)
plt.show()

#### Some insights can be gained from the above heat map

1) Presence of some redundant variables like : **tasty and yummy, cheap and expensive, fattening and healthy**.

2) There is high **positive** correlation between **likeness and yummy, likeness and convenience** which makes sense.

3) There is a high **negative** correlation between likeness and disgusting, and also some negative correlation between **likeness and greasy** and between **likeness and age** (older respondents are less likely to like McDonald's).

### PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# project the first 11 columns (the segmentation variables) onto their principal components

x = df.iloc[:, :11].to_numpy()

# keep all 11 components so that nothing is discarded at this stage
pca = PCA(n_components=11)
pc_x = pca.fit_transform(x)

pc_x

In [None]:
# wrap the component scores in a dataframe (columns PC1 ... PC11) so that
# the pandas / seaborn tooling can be used for visualisation

pc_cols = [f'PC{idx}' for idx in range(1, 12)]

df_pc = pd.DataFrame(pc_x, columns=pc_cols)

In [None]:
# inspecting first 5 rows of the transformed segmentation variables

df_pc.head()

In [None]:
# summary statistics of the components; the `std` row shows the standard deviation of each component

df_pc.describe()

In [None]:
# inspecting the variance explained by each component
# (this is the absolute variance, not a proportion - the proportion is pca.explained_variance_ratio_)

pca.explained_variance_

In [None]:
# bar chart of the variance carried by each principal component
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x = df_pc.columns, y = pca.explained_variance_, ax = ax)
ax.set_title('Variance explained')
plt.show()

In [None]:
# bar chart of the fraction of total variance carried by each principal component

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x = df_pc.columns, y = pca.explained_variance_ratio_, ax = ax)
ax.set_title('Variance explained Ratio')
plt.show()

#### As we can see, about 50% of the variance is explained by the first two components alone, and about 63% by the first three components. This signals that we may have redundant variables, which is what we already saw in the correlation heatmap.

In [None]:
# scatter of the observations in the plane of the first two principal components

fig, ax = plt.subplots(figsize = (10 ,6))
sns.scatterplot(x = df_pc['PC1'], y = df_pc['PC2'], ax = ax)
ax.set_title('PC2 vs PC1')
plt.show()

In [None]:
# finding loadings

# PCA loadings are the coefficients of the linear combination of the original variables from which the principal components (PCs) are constructed.

# Loadings Matrix = V * (E)^0.5
# Where V is pca.components_.T and E is pca.explained_variance_ (so E^0.5 is np.sqrt(pca.explained_variance_))

loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

In [None]:
# inspecting shape of loadings

loadings.shape

In [None]:
# loadings as a dataframe: one row per original variable, one column per
# principal component, rounded to 3 decimals

df_loadings = pd.DataFrame(loadings, index = bin_cols, columns = df_pc.columns).round(3)

In [None]:
df_loadings

In [None]:
from bioinfokit.visuz import cluster

# plotting the 2D biplot
# labels must name the variables the loadings belong to, i.e. only the
# segmentation columns that went into the PCA (bin_cols) - not every column of df

cluster.biplot(cscore=df_pc.values, loadings=pca.components_, labels=bin_cols.values, var1=round(pca.explained_variance_ratio_[0]*100, 2),
    var2=round(pca.explained_variance_ratio_[1]*100, 2),show=True,dim=(10,5))

### Extracting Segments

Using Kmeans clustering

In [None]:
# using Kmeans clustering

from sklearn.cluster import KMeans

# we will try every value of k from 1 to 8 and record the inertia of each fit
# random_state and n_init are pinned so that the inertia curve is the same
# after every kernel restart
cluster_interia = []
n_clusters = []

for k in range(1, 9):
    kmeans = KMeans(n_clusters = k, n_init = 10, random_state = 42)
    kmeans.fit(x)
    cluster_interia.append(kmeans.inertia_)
    n_clusters.append(k)

In [None]:
# Now we plot the inertia for every number of clusters

plt.figure(figsize = (10, 6))
sns.barplot(x = n_clusters, y = cluster_interia)
plt.xlabel('number of clusters')
plt.ylabel('Cluster Inertia')
plt.title('Kmeans')
# render the figure explicitly so the cell does not end on a Text(...) repr
plt.show()

In [None]:
# From the plot above, dividing our market into 4 segments is a reasonable choice.

# Two-way clustering can also be used if the inertia fall-off is not that pronounced.

# Two-way clustering works as follows:
# we pick a high k and run k-means, then the representative of each cluster is fed to hierarchical clustering,
# in which the number of segments can be seen more clearly. This is done because hierarchical clustering is expensive
# for large datasets, so we replace all data points by their cluster representatives.

In [None]:
# using 4 segments
# random_state / n_init are pinned so that cluster ids (and therefore every
# per-cluster description further down) stay the same between runs

kmeans = KMeans(n_clusters = 4, n_init = 10, random_state = 42)
kmeans.fit(x)

In [None]:
# assign every observation to a k-means cluster and colour the PC1/PC2 scatter by it

pred = kmeans.predict(x)

fig, ax = plt.subplots(figsize = (10, 8))

segment_colors = ['green','orange','dodgerblue','red']
sns.scatterplot(x = df_pc['PC1'], y = df_pc['PC2'], hue = pred,
                palette = segment_colors, ax = ax)

plt.show()

**Using Hierarchical Clustering**

In [None]:
# hierarchical clustering: build the Ward linkage and draw its dendrogram

from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering

merg = linkage(x, method="ward")
fig, ax = plt.subplots(figsize=(25,10))
dendrogram(merg, leaf_rotation = 90, ax = ax)
ax.set_xlabel("data points")
ax.set_ylabel("euclidean distance")
plt.show()

In [None]:
# the dendrogram above suggests 3 segments (the three long vertical lines)

agglo_clustering = AgglomerativeClustering(n_clusters = 3, metric= "euclidean", linkage = "ward")
agglo_cluster_pred = agglo_clustering.fit(x).labels_

In [None]:
# PC1/PC2 scatter coloured by the agglomerative-clustering segment

fig, ax = plt.subplots(figsize = (10, 8))

agglo_colors = ['green', 'dodgerblue','red']
sns.scatterplot(x = df_pc['PC1'], y = df_pc['PC2'], hue = agglo_cluster_pred,
                palette = agglo_colors, ax = ax)

plt.show()

### Describing Segments

In [None]:
df['cluster'] = pred

In [None]:
df.head()

In [None]:
df['cluster'].value_counts()

In [None]:
sns.countplot(x = df['cluster'], hue = df['cluster'])
plt.show()

In [None]:
from statsmodels.graphics.mosaicplot import mosaic
from itertools import product

# contingency table: clusters as rows, genders as columns
crosstable = pd.crosstab(index = df['cluster'], columns = df['Gender'])
crosstable

In [None]:
# mosaic() opens its own figure unless it is handed an axes, which would leave
# an empty (14, 10) figure behind and ignore the requested size - so pass ax explicitly
fig, ax = plt.subplots(figsize = (14, 10))
mosaic(crosstable.stack(), ax = ax)
plt.show()

In [None]:
# contingency table: clusters as rows, Like values as columns
crosstable = pd.crosstab(index = df['cluster'], columns = df['Like'])
crosstable

In [None]:
# mosaic() opens its own figure unless it is handed an axes, which would leave
# an empty (14, 10) figure behind and ignore the requested size - so pass ax explicitly
fig, ax = plt.subplots(figsize = (14, 10))
mosaic(crosstable.stack(), ax = ax)
plt.show()

In [None]:
# respondents per gender, split by cluster
fig, ax = plt.subplots(figsize = (14, 7))
sns.countplot(x = df['Gender'], hue = df['cluster'], ax = ax)
ax.set_title('Distribution of Gender in diffrent clusters')
plt.show()

In [None]:
# respondents per Like value, split by cluster
fig, ax = plt.subplots(figsize = (14, 7))
sns.countplot(x = df['Like'], hue = df['cluster'], ax = ax)
ax.set_title('Distribution of Likeness in diffrent clusters')
plt.show()

In [None]:
# respondents per visit-frequency category, split by cluster
plt.figure(figsize = (14, 7))
sns.countplot(x = df['VisitFrequency'], hue = df['cluster'])
# the title names the variable actually plotted (it previously said "Likeness")
plt.title('Distribution of Visit Frequency in different clusters')
plt.show()

In [None]:
# age distribution within each cluster
fig, ax = plt.subplots(figsize = (10, 8))
sns.boxplot(data = df, x = "cluster", y = "Age", ax = ax)
plt.show()

### Selecting the target

In [None]:
# we need to give numerical encodings to visitFrequency column
# we will provide meaningful encoding to the columns (more value means more visit)


df['VisitFrequency'].unique()

In [None]:
# ordinal encoding of the visit frequency: a larger number means more frequent visits
# Series.map is used instead of replace so that the string -> int conversion
# does not rely on replace's deprecated implicit downcasting
visit_order = {
    'Never': 0,
    'Once a year': 1,
    'Every three months': 2,
    'Once a month': 3,
    'Once a week': 4,
    'More than once a week': 5,
}
df['frequency'] = df['VisitFrequency'].map(visit_order)

In [None]:
df.head()

In [None]:
# describing each cluster
# Every list holds one value per cluster, ordered by ascending cluster id:
#   like_by_cluster   - mean of the Like column
#   visit_by_cluster  - mean of the encoded visit frequency
#   female_by_cluster - percentage (0-100) of rows whose Gender is 'Female'
# The percentage (rather than the raw count) is what the segment evaluation
# plot is described as showing, and it keeps clusters of different sizes comparable.

segments = df.groupby('cluster')

like_by_cluster = segments['Like'].mean().tolist()
visit_by_cluster = segments['frequency'].mean().tolist()
female_by_cluster = (df['Gender'].eq('Female').groupby(df['cluster']).mean() * 100).tolist()
    

In [None]:
# Segment evaluation plot, kept very simple because the fast food data set
# offers only a handful of descriptor variables.
#   x-axis      : visit_by_cluster (mean encoded visit frequency per cluster)
#   y-axis      : like_by_cluster (mean Like score per cluster)
#   bubble size : proportional to female_by_cluster

fig, ax = plt.subplots(figsize = (8, 6))
bubble_sizes = np.array(female_by_cluster)*3
sns.scatterplot(x = visit_by_cluster, y = like_by_cluster, hue = [0,1,2,3], s = bubble_sizes,
                palette=['Red', 'Green', 'Blue', 'Brown'], ax = ax)

ax.set_xlabel('Visit')
ax.set_ylabel('Like')
ax.set_title('Segment Evaluation Plot')
plt.show()