**Importing required libraries**

umap-learn for dimensionality reduction using UMAP

numpy for numerical operations

pandas for data manipulation

scikit-learn (sklearn) for feature scaling, PCA, and KMeans clustering

plotly for interactive visualizations

In [13]:
pip install umap-learn



In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.graph_objs as go
import plotly.figure_factory as ff
import umap

In [9]:
# Load the beer dataset and print basic information to understand the data structure
dataset = pd.read_csv("BeerProfiles.csv")
print(dataset.head())
print(dataset.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
dataset.info()
print(dataset.describe())

# Select the features for dimensionality reduction: every column except the
# text identifiers 'Name', 'Style' and 'Brewery'.
X = dataset.drop(columns=['Name', 'Style', 'Brewery'])
print(type(X))
print(X.shape)

                           Name    Style  \
0                         Amber  Altbier   
1                    Double Bag  Altbier   
2                Long Trail Ale  Altbier   
3                  Doppelsticke  Altbier   
4  Sleigh'r Dark Doüble Alt Ale  Altbier   

                                            Brewery  ABV  Astringency  Body  \
0                               Alaskan Brewing Co.  5.3           13    32   
1                            Long Trail Brewing Co.  7.2           12    57   
2                            Long Trail Brewing Co.  5.0           14    37   
3  Uerige Obergärige Hausbrauerei GmbH / Zum Uerige  8.5           13    55   
4                           Ninkasi Brewing Company  7.2           25    51   

   Alcohol  Bitter  Sweet  Sour  Salty  Fruits  Hoppy  Spices  Malty  
0        9      47     74    33      0      33     57       8    111  
1       18      33     55    16      0      24     35      12     84  
2        6      42     43    11      0      10 

In [10]:
# Standardize the features (StandardScaler with default settings) so that every
# column contributes on the same scale.
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

# Baseline 2-D view: fit a two-component PCA on the standardized features and
# project the same data onto those components.
pca = PCA(n_components=2)
x_pca = pca.fit(X_scaled).transform(X_scaled)

# Report how much of the variance the two components retain.
variance_ratios = pca.explained_variance_ratio_
print("Variance explained by each of the n_components: ", variance_ratios)
print("Total variance explained by the n_components: ", sum(variance_ratios))

Variance explained by each of the n_components:  [0.25423708 0.20845063]
Total variance explained by the n_components:  0.4626877093797832


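Since the first two principal components capture only about 46 percent of the total variance, which is less than 75 percent, PCA is not a suitable technique for visualizing this dataset in two dimensions, so UMAP is used instead.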

In [11]:
# Embed the standardized features into two dimensions with UMAP.
# UMAP is stochastic, so random_state is fixed to make the embedding
# reproducible after a kernel restart.
# NOTE(review): per the umap-learn docs a fixed seed makes UMAP run
# single-threaded; drop it only if speed matters more than repeatability.
u = umap.UMAP(n_components=2, n_neighbors=50, min_dist=0.1, random_state=42)
x_umap = u.fit_transform(X_scaled)

# One hover label per point showing the beer style.
styles = list(dataset['Style'])

# The previous marker settings `color=None, colorscale='Rainbow'` were removed:
# a colorscale only takes effect when marker.color is a numeric array, so with
# color=None they did nothing.
data = [go.Scatter(x=x_umap[:, 0], y=x_umap[:, 1], mode='markers',
                   marker=dict(opacity=0.5),
                   text=[f'Style: {a}' for a in styles],
                   hoverinfo='text')]

layout = go.Layout(title='UMAP Dimensionality Reduction', width=700, height=700,
                   xaxis=dict(title='First Dimension'),
                   yaxis=dict(title='Second Dimension'))
fig = go.Figure(data=data, layout=layout)
fig.show()

In [12]:
# Label the points of the 2-D UMAP embedding with KMeans (six clusters).
# random_state fixes the centroid initialisation so the labels written to the
# CSV below do not change from run to run; n_init is given explicitly because
# its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
kmeans.fit(x_umap)

# Colour each point by its cluster label and show style + label on hover.
labels = list(kmeans.labels_)
data = [go.Scatter(x=x_umap[:, 0], y=x_umap[:, 1], mode='markers',
                   marker=dict(color=kmeans.labels_, colorscale='Rainbow', opacity=0.5),
                   text=[f'Style: {a}<br>Label: {b}' for a, b in zip(styles, labels)],
                   hoverinfo='text')]

layout = go.Layout(title='UMAP Dimensionality Reduction', width=700, height=700,
                   xaxis=dict(title='First Dimension'),
                   yaxis=dict(title='Second Dimension'))
fig = go.Figure(data=data, layout=layout)
fig.show()

# Saving the clustered data into a new CSV for further analysis
# This can be useful for detailed exploration or external use
dataset['Label'] = kmeans.labels_
dataset.to_csv("ClusteredBeerProfiles.csv", index=False)
print(dataset['Label'].value_counts())





Label
5    740
2    616
0    576
3    562
1    490
4    213
Name: count, dtype: int64
