In [2]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m903.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=aee675a78cffeda555624443124d7ebe120f287bc518e0cff94c72cda2f8eef2
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np 
import pandas as pd 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from matplotlib.pyplot import style
style.use("ggplot")
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing

In [4]:
data = pd.read_excel('/kaggle/input/basket-analysis/Basket Analysis - Data.xlsx')
data.head()

Unnamed: 0,Transaction Date,Customer ID,Product Description
0,2014-01-01,1249,citrus fruit
1,2014-01-01,1249,coffee
2,2014-01-01,1249,italian sausage
3,2014-01-01,1249,sausage
4,2014-01-01,1381,curd


In [5]:
data["Customer ID"].nunique(),data["Customer ID"].unique()

(3898, array([1249, 1381, 1440, ..., 4755, 1963, 4565]))

In [6]:
data["Product Description"].nunique()

170

In [7]:
ID=data["Customer ID"].unique()
ID.tolist()
ID.sort()

In [8]:
item=data.groupby('Customer ID')['Product Description'].apply(list)
items=[]
for i in range(3898):
    items.append(item.iloc[i])
basket=pd.DataFrame({"item":ID,"products":items})
basket['products'] = basket['products'].astype(str)
regex = r'[\[\]/,]'
basket['products'] = basket['products'].apply(lambda x: re.sub(regex, '', x))
basket.head(2)

Unnamed: 0,item,products
0,1000,'pastry' 'salty snack' 'small milk' 'med milk'...
1,1001,'rollsbuns' 'sausage' 'small milk' 'med milk' ...


In [9]:
#concatenate multi-word product names together
basket['products'] = basket['products'].apply(lambda x: re.sub(r'(\w+)\s(\w+)', r'\1_\2', x))
basket

Unnamed: 0,item,products
0,1000,'pastry' 'salty_snack' 'small_milk' 'med_milk'...
1,1001,'rollsbuns' 'sausage' 'small_milk' 'med_milk' ...
2,1002,'frozen_vegetables' 'other_vegetables' 'butter...
3,1003,'dental_care' 'frozen_meals' 'sauces' 'rollsbu...
4,1004,'med_milk' 'pip_fruit' 'tropical_fruit' 'cling...
...,...,...
3893,4996,'salty_snack' 'tropical_fruit' 'bottled_beer' ...
3894,4997,'canned_beer' 'italian_sausage' 'large_milk' '...
3895,4998,'curd' 'rollsbuns'
3896,4999,'herbs' 'newspapers' 'semi-finished_bread' 'de...


In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(data):
    preprocessed_data = []
    for products in data:
        products = products.split()
        products = [lemmatizer.lemmatize(p) for p in products if p not in stop_words]
        preprocessed_data.append(" ".join(products))
    return preprocessed_data
    
basket['products'] = preprocess(basket['products'])


****

# Modeling

In [11]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(basket['products'])

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/122 [00:00<?, ?it/s]

In [12]:
embeddings.shape

(3898, 384)

In [14]:
# Normalize the embeddings to unit length
embeddings = embeddings /  np.linalg.norm(embeddings, axis=1, keepdims=True)

In [71]:
# Perform AgglomerativeClustering 
agg =AgglomerativeClustering(n_clusters=None, distance_threshold=5.5)
agg.fit(embeddings)

silhouette_avg = silhouette_score(embeddings, agg.labels_)
print("For n_clusters =", len(np.unique(agg.labels_)), "The average silhouette_score is :", silhouette_avg,"\n\n")

unique_labels = np.unique(agg.labels_)
prod_cluster={}
for i in unique_labels:
    indices = np.where(agg.labels_ == i)[0]
    prod_cluster[i+1]=np.array(basket["products"])[indices].tolist()
#     print("Cluster {}: {}".format(i+1, ', '.join(np.array(basket["products"])[indices].tolist())))
#     print("\n\n")


For n_clusters = 4 The average silhouette_score is : 0.010999403 




In [76]:
# # Reduce the dimensionality of the data to 2D
# pca = PCA(n_components=2)
# pca_result = pca.fit_transform(embeddings)

# # Plot the results
# plt.figure(figsize=(12,8))

# for i in unique_labels:
#     indices = np.where(agg.labels_ == i)[0]
#     plt.scatter(basket.iloc[indices,"items"], basket.iloc[indices,"items"], label=f'Cluster {i+1}')
# plt.legend()
# plt.show()


In [None]:

# # Calculate pairwise distances
# distance_matrix = pairwise_distances(embeddings)

# # Print the top 10 nearest products for each product
# for i in range(distance_matrix.shape[0]):
#     nearest_indices = np.argsort(distance_matrix[i])[:10]
#     print("Top 10 nearest products for product", i+1)
#     for index in nearest_indices:
#         print(basket.iloc[index]['products'])


In [None]:
# def jaccard_similarity(matrix):
#     # Compute the Jaccard similarity between all pairs of products
#     n_products = matrix.shape[1]
#     jaccard_similarities = np.zeros((n_products, n_products))
#     for i in range(n_products):
#         for j in range(i, n_products):
#             intersection = np.sum(matrix[:, i] & matrix[:, j])
#             union = np.sum(matrix[:, i] | matrix[:, j])
#             jaccard_similarities[i, j] = intersection / union
#             jaccard_similarities[j, i] = jaccard_similarities[i, j]
#     return jaccard_similarities

In [None]:
# # Vectorize the data
# tfidf = TfidfVectorizer()
# tfidf_matrix = tfidf.fit_transform(basket['products']).toarray()
# features = tfidf.get_feature_names_out()
# tfidf_data = pd.DataFrame(tfidf_matrix, columns=features)
# # Convert the tf-idf matrix to a binary matrix
# binary_matrix = tfidf_matrix > 0
# # Compute the Jaccard similarity between products
# products_similarities = jaccard_similarity(binary_matrix)

# from sklearn.cluster import AgglomerativeClustering
# # n_clusters = 2

# # Perform Hierarchical Clustering
# agg =AgglomerativeClustering(n_clusters=None, distance_threshold=2.1)
# agg.fit(products_similarities)

# unique_labels = np.unique(agg.labels_)
# print(len(unique_labels), "\n")

# silhouette_avg =silhouette_score(tfidf_matrix.T, agg.labels_)
# print("For n_clusters =", len(unique_labels), "The average silhouette_score is :", silhouette_avg,"\n\n")


# # for i in unique_labels:
# #     indices = np.where(agg.labels_ == i)[0]
# #     print("Cluster {}: {}".format(i+1, ', '.join(np.array(tfidf.get_feature_names())[indices].tolist())))
# #     print("\n\n")

In [None]:
# # Reduce the dimensionality of the data to 2D
# pca = PCA(n_components=2)
# pca_result = pca.fit_transform(tfidf_matrix.T)

# # Plot the results
# plt.figure(figsize=(12,8))

# for i in unique_labels:
#     indices = np.where(agg.labels_ == i)[0]
#     plt.scatter(pca_result[indices, 0], pca_result[indices, 1], label=f'Cluster {i+1}')
# #     for j, product in enumerate(np.array(tfidf.get_feature_names())[indices.tolist()]):
# #         plt.annotate(product, (pca_result[indices, 0][j], pca_result[indices, 1][j]))
# plt.legend()
# plt.show()

In [None]:
# import plotly.colors
# import plotly.graph_objs as go
# # Reduce the dimensionality of the data to 2D
# pca = PCA(n_components=3)
# pca_result = pca.fit_transform(tfidf_matrix.T)
# fig = plt.figure(figsize=(12,7))
# ax = fig.add_subplot(111, projection='3d')
# offset = 0.1
# data = []
# for i in unique_labels:
#     indices = np.where(agg.labels_ == i)[0]
#     trace = go.Scatter3d(x=pca_result[indices, 0], y=pca_result[indices, 1], z=pca_result[indices, 2], mode='markers',
#     marker=dict(size=10, color=plotly.colors.DEFAULT_PLOTLY_COLORS[i]),
#     text=[tfidf.get_feature_names()[i] for i in indices],
#     name=f'Cluster {i+1}')
#     data.append(trace)


# layout = go.Layout(title='3D Plot',scene=dict(xaxis=dict(title='PC1'), yaxis=dict(title='PC2'), zaxis=dict(title='PC3')))
# fig = go.Figure(data=data, layout=layout)
# fig.show()

****