In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
from gensim.parsing.preprocessing import remove_stopwords
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
import nltk



# Load the raw CSV into a DataFrame (the path is relative to the notebook's working directory).
df = pd.read_csv('pricerunner_aggregate.csv')

In [2]:
# Replace the original header with the column names used throughout this notebook.
# Names are assigned positionally — assumes the CSV has exactly these seven columns in this order.
df.columns = ['ID', 'Title', 'VendorID', 'ClusterID', 'ClusterLabel', 'CategoryID', 'CategoryLabel']
# df.head()

In [3]:
# df.info() # no NULLs
def show_columns_values(df):
    """Print the unique-value count and the value frequencies of the inspected columns.

    The columns ID, Title, ClusterLabel and CategoryLabel are reported in that
    order; each report is followed by a two-line '#' divider.

    Args:
        df: DataFrame containing the four inspected columns.
    """
    divider = '\n############################################################### \n###############################################################\n'
    for column in ('ID', 'Title', 'ClusterLabel', 'CategoryLabel'):
        values = df[column]
        print(column + ', number of unique values ', values.nunique())
        print(values.value_counts())
        print(divider)

def show_unique_values_IDs_Labels(df):
    """Print the number of distinct values of each ID column next to its label column.

    One line pair is printed for ClusterID/ClusterLabel and one for
    CategoryID/CategoryLabel, so mismatching counts are easy to spot.

    Args:
        df: DataFrame containing the four ID/label columns.
    """
    column_pairs = (('ClusterID', 'ClusterLabel'), ('CategoryID', 'CategoryLabel'))
    for id_column, label_column in column_pairs:
        id_count = df[id_column].nunique()
        label_count = df[label_column].nunique()
        print(id_column + ': ', id_count, '\n' + label_column + ': ', label_count)

# show_columns_values(df)  # 35310 unique rows, 30992 unique titles, 13233 unique clusters, 10 categories

# show_unique_values_IDs_Labels(df)  # ClusterID and ClusterLabel are not one-to-one: there are more unique
# ClusterID values, but only ClusterLabel carries information we can use

# df[df['Title'] == 'washing machine']  # ClusterLabel seems more appropriate for our models than Title:
# it is better organized and, in this case, even more accurate

# Spot-check the labels: print the first 50 rows of each of the 35 consecutive 1000-row blocks.
for block_start in range(0, 35 * 1000, 1000):
    block_end = block_start + 50
    print(df['ClusterLabel'][block_start:block_end])

# We pick the ClusterLabel column as the data that will be clustered.

0     Apple iPhone 8 Plus 64GB
1     Apple iPhone 8 Plus 64GB
2     Apple iPhone 8 Plus 64GB
3     Apple iPhone 8 Plus 64GB
4     Apple iPhone 8 Plus 64GB
5     Apple iPhone 8 Plus 64GB
6     Apple iPhone 8 Plus 64GB
7     Apple iPhone 8 Plus 64GB
8     Apple iPhone 8 Plus 64GB
9     Apple iPhone 8 Plus 64GB
10    Apple iPhone 8 Plus 64GB
11    Apple iPhone 8 Plus 64GB
12    Apple iPhone 8 Plus 64GB
13    Apple iPhone 8 Plus 64GB
14    Apple iPhone 8 Plus 64GB
15    Apple iPhone 8 Plus 64GB
16    Apple iPhone 8 Plus 64GB
17    Apple iPhone 8 Plus 64GB
18    Apple iPhone 8 Plus 64GB
19    Apple iPhone 8 Plus 64GB
20    Apple iPhone 8 Plus 64GB
21    Apple iPhone 8 Plus 64GB
22    Apple iPhone 7 Plus 32GB
23    Apple iPhone 7 Plus 32GB
24    Apple iPhone 7 Plus 32GB
25    Apple iPhone 7 Plus 32GB
26    Apple iPhone 7 Plus 32GB
27    Apple iPhone 7 Plus 32GB
28    Apple iPhone 7 Plus 32GB
29    Apple iPhone 7 Plus 32GB
30    Apple iPhone 7 Plus 32GB
31    Apple iPhone 7 Plus 32GB
32    Ap

#### Usunięcie niepotrzebnych kolumn (zostawiamy tylko kolumnę ClusterLabel, na której będziemy przeprowadzali proces klasteryzacji)

In [4]:
# Keep only the ClusterLabel column. Selecting the column (instead of dropping the others
# in place) makes the cell idempotent: re-running it no longer raises KeyError for columns
# that were already removed. The copy gives later cells an independent frame to assign into.
df = df[['ClusterLabel']].copy()
df.head(20)

Unnamed: 0,ClusterLabel
0,Apple iPhone 8 Plus 64GB
1,Apple iPhone 8 Plus 64GB
2,Apple iPhone 8 Plus 64GB
3,Apple iPhone 8 Plus 64GB
4,Apple iPhone 8 Plus 64GB
5,Apple iPhone 8 Plus 64GB
6,Apple iPhone 8 Plus 64GB
7,Apple iPhone 8 Plus 64GB
8,Apple iPhone 8 Plus 64GB
9,Apple iPhone 8 Plus 64GB


##### Powyżej mogliśmy zbadać, że słowa są oddzielone spacją - nie pojawiają się przecinki, słowa 'and' oraz 'the' (nie musimy nic z tym robić)

#### Natomiast pojawiają się duże litery i odmiana pewnych słów - pozbywamy się tego problemu poniżej

In [5]:
# One stemmer is shared by all calls; building a new PorterStemmer for every sentence is wasted work.
_PORTER_STEMMER = PorterStemmer()


def stemSentence(sentence):
    """Return `sentence` with every token replaced by its Porter stem.

    The text is split with NLTK's `word_tokenize`, each token is stemmed, and the
    stems are joined back together with single spaces.

    Args:
        sentence: Text to normalise.

    Returns:
        str: The space-joined stems. In this notebook's output,
        'Apple iPhone 8 Plus 64GB' becomes 'appl iphon 8 plu 64gb'.
    """
    return ' '.join(_PORTER_STEMMER.stem(token) for token in word_tokenize(sentence))

# Stem every label; stemSentence is passed directly as the element-wise mapper.
df['ClusterLabel'] = df['ClusterLabel'].map(stemSentence)
df.head(100)

Unnamed: 0,ClusterLabel
0,appl iphon 8 plu 64gb
1,appl iphon 8 plu 64gb
2,appl iphon 8 plu 64gb
3,appl iphon 8 plu 64gb
4,appl iphon 8 plu 64gb
...,...
95,appl iphon x 64gb
96,appl iphon x 64gb
97,appl iphon x 64gb
98,appl iphon x 64gb


In [6]:
# 1. Preprocessing: the data is already clean — there are no nulls, and all words are meaningful and standardized

# 2. Feature extraction: Convert the preprocessed text data into numerical features that can be used by ML algorithms. Common approaches include:
# Bag-of-Words: Represent each product description as a vector where each dimension corresponds to a unique word in the corpus, and the value represents the frequency or presence of that word in the description.
# TF-IDF (Term Frequency-Inverse Document Frequency): Weigh the word frequencies by their inverse document frequency to highlight the importance of rare words in distinguishing products.

from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF; every other vectorizer setting is left at its default.
vectorizer = TfidfVectorizer(analyzer='word')

# X is the document-term matrix returned by the vectorizer: one row per label, one column per vocabulary term.
X = vectorizer.fit_transform(df['ClusterLabel'])

# Dense copy of X with the vocabulary terms as column names (used for inspection below).
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Show the stemmed labels and the first TF-IDF rows one after the other.
print(df.head())
print('---------------------------------------')
print(df_tfidf.head())

            ClusterLabel
0  appl iphon 8 plu 64gb
1  appl iphon 8 plu 64gb
2  appl iphon 8 plu 64gb
3  appl iphon 8 plu 64gb
4  appl iphon 8 plu 64gb
---------------------------------------
    00  001  00ghz   01   02   03  04010   05  05ghz  06ghz  ...  zwi71401wa  \
0  0.0  0.0    0.0  0.0  0.0  0.0    0.0  0.0    0.0    0.0  ...         0.0   
1  0.0  0.0    0.0  0.0  0.0  0.0    0.0  0.0    0.0    0.0  ...         0.0   
2  0.0  0.0    0.0  0.0  0.0  0.0    0.0  0.0    0.0    0.0  ...         0.0   
3  0.0  0.0    0.0  0.0  0.0  0.0    0.0  0.0    0.0    0.0  ...         0.0   
4  0.0  0.0    0.0  0.0  0.0  0.0    0.0  0.0    0.0    0.0  ...         0.0   

   zwj14591w  zwm496w  zwm696w  zwt71201wa  zwt7142wa  zwy61223ki  zwy61225wi  \
0        0.0      0.0      0.0         0.0        0.0         0.0         0.0   
1        0.0      0.0      0.0         0.0        0.0         0.0         0.0   
2        0.0      0.0      0.0         0.0        0.0         0.0         0.0   
3    

In [7]:
# elbow = KElbowVisualizer(KMeans(), k=np.arange(100,900,100))
# elbow.fit(X.toarray())
# elbow.show();

In [8]:

# 3. Dimensionality reduction (optional): If the feature space is high-dimensional, you might want to apply dimensionality reduction techniques like Principal Component Analysis (PCA) or t-SNE to reduce the number of features while preserving the most important information.

# 4. Clustering algorithm selection: Choose an appropriate clustering algorithm based on the nature of your data and requirements. Some common clustering algorithms include:
# K-means: Partition data into k clusters by minimizing the sum of squared distances between data points and their cluster centroids.
# DBSCAN: Clusters data based on density and identifies core samples and outliers.
# Hierarchical clustering: Builds a hierarchy of clusters by iteratively merging or splitting them.

# 5. Model training and clustering: Apply the selected clustering algorithm to your preprocessed and transformed text data. Fit the clustering model to your data and generate clusters based on the algorithm's rules.

# 6. Cluster evaluation (optional): Assess the quality of the generated clusters using internal or external validation metrics. However, keep in mind that clustering is an unsupervised task, so there are no ground truth labels for evaluation. Evaluation is often subjective and domain-specific.



In [9]:
# Preview the first rows of the dense TF-IDF frame.
df_tfidf.head(500)

Unnamed: 0,00,001,00ghz,01,02,03,04010,05,05ghz,06ghz,...,zwi71401wa,zwj14591w,zwm496w,zwm696w,zwt71201wa,zwt7142wa,zwy61223ki,zwy61225wi,zylo,zzv634w
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
def count_wcss_scores(X, k_max):
    """Fit k-means for k = 1..k_max and collect the WCSS of every fit.

    WCSS = within-cluster sum of squares, i.e. the sum of squared distances of the
    samples to their closest cluster centre. It is the quantity plotted on the
    y-axis of an elbow chart.

    Args:
        X: Feature matrix accepted by `KMeans.fit`.
        k_max: Largest number of clusters to try (inclusive).

    Returns:
        list: WCSS values for k = 1, 2, ..., k_max in that order; empty when k_max < 1.
    """
    scores = []
    for k in range(1, k_max + 1):
        kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
        # inertia_ already holds the WCSS of the fitted model, so the extra pass over X
        # that `score(X) * -1` required is no longer needed.
        scores.append(kmeans.inertia_)
    return scores

In [11]:
# from sklearn.cluster import KMeans
# import matplotlib.pyplot as plt


# wcss_vec = count_wcss_scores(df_tfidf, 30)
# x_ticks = list(range(1, len(wcss_vec) + 1))
# plt.plot(x_ticks, wcss_vec, 'bx-')
# plt.xlabel('k')
# plt.ylabel('Within-cluster sum of squares')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()

In [None]:
# Number of k-means clusters (set by hand; the elbow searches in this notebook are commented out).
N_CLUSTERS = 300

# A fixed seed makes the cluster ids reproducible across kernel restarts
# (same seed as count_wcss_scores).
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
cluster_ids = kmeans.fit_predict(X)

# Reuse the dense TF-IDF frame built earlier instead of densifying X a second time.
result = pd.concat([df['ClusterLabel'], df_tfidf], axis=1)
result['cluster'] = cluster_ids



In [None]:
# Preview the stemmed labels together with their TF-IDF features and assigned cluster ids.
result.head(500)

In [None]:
# Raise the row-display limit so that the long previews below are not truncated.
# NOTE: this is a global pandas option — it affects every later DataFrame display in this kernel session.
pd.set_option('display.max_rows', 100000)


In [None]:
# #Label each cluster with the word(s) that all of its product labels have in common
# clusters = result['cluster'].unique()
# labels = []
# for i in range(len(clusters)):
#     subset = result[result['cluster'] == clusters[i]]
#     words = ' '.join([x for x in np.where(subset.all()!=0,subset.columns,None) if x and x!='Name' and x!='cluster' and len(x.split()) == 1])
#     labels.append(words)
# labels_table = pd.DataFrame(zip(clusters,labels),columns=['cluster','label'])
# result_labelled = pd.merge(result,labels_table,on='cluster',how='left')
# result_labelled.head(100000)

In [None]:
# Label each cluster with the vocabulary term(s) that occur in every one of its rows.
# Only the TF-IDF columns are inspected: the text column and the cluster id are not terms,
# so they are excluded up front instead of being filtered out by name afterwards.
# The vectorizer above produces single-word terms, so no multi-word filter is needed.
feature_columns = [col for col in result.columns if col not in ('ClusterLabel', 'cluster')]

clusters = result['cluster'].unique()
labels = []
for cluster_id in clusters:
    members = result.loc[result['cluster'] == cluster_id, feature_columns]
    # all() is True for a column only when its weight is non-zero in every member row.
    shared = members.all()
    labels.append(' '.join(shared[shared].index))

labels_table = pd.DataFrame(zip(clusters, labels), columns=['cluster', 'label'])
# A left merge keeps the row order of `result` and attaches the label of each row's cluster.
result_labelled = pd.merge(result, labels_table, on='cluster', how='left')
# Show a short preview instead of dumping every row of this very wide frame.
result_labelled.head(20)


In [None]:
# Compact view: the stemmed label, its cluster id, and the label generated for that cluster.
result_labelled[["ClusterLabel",'cluster','label']].head(10000)

In [None]:
# List the distinct cluster labels that were generated.
print(result_labelled['label'].unique())
