In [None]:
!pip install hdbscan
!pip install keybert
!pip install spacy
!python -m spacy download en_core_web_lg

In [279]:
import pickle

import pandas as pd

# Load the precomputed embeddings used for clustering.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("umap_embeddings.pkl", "rb") as embeddings_file:
    umap_embeddings = pickle.load(embeddings_file)

# Two CSV inputs: `data_unique.csv` and `data_full.csv`.
# NOTE(review): presumably X_unique holds one row per distinct product
# description and X_full holds every transaction -- confirm against the
# notebook that produced these files.
X_unique = pd.read_csv("data_unique.csv")
X_full = pd.read_csv("data_full.csv")

# Sanity check: the number of rows in X_unique next to the embedding shape
# (later cells assign per-row cluster labels to X_unique, so they must align).
(len(X_unique), umap_embeddings.shape)


(4208, (4208, 5))

In [283]:
import hdbscan

# Clustering hyper-parameters, gathered in one place so they are easy to tune.
hdbscan_params = {
    "min_cluster_size": 15,
    "min_samples": 1,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
}

# Fit HDBSCAN on the embeddings; `cluster.labels_` is read by a later cell.
cluster = hdbscan.HDBSCAN(**hdbscan_params).fit(umap_embeddings)

In [284]:
from sklearn.decomposition import PCA
import umap

# Reduce the embeddings to two dimensions for plotting.
# A UMAP-based projection was tried as an alternative and is kept for reference:
#projection = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine').fit_transform(umap_embeddings)
pca_2d = PCA(n_components=2)
projection = pca_2d.fit_transform(umap_embeddings)

first_component, second_component = projection[:, 0], projection[:, 1]

# NOTE(review): x takes the second PCA column and y the first -- confirm this
# axis swap is intentional.
X_unique['x'] = second_component
X_unique['y'] = first_component

# Attach the HDBSCAN cluster label of every row.
X_unique['labels'] = cluster.labels_

In [285]:
import plotly.express as px

# Scatter plot of the 2-D projection, coloured by cluster label.
# Rows labelled -1 (HDBSCAN's noise label) are left out of the plot.
clustered_points = X_unique[X_unique['labels'].ne(-1)]

# hover_data is given as a list of column names, the form accepted by all
# plotly.express versions.
fig = px.scatter(
    clustered_points,
    x="x",
    y="y",
    color="labels",
    hover_data=["Description"],
)
fig.show()

In [286]:
# Number of rows per cluster label, largest first (label -1 is included).
X_unique['labels'].value_counts()

labels
-1     817
 34     96
 85     91
 35     64
 61     63
      ... 
 11     15
 14     15
 39     15
 42     15
 70     15
Name: count, Length: 113, dtype: int64

### Keywords

In [216]:
from keybert import KeyBERT

# Cluster analysed in the rest of the notebook.
# NOTE(review): this id is hard-coded; it refers to a label produced by the
# clustering cell, so re-check it whenever the clustering is re-run.
main_cluster = 64

# Concatenate every description of the chosen cluster into one text document.
in_main_cluster = X_unique['labels'] == main_cluster
doc = ' '.join(X_unique.loc[in_main_cluster, 'Description'])

# Extract single-word keyphrases (n-gram range 1..1) with no stop-word filtering.
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 1),
    stop_words=None,
)
keywords

[('decorations', 0.4455),
 ('decoration', 0.4329),
 ('hearts', 0.4174),
 ('bells', 0.3892),
 ('decorative', 0.3654)]

In [217]:
import spacy

# Run spaCy's named-entity recognition over the concatenated descriptions of
# the main cluster and print every detected entity span.
nlp = spacy.load("en_core_web_lg")

# The parsed document gets its own name so the KeyBERT `keywords` result is
# not replaced by an object of a different type.
spacy_doc = nlp(doc)
for entity in spacy_doc.ents:
    print(entity)

RED WOOLLY
CHRISTMAS
CONES CARNIVAL ASSORTED PARTY
CANDLEHOLDER
CHRISTMAS
ANGEL
CHRISTMAS DECORATION CHRISTMAS GINGHAM TREE CHRISTMAS GINGHAM STAR
CHRISTMAS
WOODEN
TREE WOOD
CHRISTMAS
CHRISTMAS
CHRISTMAS
CHRISTMAS
RETROSPOT
WOODEN
CHRISTMAS
CHRISTMAS
CHRISTMAS
CHRISTMAS
CHRISTMAS
CHRISTMAS
3
GARLAND
CHRISTMAS
CHRISTMAS
DEC
GARLAND
CROSSBONES
GARLAND WHITE GOOSE
EUCALYPTUS & PINECONE
CHRISTMAS
CHRISTMAS
CHRISTMAS
CHRISTMAS
CINAMMON & ORANGE WREATH WOOD STOCKING
CHRISTMAS
CHRISTMAS
CHRISTMAS
CHRISTMAS
DEC BLACK CHRISTMAS TREE 60CM
WOODEN STAR CHRISTMAS SCANDINAVIAN FINE WICKER
HONEYCOMB
GARLAND
MOBILE
CHRISTMAS
CHRISTMAS
BUTTERFLY
HONEYCOMB
GARLAND ASSORTED COLOURS SILK FAN LAVENDER
HONEYCOMB
GARLAND GREEN GOOSE
TREE GREEN GOOSE
GARLAND
MAGIC
18
FOLKART SLEIGH
CHRISTMAS
SANDALWOOD FAN
CHRISTMAS
CHRISTMAS
DEC PARTY
CONES CANDY TREE
CHRISTMAS
CHRISTMAS
TREE 30CM
20LIGHT
CHRISTMAS
NECKL36"BLACK WHITE GOOSE
HONEYCOMB
GARLAND ETCHED
RETROSPOT
ICON
GREEN FLOWER GARLAND NECKLACE RED
BUTTERFLY
J

In [218]:
from tqdm import tqdm

kw_model = KeyBERT()


def _cluster_keywords(label):
    """Return the space-joined KeyBERT keywords for all descriptions of one cluster label."""
    descriptions = X_unique.loc[X_unique['labels'] == label, 'Description']
    scored_words = kw_model.extract_keywords(
        ' '.join(descriptions),
        keyphrase_ngram_range=(1, 1),
        stop_words=None,
    )
    # extract_keywords yields (word, score) pairs; only the words are kept.
    return ' '.join(word for word, _score in scored_words)


# One keyword string per cluster label. Every label present in X_unique is
# processed (including -1), because a later cell looks up keywords for every row.
keywords = {
    label: _cluster_keywords(label)
    for label in tqdm(X_unique['labels'].unique())
}

100%|██████████| 66/66 [00:02<00:00, 32.55it/s]


### Data exploration

In [219]:
# Description -> cluster label lookup. Built column-wise with zip instead of
# iterating rows with iterrows(), which is much slower and gives the same
# mapping (a later duplicate description still overrides an earlier one).
product_map = dict(zip(X_unique['Description'], X_unique['labels']))

# Propagate cluster labels and keyword strings to every row of X_full.
# Explicit dict indexing is kept so an unknown description or label raises
# KeyError instead of silently producing NaN.
X_full['labels'] = X_full['Description'].apply(lambda description: product_map[description])
X_full['keywords'] = X_full['labels'].apply(lambda label: keywords[label])

#### Top 10 countries (by number of samples) vs. average UnitPrice for the main cluster

In [252]:
import plotly.express as px

# Rows of X_full that belong to the main cluster.
main_cluster_rows = X_full[X_full['labels'].eq(main_cluster)]

# The 10 countries with the most rows in that cluster.
top10 = main_cluster_rows['Country'].value_counts().head(10).index.to_list()

# .copy() makes `subset` an independent DataFrame, so later cells can add
# columns to it without pandas' SettingWithCopyWarning.
subset = main_cluster_rows[main_cluster_rows['Country'].isin(top10)].copy()

In [221]:
# Box plot of UnitPrice per country for the main-cluster subset;
# boxpoints=False draws the boxes only, without individual sample points.
fig = (
    px.box(data_frame=subset, x="Country", y="UnitPrice")
    .update_traces(boxpoints=False)
)
fig.show()

#### Top 10 countries (by number of samples) vs. average quantity for the main cluster

In [222]:
# Box plot of Quantity per country for the main-cluster subset;
# boxpoints=False draws the boxes only, without individual sample points.
fig = (
    px.box(data_frame=subset, x="Country", y="Quantity")
    .update_traces(boxpoints=False)
)
fig.show()

In [223]:
# Mean Quantity per country within the main-cluster subset.
subset.groupby('Country')['Quantity'].mean()


Country
Channel Islands     13.419355
Cyprus              20.547619
EIRE                21.314136
France              15.050251
Germany             21.008929
Netherlands        118.840000
Portugal            10.944444
Spain               13.663462
Switzerland         15.563380
United Kingdom      11.332571
Name: Quantity, dtype: float64

In [224]:
# Mean UnitPrice per country within the main-cluster subset.
subset.groupby('Country')['UnitPrice'].mean()


Country
Channel Islands    1.850323
Cyprus             1.811905
EIRE               1.767880
France             1.492613
Germany            1.637946
Netherlands        1.831200
Portugal           0.916111
Spain              1.279231
Switzerland        1.627183
United Kingdom     1.921016
Name: UnitPrice, dtype: float64

#### Trends over months per country for this cluster

In [261]:
# Parse the invoice timestamps once, then derive a sortable YYYYMM integer key
# (e.g. December 2010 -> 201012) with vectorised .dt accessors.
invoice_dates = pd.to_datetime(subset['InvoiceDate'])

# assign() returns a new DataFrame, so the columns are added without writing
# into a frame that may be a slice of X_full (no SettingWithCopyWarning).
subset = subset.assign(
    date=invoice_dates,
    year_month=invoice_dates.dt.year * 100 + invoice_dates.dt.month,
)

In [262]:
from datetime import datetime

# Mean UnitPrice per (year_month, Country). reset_index() turns the grouped
# result directly into columns, keeps the sorted group order, and also works
# when `subset` has no rows.
pp = (
    subset.groupby(['year_month', 'Country'])['UnitPrice']
    .mean()
    .reset_index()
    .rename(columns={
        'year_month': 'date',
        'Country': 'country',
        'UnitPrice': 'AvgUnitPrice',
    })
)

# Convert the YYYYMM key into a datetime (first day of that month) so the
# x-axis is a real time axis.
pp['date'] = pp['date'].apply(lambda ym: datetime.strptime(str(ym), "%Y%m"))

# One line per country showing the monthly average unit price.
fig = px.line(pp, x='date', y='AvgUnitPrice', color='country')
fig.show()

#### Let's see what products are bought in the United Kingdom

In [231]:
# United Kingdom rows that were assigned to a cluster (label -1 is excluded).
is_uk = X_full["Country"].eq("United Kingdom")
is_clustered = X_full["labels"].ne(-1)
sdf = X_full[is_uk & is_clustered]

# Share of those rows for each of the 10 most frequent keyword strings.
sdf['keywords'].value_counts().head(10) / len(sdf)

keywords
bags bag handbag washbag backpack                    0.106049
decorations decoration hearts bells decorative       0.076032
boxes box balloons nesting pack                      0.047314
lantern decoration holders tealight lightbulb        0.040297
boxes bunting wraps christmas wrapping               0.039926
metal lounge sign bathroom toilet                    0.036380
pencils pencil crayons pens pen                      0.030661
bunnies bunny rabbits birds rabbit                   0.028251
candles candlepot candleholder candle candlestick    0.026153
union knitted tea scarf lola                         0.024648
Name: count, dtype: float64