# Clustering Using Customer Transactions Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

import warnings
warnings.filterwarnings('ignore')

import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
# Load the retail transactions CSV from its GitHub raw URL into a DataFrame.
retail_txns_url = "https://raw.githubusercontent.com/surajdwivedi0307/UnsupervisedLearning/main/retail_txns.csv"
retail_df = pd.read_csv(retail_txns_url)

In [None]:
# Number of transaction rows per country.
retail_df["Country"].value_counts()

In [None]:
# Column dtypes and non-null counts of the raw data (info() prints and returns None).
retail_df.info()

In [None]:
# Missing-value count for every column, before any cleaning.
null_flags = retail_df.isnull()
null_flags.sum()

In [None]:
# Discard rows that have no CustomerID; retail_df is modified in place.
retail_df.dropna(subset=["CustomerID"], inplace=True)

In [None]:
# Re-inspect dtypes and non-null counts after dropping rows without a CustomerID.
retail_df.info()

In [None]:
# Missing-value count per column after the CustomerID clean-up.
remaining_nulls = retail_df.isnull()
remaining_nulls.sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
retail_df.duplicated().sum()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence; modifies retail_df in place.
retail_df.drop_duplicates(inplace=True)

In [None]:
# Summary of the cleaned frame (info() prints and returns None).
retail_df.info()

In [None]:
# Kernel density estimate of Quantity; the trailing ';' keeps the cell from echoing the return value.
sn.kdeplot(retail_df["Quantity"]);

In [None]:
# First ten rows whose Quantity is negative.
negative_quantity = retail_df["Quantity"] < 0
retail_df[negative_quantity][:10]

In [None]:
# Kernel density estimate of UnitPrice; the trailing ';' keeps the cell from echoing the return value.
sn.kdeplot(retail_df["UnitPrice"]);

### StockCode Information

In [None]:
# Keep only the two columns needed to know which customer bought which stock code.
stock_df = retail_df[["CustomerID", "StockCode"]]

In [None]:
# Collapse repeated purchases into one row per (CustomerID, StockCode) pair.
# stock_df is a column subset of retail_df, so mutating it in place raises a
# chained-assignment warning (hidden by the global warnings filter); rebinding
# the name to the de-duplicated result gives the same rows without that.
stock_df = stock_df.drop_duplicates()

In [None]:
# Size and dtypes of the de-duplicated (CustomerID, StockCode) pairs.
stock_df.info()

In [None]:
# Number of distinct stock codes.
len(stock_df["StockCode"].unique())

In [None]:
# Number of distinct customers.
len(stock_df["CustomerID"].unique())

In [None]:
# Number of stock_df rows per StockCode, as a two-column frame.
per_stock_counts = stock_df["StockCode"].value_counts()
stock_count_df = per_stock_counts.reset_index()
# Name the columns explicitly instead of relying on the reset_index() defaults.
stock_count_df.columns = ["StockCode", "count"]

In [None]:
# Histogram of the per-StockCode counts on a 10x5 figure.
plt.figure(figsize=(10, 5))
sn.histplot(stock_count_df["count"]);

In [None]:
# Retain only stock codes whose count exceeds 10, then show the result.
frequent_stock = stock_count_df["count"] > 10
stock_count_df = stock_count_df[frequent_stock]
stock_count_df

### Customer Information

In [None]:
# Number of stock_df rows per CustomerID, as a two-column frame.
per_customer_counts = stock_df["CustomerID"].value_counts()
cust_count_df = per_customer_counts.reset_index()
# Name the columns explicitly instead of relying on the reset_index() defaults.
cust_count_df.columns = ["CustomerID", "count"]

In [None]:
# Histogram of the per-CustomerID counts on a 10x5 figure.
plt.figure(figsize=(10, 5))
sn.histplot(cust_count_df["count"]);

In [None]:
# Retain only customers whose count exceeds 10, mirroring the stock-code
# filter. The result must be assigned back: a bare filter expression is
# discarded, and the later isin() check against cust_count_df.CustomerID
# would then keep every customer.
cust_count_df = cust_count_df[cust_count_df['count'] > 10]
cust_count_df

In [None]:
# Keep the (CustomerID, StockCode) pairs whose StockCode is listed in
# stock_count_df and whose CustomerID is listed in cust_count_df.
stock_is_kept = stock_df["StockCode"].isin(stock_count_df["StockCode"].unique())
customer_is_kept = stock_df["CustomerID"].isin(cust_count_df["CustomerID"].unique())
clean_stock_df = stock_df[stock_is_kept & customer_is_kept]

In [None]:
# Size and dtypes of the filtered (CustomerID, StockCode) pairs.
clean_stock_df.info()

In [None]:
# Flag every remaining (CustomerID, StockCode) pair as bought (1.0); this is
# the cell value used when the pairs are pivoted into a matrix.
# clean_stock_df is a boolean-mask slice of stock_df, so setting a column on
# it directly is a chained assignment (SettingWithCopy). assign() returns a
# new, independent frame with the extra column instead.
clean_stock_df = clean_stock_df.assign(bought=1.0)

In [None]:
# Display the filtered pairs together with the constant 'bought' flag.
clean_stock_df

In [None]:
# Reshape the pairs into a StockCode x CustomerID matrix of 'bought' flags;
# combinations that never occur are left as NaN by pivot().
cust_stocks_df = clean_stock_df.pivot(
    index="StockCode",
    columns="CustomerID",
    values="bought",
)
# Preview the first ten stock codes.
cust_stocks_df[:10]

In [None]:
# (number of stock codes, number of customers) in the pivoted matrix.
cust_stocks_df.shape

In [None]:
# Replace the NaN cells left by the pivot with 0.0, then show ten random rows.
cust_stocks_df = cust_stocks_df.fillna(value=0.0)
cust_stocks_df.sample(n=10)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
# Pairwise Jaccard distance between the rows of cust_stocks_df (one row per
# StockCode), giving a square stock-by-stock matrix.
# Jaccard is a boolean metric: cast the 0.0/1.0 matrix to bool explicitly
# rather than relying on scikit-learn's implicit conversion, which emits a
# DataConversionWarning that the global warnings filter would hide.
bought_flags = cust_stocks_df.values.astype(bool)
jaccard_distance = pairwise_distances(bought_flags, metric="jaccard")

In [None]:
# Display the pairwise Jaccard distance matrix.
jaccard_distance

In [None]:
# Shape of the distance matrix (square: one row and one column per row of cust_stocks_df).
jaccard_distance.shape

In [None]:
from sklearn.cluster import HDBSCAN

In [None]:
# HDBSCAN estimator using Jaccard distance, with clusters allowed to be as
# small as two samples.
h_cluster = HDBSCAN(
    min_cluster_size=2,
    metric="jaccard",
)

In [None]:
# Alternative estimator, left disabled (this cell executes nothing).
# NOTE(review): AgglomerativeClustering is not imported in the visible cells;
# import it from sklearn.cluster before enabling this line.
#h_cluster = AgglomerativeClustering(n_clusters = 5)

In [None]:
# Fit the clusterer on the StockCode x CustomerID matrix, so each row
# (one stock code) is one sample to be clustered.
h_cluster.fit(cust_stocks_df)

In [None]:
# Cluster label assigned to each row of cust_stocks_df, in row order.
h_cluster.labels_

In [None]:
# Pair every stock code (the row index of cust_stocks_df) with its cluster label.
stock_clusters = pd.DataFrame(
    {
        "StockCode": cust_stocks_df.index,
        "ClusterID": h_cluster.labels_,
    }
)

In [None]:
# Display the StockCode -> ClusterID assignments.
stock_clusters

In [None]:
# Lookup table from StockCode to Description; if a code appears with several
# descriptions, the one from the last matching row wins.
stock_codes = retail_df["StockCode"]
descriptions = retail_df["Description"]
stock_desc_dict = dict(zip(stock_codes, descriptions))

In [None]:
# Attach each stock code's description; a code missing from the lookup raises KeyError.
stock_clusters["Description"] = stock_clusters["StockCode"].map(
    lambda code: stock_desc_dict[code]
)

In [None]:
# Stock codes assigned to cluster 0.
stock_clusters[stock_clusters["ClusterID"] == 0]

In [None]:
# Stock codes assigned to cluster 1.
stock_clusters[stock_clusters["ClusterID"] == 1]

In [None]:
# Stock codes assigned to cluster 2.
stock_clusters[stock_clusters["ClusterID"] == 2]

In [None]:
# Stock codes assigned to cluster 10 (empty if the fit produced no such label).
stock_clusters[stock_clusters["ClusterID"] == 10]

In [None]:
# Stock codes assigned to cluster 12 (empty if the fit produced no such label).
stock_clusters[stock_clusters["ClusterID"] == 12]