# Supermarket Sales: January 2019 - March 2019

# Customer Segmentation

## Preparing environment

In [4]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

## Loading dataset

In [6]:
# Load the cleaned sales export into a DataFrame. The path is relative, so it
# is resolved against the notebook's current working directory.
sales_csv = 'cleaned_supermarket_sales.csv'
data = pd.read_csv(sales_csv)

## Calculating frequency of purchases

In [8]:
# Purchase frequency per row: the number of non-null invoice ids sharing the
# row's 'customer' value, broadcast back onto every row of that group.
# NOTE(review): the cluster summary printed at the end of this notebook shows
# an average frequency of ~500 in every cluster, which suggests 'customer'
# takes only a few distinct values -- confirm it identifies individual shoppers.
invoices_by_customer = data.groupby('customer')['invoice_id']
data['frequency'] = invoices_by_customer.transform('count')

## Selecting features for clustering

In [10]:
# Columns handed to the clustering step: the invoice total and the purchase
# frequency computed above.
feature_columns = ['total', 'frequency']
features = data[feature_columns]

## Applying K-Means clustering

In [12]:
# Partition the rows into three segments based on the selected features and
# store each row's segment label in data['cluster'].
# random_state fixes the centroid initialisation. n_init is pinned explicitly
# because scikit-learn changed its default from 10 to 'auto' in version 1.4
# (and emits a FutureWarning about it in 1.2-1.3); without the pin the number
# of restarts -- and potentially the labels -- depends on the installed version.
# NOTE(review): the features are passed to K-Means unscaled, so the column with
# the larger numeric spread will dominate the distance -- confirm this is
# intended or standardise the features first.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
data['cluster'] = kmeans.fit_predict(features)

## Aggregating data by cluster

In [14]:
# Per-cluster statistics: mean and sum of 'total', mean of 'quantity' and mean
# of 'frequency'. reset_index() turns the cluster label back into an ordinary
# column so it appears alongside the statistics.
summary_spec = {
    'total': ['mean', 'sum'],
    'quantity': 'mean',
    'frequency': 'mean',
}
rows_by_cluster = data.groupby('cluster')
cluster_summary = rows_by_cluster.agg(summary_spec).reset_index()

## Renaming columns for clarity

In [16]:
# Replace the aggregated column labels with readable ones. The assignment is
# positional, so the order below must match the order produced by the
# aggregation: cluster, total mean, total sum, quantity mean, frequency mean.
readable_labels = [
    'Cluster',
    'Average Total Spent',
    'Total Spent',
    'Average Quantity',
    'Average Frequency',
]
cluster_summary.columns = readable_labels
print(cluster_summary)

   Cluster  Average Total Spent  Total Spent  Average Quantity  \
0        0           135.244664   71544.4275          3.900830   
1        1           760.690500  127796.0040          8.642900   
2        2           408.007649  123626.3175          6.561127   

   Average Frequency  
0         505.419660  
1         502.238095  
2         503.877888  
