In [1]:
# Kernel: Python 3.10.5
import os
import findspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import TimestampType
from pyspark.sql.window import Window
from functools import reduce
from datetime import timedelta

### Data Cleaning + Feature Engineering

#### 1. Setup

In [2]:
# Optional if not already set in system env vars
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-1.8"
os.environ["SPARK_HOME"] = "C:\\spark-3.5.1-bin-hadoop3"

findspark.init()

spark = SparkSession.builder \
    .appName("DataCleaning") \
    .master("local[*]") \
    .config("spark.driver.host", "127.0.0.1") \
    .getOrCreate()


#### 2. Load data

In [4]:
df = spark.read.csv("./data/OnlineRetail.csv", header=True, inferSchema=True)
df.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



#### 3. Data Profiling, Anomaly Detection and Data Cleaning

##### 3.1 Check the number of colummns and rows

In [4]:
# Check the number of columns and rows
print("Number of columns: ", len(df.columns))
print("Number of rows: ", df.count())

Number of columns:  8
Number of rows:  541909


##### 3.2 Abnormal 1: Check and correct Data Type

In [5]:
# Check schema
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [6]:
# Convert datatype of InvoiceDate to timestamp
df_cleaned = df.withColumn("InvoiceDate", to_timestamp("InvoiceDate", "M/d/yyyy H:mm"))

# Re-check schema
df_cleaned.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



##### 3.3 Abnormal 2: Check and handle missing values

In [7]:
# Check for missing values in original data
missing_values = df_cleaned.select([
    count(when(col(c).isNull(), c)).alias(c) for c in df_cleaned.columns
])
missing_values.show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
|        0|        0|       1454|       0|          0|        0|    135080|      0|
+---------+---------+-----------+--------+-----------+---------+----------+-------+



- Description: 1,454 --> remove since it is hard to guess the product description
- CustomerID: 135,080 --> might keep them, change `null` value into `Unknown`

In [8]:
# Delete the null values in Description
df_cleaned = df_cleaned.filter(df_cleaned.Description.isNotNull())

# Change the value in CustomerID into "Unknown" for null values
df_cleaned = df_cleaned.withColumn("CustomerID", when(df_cleaned.CustomerID.isNull(), "Unknown").otherwise(df_cleaned.CustomerID))

In [9]:
# Re-check for missing values in the cleaned data
missing_values = df_cleaned.select([
    count(when(col(c).isNull(), c)).alias(c) for c in df_cleaned.columns
])
missing_values.show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
|        0|        0|          0|       0|          0|        0|         0|      0|
+---------+---------+-----------+--------+-----------+---------+----------+-------+



##### 3.4 Abnormal 3: Check and handle duplicate rows

In [10]:
# Check the number of duplicate rows
duplicate_count = df_cleaned.count() - df_cleaned.dropDuplicates().count()
print("Number of duplicate rows: ", duplicate_count)

Number of duplicate rows:  5268


In [11]:
# Drop duplicate rows
df_cleaned = df_cleaned.dropDuplicates()

In [12]:
# Re-check the number of duplicate rows
duplicate_count = df_cleaned.count() - df_cleaned.dropDuplicates().count()
print("Number of duplicate rows: ", duplicate_count)

Number of duplicate rows:  0


##### 3.5 Abnormal 4: Handle negative Quantity and UnitPrice

**Check the number of cancelled order:** The InvoiceNo starts with C (has negative Quantity)

In [13]:
cancel = df_cleaned.filter(col("InvoiceNo").startswith("C"))
print("Number of canceled orders: ", cancel.count())
cancel.show(5)

Number of canceled orders:  9251
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|  C536825|    22617|BAKING SET SPACEB...|      -1|2010-12-02 17:27:00|     4.95|     15384|United Kingdom|
|  C537251|    22747|POPPY'S PLAYHOUSE...|      -6|2010-12-06 10:45:00|      2.1|   Unknown|United Kingdom|
|  C537805|    22197|SMALL POPCORN HOLDER|      -1|2010-12-08 13:18:00|     0.72|     15311|United Kingdom|
|  C538103|    22941|CHRISTMAS LIGHTS ...|      -2|2010-12-09 15:13:00|      8.5|     17442|United Kingdom|
|  C538768|    84378|SET OF 3 HEART CO...|     -24|2010-12-14 11:34:00|     1.25|     14829|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+-----------

**Check for invalid values in Quantity and UnitPrice:**
- Quantity < 0
- UnitPrice < 0

In [14]:
# Quantity has negative value
negative_Quantity = df_cleaned.filter(col("Quantity") < 0).count()
print("Number of negative Quantity: ", negative_Quantity)
df_cleaned.filter(col("Quantity") < 0).select("InvoiceNo", "Quantity").show(5)

Number of negative Quantity:  9725
+---------+--------+
|InvoiceNo|Quantity|
+---------+--------+
|  C536825|      -1|
|  C537251|      -6|
|  C537805|      -1|
|  C538103|      -2|
|  C538768|     -24|
+---------+--------+
only showing top 5 rows



In [15]:
# UnitPrice has negative value
negative_UnitPrice = df_cleaned.filter(col("UnitPrice") < 0).count()
print("Number of negative UnitPrice: ", negative_UnitPrice)
df_cleaned.filter(col("UnitPrice") < 0).select("InvoiceNo", "UnitPrice").show(5)

Number of negative UnitPrice:  2
+---------+---------+
|InvoiceNo|UnitPrice|
+---------+---------+
|  A563186|-11062.06|
|  A563187|-11062.06|
+---------+---------+



- We can see that cancelled orders also have the negative Quantity.
- The dataset contains 9,251 cancelled orders, while 9,725 orders have a negative quantity.
- Therefore, in the Data Cleaning section, when we remove orders with negative quantities, we also remove the cancelled orders at the same time.

**Remove abnormal Quantity and UnitPrice**

In [16]:
# Filter out negative Quantity and UnitPrice
df_cleaned = df_cleaned.filter((col("Quantity") > 0) & (col("UnitPrice") > 0))

# Check negative Quantity and UnitPrice again
print("Negative Quantity count:", df_cleaned.filter(col("Quantity") < 0).count())
print("Negative UnitPrice count:", df_cleaned.filter(col("UnitPrice") < 0).count())

# Check number of cancelled orders again
cancel = df_cleaned.filter(col("InvoiceNo").startswith("C"))
print("Number of cancelled orders: ", cancel.count())

Negative Quantity count: 0
Negative UnitPrice count: 0
Number of cancelled orders:  0


##### 3.6 Abnormal 5: Identify abnormal `StockCode`-`Description` pairs that are not actual products

**Check abnormal StockCode**

In [17]:
excluded_stockcodes = ["POST", "DOT", "M", "C2", "BANK CHARGES","S", "B", "AMAZONFEE",
                       "gift_0001_10", "gift_0001_20","gift_0001_30","gift_0001_40","gift_0001_50"]

# Identify rows with exclued StockCode
df_excluded = df_cleaned.filter(col("StockCode").isin(excluded_stockcodes))

# Show distinct excluded StockCode - Description pairs
df_excluded.select("StockCode", "Description").distinct().show(truncate=False)

+------------+----------------------------------+
|StockCode   |Description                       |
+------------+----------------------------------+
|POST        |POSTAGE                           |
|DOT         |DOTCOM POSTAGE                    |
|gift_0001_40|Dotcomgiftshop Gift Voucher �40.00|
|C2          |CARRIAGE                          |
|gift_0001_30|Dotcomgiftshop Gift Voucher �30.00|
|BANK CHARGES|Bank Charges                      |
|M           |Manual                            |
|AMAZONFEE   |AMAZON FEE                        |
|gift_0001_50|Dotcomgiftshop Gift Voucher �50.00|
|gift_0001_20|Dotcomgiftshop Gift Voucher �20.00|
|gift_0001_10|Dotcomgiftshop Gift Voucher �10.00|
|S           |SAMPLES                           |
|B           |Adjust bad debt                   |
+------------+----------------------------------+



**Handle abnormal StockCode and Description pairs that are not actual products**

In [18]:
df_cleaned = df_cleaned.filter(~col("StockCode").isin(excluded_stockcodes))

In [19]:
# Re-check the abnormal stock code
df_excluded = df_cleaned.filter(col("StockCode").isin(excluded_stockcodes))
df_excluded.select("StockCode", "Description").distinct().show(truncate=False)

+---------+-----------+
|StockCode|Description|
+---------+-----------+
+---------+-----------+



#### 4. Data cleaning results

In [20]:
# The number of rows before cleaning
rows_before_cleaning = df.count()
print(f"Number of rows before cleaning: {rows_before_cleaning}")

Number of rows before cleaning: 541909


In [21]:
# Check the number of rows after cleaning
rows_after_cleaning = df_cleaned.count()
print(f"Number of rows after cleaning: {rows_after_cleaning}")

Number of rows after cleaning: 522541


#### 5. Feature Engineering

##### 1. Ensure Correct Data Types & Calculate TotalPrice

In [28]:
# Ensure InvoiceDate is in timestamp format
df_cleaned = df_cleaned.withColumn("InvoiceDate", F.to_timestamp(F.col("InvoiceDate"), "M/d/yyyy H:mm"))

# Calculate the total price for each row
df_cleaned = df_cleaned.withColumn("TotalPrice", F.round(F.col("Quantity") * F.col("UnitPrice"), 2))

In [29]:
print("Schema of cleaned data with TotalPrice:")
df_cleaned.printSchema()
df_cleaned.show(5, truncate=False)

Schema of cleaned data with TotalPrice:
root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- TotalPrice: double (nullable = true)

+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |TotalPrice|
+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+----------+
|536384   |84755    |COLOUR GLASS T-LIGHT HOLDER HANGING|48      |2010-12-01 09:53:00|0.65     |18074     |United Kingdom|31.2      |
|536385   |22168    |ORGANISER WOOD ANTIQU

##### 2. Determine analysis date

In [None]:
# The day after the last invoice date
max_invoice_date = df_cleaned.agg(F.max("InvoiceDate")).collect()[0][0]
analysis_date = max_invoice_date + timedelta(days=1)
analysis_date_lit = F.lit(analysis_date).cast("timestamp")

print(f"\nMax Invoice Date: {max_invoice_date}")
print(f"Analysis Date: {analysis_date}")


Max Invoice Date: 2011-12-09 12:50:00
Analysis Date: 2011-12-10 12:50:00


##### Calculate RFM values per customer

In [34]:
rfm_df = df_cleaned.groupBy("CustomerID").agg(
    F.max("InvoiceDate").alias("LastPurchaseDate"),
    F.countDistinct("InvoiceNo").alias("Frequency"),
    F.sum("TotalPrice").alias("MonetaryValue")
)

# Calculate Recency in days
rfm_df = rfm_df.withColumn(
    "Recency",
    F.datediff(analysis_date_lit, F.col("LastPurchaseDate"))
)
print("\nRFM Base Values:")
rfm_df.show(5, truncate=False)



RFM Base Values:
+----------+-------------------+---------+------------------+-------+
|CustomerID|LastPurchaseDate   |Frequency|MonetaryValue     |Recency|
+----------+-------------------+---------+------------------+-------+
|15555     |2011-11-27 16:19:00|16       |4791.87           |13     |
|15574     |2011-06-15 12:01:00|4        |675.64            |178    |
|15634     |2011-11-22 16:28:00|1        |243.54999999999998|18     |
|13610     |2011-11-27 15:38:00|7        |1082.33           |13     |
|13192     |2011-09-05 14:13:00|2        |911.94            |96     |
+----------+-------------------+---------+------------------+-------+
only showing top 5 rows



##### 4. Create RFM Scores (using quartiles, so scores 1 - 4)
RFM Scoring (1 to 4) — higher score = better customer

We use NTILE(4) to split values into quartiles: rank 1 (lowest) to rank 4 (highest)

Recency:
- Fewer days = more recent = better
- ntile gives rank 1 to lowest days → we want score 4 for that
- So: R_Score = 5 - ntile_rank

Frequency:
- More orders = better
- ntile gives rank 1 to lowest frequency → we want score 1 for that
- So: F_Score = ntile_rank

Monetary:
- More money spent = better
- ntile gives rank 1 to lowest spending → we want score 1 for that
- So: M_Score = ntile_rank

In [35]:
# Quartiles for Recency (lower is better, so ascending order for ntile)
r_window = Window.orderBy(F.col("Recency").asc())
rfm_df = rfm_df.withColumn("R_Quartile", F.ntile(4).over(r_window))
rfm_df = rfm_df.withColumn("R_Score", (F.lit(5) - F.col("R_Quartile")))
rfm_df.show(5, truncate=False)

+----------+-------------------+---------+-----------------+-------+----------+-------+
|CustomerID|LastPurchaseDate   |Frequency|MonetaryValue    |Recency|R_Quartile|R_Score|
+----------+-------------------+---------+-----------------+-------+----------+-------+
|12680     |2011-12-09 12:50:00|4        |790.8100000000001|1      |1         |4      |
|16705     |2011-12-09 12:08:00|20       |14034.99         |1      |1         |4      |
|16626     |2011-12-09 11:56:00|17       |4413.1           |1      |1         |4      |
|17364     |2011-12-09 09:00:00|11       |4462.68          |1      |1         |4      |
|12518     |2011-12-09 10:13:00|5        |1840.89          |1      |1         |4      |
+----------+-------------------+---------+-----------------+-------+----------+-------+
only showing top 5 rows



In [36]:
# Quartiles for Frequency (higher is better, so ascending order for ntile)
f_window = Window.orderBy(F.col("Frequency").asc())
rfm_df = rfm_df.withColumn("F_Quartile", F.ntile(4).over(f_window))
rfm_df = rfm_df.withColumn("F_Score", F.col("F_Quartile"))
rfm_df.show(5, truncate=False)

+----------+-------------------+---------+-------------+-------+----------+-------+----------+-------+
|CustomerID|LastPurchaseDate   |Frequency|MonetaryValue|Recency|R_Quartile|R_Score|F_Quartile|F_Score|
+----------+-------------------+---------+-------------+-------+----------+-------+----------+-------+
|12713     |2011-12-09 12:16:00|1        |794.55       |1      |1         |4      |1         |1      |
|13436     |2011-12-08 10:33:00|1        |196.89       |2      |1         |4      |1         |1      |
|15520     |2011-12-08 10:58:00|1        |343.5        |2      |1         |4      |1         |1      |
|13298     |2011-12-08 13:11:00|1        |360.0        |2      |1         |4      |1         |1      |
|14569     |2011-12-08 14:58:00|1        |227.39       |2      |1         |4      |1         |1      |
+----------+-------------------+---------+-------------+-------+----------+-------+----------+-------+
only showing top 5 rows



In [37]:
# Quartiles for Monetary (higher is better, so ascending order for ntile)
m_window = Window.orderBy(F.col("MonetaryValue").asc())
rfm_df = rfm_df.withColumn("M_Quartile", F.ntile(4).over(m_window))
rfm_df = rfm_df.withColumn("M_Score", F.col("M_Quartile"))

In [38]:
print("\nRFM with Scores:")
rfm_df.select("CustomerID", "Recency", "Frequency", "MonetaryValue", "R_Score", "F_Score", "M_Score").show(truncate=False)


RFM with Scores:
+----------+-------+---------+-------------+-------+-------+-------+
|CustomerID|Recency|Frequency|MonetaryValue|R_Score|F_Score|M_Score|
+----------+-------+---------+-------------+-------+-------+-------+
|16738     |298    |1        |3.75         |1      |2      |1      |
|16454     |65     |1        |5.9          |2      |1      |1      |
|14792     |64     |1        |6.2          |2      |1      |1      |
|17956     |250    |1        |12.75        |1      |2      |1      |
|16878     |85     |1        |13.3         |2      |1      |1      |
|13307     |121    |1        |15.0         |2      |1      |1      |
|17763     |264    |1        |15.0         |1      |2      |1      |
|16093     |107    |1        |17.0         |2      |1      |1      |
|16953     |31     |1        |20.8         |3      |1      |1      |
|17986     |57     |1        |20.8         |2      |1      |1      |
|16257     |177    |1        |21.95        |1      |1      |1      |
|18268     |135 

##### 5. Combine RFM Scores into a single string

In [40]:
rfm_final_df = rfm_df.withColumn(
    "RFM_Score_Concat",
    F.concat(F.col("R_Score").cast("string"), F.col("F_Score").cast("string"), F.col("M_Score").cast("string"))
)

print("\nRFM with Concatenated Scores:")
rfm_final_df.select("CustomerID", "R_Score", "F_Score", "M_Score", "RFM_Score_Concat").show(5, truncate=False)


RFM with Concatenated Scores:
+----------+-------+-------+-------+----------------+
|CustomerID|R_Score|F_Score|M_Score|RFM_Score_Concat|
+----------+-------+-------+-------+----------------+
|16738     |1      |2      |1      |121             |
|16454     |2      |1      |1      |211             |
|14792     |2      |1      |1      |211             |
|17956     |1      |2      |1      |121             |
|16878     |2      |1      |1      |211             |
+----------+-------+-------+-------+----------------+
only showing top 5 rows



##### 6. Define Customer Segments


For scores 1-4, where 4 is best:
- Champions: R=4, F=4, M=4 (Best Customers)
- Loyal Customers: R >= 3, F >= 3, M >= 3 (but not Champions)
- Potential Loyalists: Recent customers with good frequency or monetary value.
- New Customers: High R (recent), low F, M.
- Promising: Recent, but low F/M, or average R with high F/M.
- Need Attention: Average R, F, M. Or R > 2, F < 3, M < 3.
- At Risk: Purchased long ago, but were frequent/valuable. Or R < 3, F >=3, M >=3.
- Can't Lose Them: Made big purchases, and were frequent, but haven't returned for a long time. (e.g., R=1, F=4, M=4)
- Hibernating: Low F, M and low R (not recent).
- Lost: Lowest R, F, M scores.

In [41]:
segment_col = (
    F.when((F.col("R_Score") == 4) & (F.col("F_Score") == 4) & (F.col("M_Score") == 4), "Champions")
    .when((F.col("R_Score") >= 3) & (F.col("F_Score") >= 3) & (F.col("M_Score") >= 3), "Loyal Customers") # Excludes Champions
    .when((F.col("R_Score") == 4) & (F.col("F_Score") >= 2) & (F.col("M_Score") >= 2), "Potential Loyalists") # Recent, decent F/M
    .when((F.col("R_Score") == 4) & (F.col("F_Score") == 1) & (F.col("M_Score") == 1), "New Customers")
    .when((F.col("R_Score") == 1) & (F.col("F_Score") >= 3) & (F.col("M_Score") >= 3), "Season customer") # High F/M but not recent
    .when((F.col("R_Score") <= 2) & (F.col("F_Score") >= 2) & (F.col("M_Score") >= 2), "At Risk") # Not recent, but were good
    .when((F.col("R_Score") >= 3) & (F.col("F_Score") <= 2) & (F.col("M_Score") <= 2), "Promising") # Recent, but low F/M
    .when((F.col("R_Score") <= 2) & (F.col("F_Score") <= 2) & (F.col("M_Score") <= 2), "Hibernating/Lost")
    .otherwise("Need Attention") # General catch-all for average customers
)

rfm_final_df = rfm_final_df.withColumn("Segment", segment_col)
print("\nFinal RFM Segmentation:")
rfm_final_df.select("CustomerID", "Recency", "Frequency", "MonetaryValue", "R_Score", "F_Score", "M_Score", "RFM_Score_Concat", "Segment").show(truncate=False)


Final RFM Segmentation:
+----------+-------+---------+-------------+-------+-------+-------+----------------+----------------+
|CustomerID|Recency|Frequency|MonetaryValue|R_Score|F_Score|M_Score|RFM_Score_Concat|Segment         |
+----------+-------+---------+-------------+-------+-------+-------+----------------+----------------+
|16738     |298    |1        |3.75         |1      |2      |1      |121             |Hibernating/Lost|
|16454     |65     |1        |5.9          |2      |1      |1      |211             |Hibernating/Lost|
|14792     |64     |1        |6.2          |2      |1      |1      |211             |Hibernating/Lost|
|17956     |250    |1        |12.75        |1      |2      |1      |121             |Hibernating/Lost|
|16878     |85     |1        |13.3         |2      |1      |1      |211             |Hibernating/Lost|
|13307     |121    |1        |15.0         |2      |1      |1      |211             |Hibernating/Lost|
|17763     |264    |1        |15.0         |1   

In [43]:
# Analyze the distribution of customers across segments
print("\nCustomer Count per Segment:")
rfm_final_df.groupBy("Segment").count().orderBy(F.desc("count")).show()


Customer Count per Segment:
+-------------------+-----+
|            Segment|count|
+-------------------+-----+
|   Hibernating/Lost|  945|
|            At Risk|  928|
|    Loyal Customers|  808|
|          Champions|  489|
|          Promising|  453|
|     Need Attention|  351|
|Potential Loyalists|  172|
|    Season customer|  130|
|      New Customers|   59|
+-------------------+-----+



In [None]:
# Average R, F, M per segment
print("\nAverage RFM Values per Segment:")
rfm_final_df.groupBy("Segment").agg(
    F.round(F.avg("Recency"), 1).alias("Avg_Recency"),
    F.round(F.avg("Frequency"), 1).alias("Avg_Frequency"),
    F.round(F.avg("MonetaryValue"), 2).alias("Avg_Monetary")
).orderBy(F.desc("Avg_Monetary")).show(truncate=False)


Average RFM Values per Segment:
+-------------------+-----------+-------------+------------+
|Segment            |Avg_Recency|Avg_Frequency|Avg_Monetary|
+-------------------+-----------+-------------+------------+
|Champions          |7.8        |18.2         |12059.41    |
|Loyal Customers    |23.2       |5.8          |2432.3      |
|Season customer    |219.5      |3.8          |2152.72     |
|Potential Loyalists|9.5        |2.5          |1642.62     |
|At Risk            |142.4      |3.1          |1232.48     |
|Need Attention     |87.3       |2.1          |872.73      |
|Promising          |29.3       |1.4          |316.42      |
|Hibernating/Lost   |184.8      |1.1          |231.75      |
|New Customers      |10.3       |1.0          |170.77      |
+-------------------+-----------+-------------+------------+

