# RFM Managerial Segmentation
Managerial segmentation is simple and based on rules rather than on ML or statistical models.

![Rules](rfm-seg-rules.png)

In [69]:
import java.util.concurrent.TimeUnit
import scala.collection.mutable.ListBuffer
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

In [70]:
// Explicit schema for the tab-separated purchases file: three non-nullable columns.
val schema = StructType(
                List(
                    StructField("customer_id", StringType, false),
                    StructField("purchase_amount", DoubleType, false),
                    StructField("date_of_purchase", DateType, false)
                )
            )
// The date pattern must use lowercase "yyyy" (calendar year). Uppercase "YYYY" is the
// week-based year, which can mis-parse dates close to a year boundary and is rejected
// by Spark 3's datetime parser.
// spark.read...csv(...) already returns a DataFrame, so no trailing .toDF is needed.
val data = spark.read
                .option("sep", "\t")
                .option("mode","FAILFAST")
                .option("dateFormat","yyyy-MM-dd")
                .schema(schema)
                .csv("../../data/foundation-marketing-analytics/purchases.txt")

schema = StructType(StructField(customer_id,StringType,false), StructField(purchase_amount,DoubleType,false), StructField(date_of_purchase,DateType,false))
data = [customer_id: string, purchase_amount: double ... 1 more field]


[customer_id: string, purchase_amount: double ... 1 more field]

In [71]:
/**
  * Adds the derived date columns used by the RFM calculations.
  *
  * @param in                  purchases; must contain a "date_of_purchase" column
  * @param dataBaseInvoiceDate reference date, stored as the "end_date" column
  * @return `in` plus "end_date", "year_of_purchase" and "days_since"
  *         (number of days from "date_of_purchase" to "end_date")
  */
def enrich(in:DataFrame, dataBaseInvoiceDate: Column) : DataFrame = {
    val purchaseDate = col("date_of_purchase")
    val withEndDate = in.withColumn("end_date", dataBaseInvoiceDate)
    val withYear = withEndDate.withColumn("year_of_purchase", year(purchaseDate))
    withYear.withColumn("days_since", datediff(col("end_date"), purchaseDate))
}

// Days are counted back from 2016-01-01 for the 2015 analysis.
val enriched1 = enrich(data, lit("2016-01-01"))

enriched1.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- purchase_amount: double (nullable = true)
 |-- date_of_purchase: date (nullable = true)
 |-- end_date: string (nullable = false)
 |-- year_of_purchase: integer (nullable = true)
 |-- days_since: integer (nullable = true)



enriched1 = [customer_id: string, purchase_amount: double ... 4 more fields]


enrich: (in: org.apache.spark.sql.DataFrame, dataBaseInvoiceDate: org.apache.spark.sql.Column)org.apache.spark.sql.DataFrame


[customer_id: string, purchase_amount: double ... 4 more fields]

In [72]:
// Recency thresholds, expressed in days.
val OneYear = 365
val TwoYears = OneYear * 2
val ThreeYears = OneYear * 3

/**
  * Collapses the enriched purchases to one row per customer.
  *
  * @param in purchases with "customer_id", "days_since" and "purchase_amount" columns
  * @return one row per "customer_id" with
  *         "first_purchase" (largest days_since), "recency" (smallest days_since),
  *         "frequency" (row count) and "amount" (average purchase_amount)
  */
def calcRFM(in:DataFrame) : DataFrame = {
    val daysSince = col("days_since")
    val perCustomer = in.groupBy(col("customer_id"))
    perCustomer.agg(
        max(daysSince).alias("first_purchase"),
        min(daysSince).alias("recency"),
        count(col("*")).alias("frequency"),
        avg(col("purchase_amount")).alias("amount"))
}

val enriched2 = calcRFM(enriched1)

// Spot-check two customers.
enriched2.filter(col("customer_id").isin("10", "90")).show(5)

+-----------+--------------+-------+---------+------+
|customer_id|first_purchase|recency|frequency|amount|
+-----------+--------------+-------+---------+------+
|         90|          3783|    758|       10| 115.8|
|         10|          3829|   3829|        1|  30.0|
+-----------+--------------+-------+---------+------+



OneYear = 365
TwoYears = 730
ThreeYears = 1095
enriched2 = [customer_id: string, first_purchase: int ... 3 more fields]


calcRFM: (in: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


[customer_id: string, first_purchase: int ... 3 more fields]

## First level segmentation
Calculates only first level segmentation

In [73]:
/**
  * Adds "segment1", the recency-only segment.
  *
  * The branches are evaluated top-down, so each one only needs its lower bound:
  *   recency > ThreeYears -> "inactive"
  *   recency > TwoYears   -> "cold"
  *   recency > OneYear    -> "warm"
  *   anything else        -> "active"
  *
  * @param in per-customer frame with a "recency" column (days)
  * @return `in` plus the "segment1" column
  */
def firstLevelSegmentation(in:DataFrame):DataFrame = {
    val recency = col("recency")
    val byRecency =
        when(recency > ThreeYears, "inactive")
            .when(recency > TwoYears, "cold")
            .when(recency > OneYear, "warm")
            .otherwise("active")
    in.withColumn("segment1", byRecency)
}

val segment1Level = firstLevelSegmentation(enriched2)

segment1Level.groupBy(col("segment1")).count().show()
segment1Level.show()

+--------+-----+
|segment1|count|
+--------+-----+
|    warm| 1958|
|  active| 5398|
|    cold| 1903|
|inactive| 9158|
+--------+-----+

+-----------+--------------+-------+---------+------------------+--------+
|customer_id|first_purchase|recency|frequency|            amount|segment1|
+-----------+--------------+-------+---------+------------------+--------+
|       6240|          3752|   3005|        3| 76.66666666666667|inactive|
|      52800|          3320|   3320|        1|              15.0|inactive|
|     100140|          2750|     13|        4|             51.25|  active|
|     109180|          2616|     30|        8|             48.75|  active|
|     131450|          2228|    205|        8|            103.75|  active|
|      45300|          3667|    234|        6|29.166666666666668|  active|
|      69460|          3179|     15|        9| 28.88888888888889|  active|
|      86180|          2975|      2|        9| 21.11111111111111|  active|
|     161110|          1528|   1528|  

segment1Level = [customer_id: string, first_purchase: int ... 4 more fields]


firstLevelSegmentation: (in: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


[customer_id: string, first_purchase: int ... 4 more fields]

## Second level segmentation
Calculates ONLY 2nd level segmentation

In [74]:
// The "warm new" and "active new" branches must come earlier than the other branches of
// the same first-level segment; otherwise the value-based branches would match first.

/**
  * Adds "segment2", which refines the "warm" and "active" first-level segments.
  * Rows that match none of the branches get a null "segment2".
  *
  * @param in frame with "segment1", "first_purchase" and "amount" columns
  * @return `in` plus the "segment2" column
  */
def secondLevelSegmentation(in:DataFrame) :DataFrame = {
    val isWarm = col("segment1") === "warm"
    val isActive = col("segment1") === "active"
    val firstPurchase = col("first_purchase")
    val isHighValue = col("amount") >= 100
    val isLowValue = col("amount") < 100

    val refined =
        when(isWarm && firstPurchase <= TwoYears, "warm new")
            .when(isWarm && isHighValue, "warm high value")
            .when(isWarm && isLowValue, "warm low value")
            .when(isActive && firstPurchase <= OneYear, "active new")
            .when(isActive && isHighValue, "active high value")
            .when(isActive && isLowValue, "active low value")

    in.withColumn("segment2", refined)
}
val segment2Level = secondLevelSegmentation(segment1Level)

segment2Level.groupBy(col("segment2")).count().show()
segment2Level.show()

+-----------------+-----+
|         segment2|count|
+-----------------+-----+
|  warm high value|  119|
|active high value|  573|
|             null|11061|
|         warm new|  938|
| active low value| 3313|
|       active new| 1512|
|   warm low value|  901|
+-----------------+-----+

+-----------+--------------+-------+---------+------------------+--------+-----------------+
|customer_id|first_purchase|recency|frequency|            amount|segment1|         segment2|
+-----------+--------------+-------+---------+------------------+--------+-----------------+
|       6240|          3752|   3005|        3| 76.66666666666667|inactive|             null|
|      52800|          3320|   3320|        1|              15.0|inactive|             null|
|     100140|          2750|     13|        4|             51.25|  active| active low value|
|     109180|          2616|     30|        8|             48.75|  active| active low value|
|     131450|          2228|    205|        8|            103.

segment2Level = [customer_id: string, first_purchase: int ... 5 more fields]


secondLevelSegmentation: (in: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


[customer_id: string, first_purchase: int ... 5 more fields]

In [75]:
val cols = segment1Level.schema.fieldNames.map(col(_))
cols.foreach(println)

/**
  * Combines the first- and second-level segments into a single "segment" column.
  *
  * The two frames are joined by column name on "customer_id". Joining with
  * DataFrame-bound references (segment1Level("customer_id") === segment2Level("customer_id"))
  * is ambiguous when one frame is derived from the other, because both references
  * point at the same underlying attribute. Only "customer_id" and "segment2" are taken
  * from the second-level frame, so every other column name stays unique after the join.
  *
  * @param segment1Level frame with "customer_id", "first_purchase", "recency",
  *                      "frequency", "amount" and "segment1"
  * @param segment2Level frame with at least "customer_id" and "segment2"
  * @return the segment1Level columns plus "segment2" and "segment", where "segment" is
  *         "segment2" when it is not null and "segment1" otherwise; ordered by "segment"
  */
def segmentation(segment1Level:DataFrame, segment2Level:DataFrame) :DataFrame = {
    val secondLevelOnly = segment2Level.select("customer_id", "segment2")
    segment1Level
        .join(secondLevelOnly, Seq("customer_id"), "inner")
        .select("customer_id", "first_purchase", "recency", "frequency", "amount", "segment1", "segment2")
        // coalesce returns the first non-null argument: segment2 if present, else segment1
        .withColumn("segment", coalesce(col("segment2"), col("segment1")))
        .orderBy("segment")
}

val segmented = segmentation(segment1Level, segment2Level)

//Cache to simplify subsequent calculations
segmented.cache()

segmented.groupBy($"segment").count().show()
segmented.show()

customer_id
first_purchase
recency
frequency
amount
segment1
+-----------------+-----+
|          segment|count|
+-----------------+-----+
|active high value|  573|
| active low value| 3313|
|       active new| 1512|
|             cold| 1903|
|         inactive| 9158|
|  warm high value|  119|
|   warm low value|  901|
|         warm new|  938|
+-----------------+-----+

+-----------+--------------+-------+---------+------------------+--------+-----------------+-----------------+
|customer_id|first_purchase|recency|frequency|            amount|segment1|         segment2|          segment|
+-----------+--------------+-------+---------+------------------+--------+-----------------+-----------------+
|     131450|          2228|    205|        8|            103.75|  active|active high value|active high value|
|     189280|          1106|      1|        3|             100.0|  active|active high value|active high value|
|     170050|          1520|     13|        2|             100.0|  acti

cols = Array(customer_id, first_purchase, recency, frequency, amount, segment1)
segmented = [customer_id: string, first_purchase: int ... 6 more fields]


segmentation: (segment1Level: org.apache.spark.sql.DataFrame, segment2Level: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


[customer_id: string, first_purchase: int ... 6 more fields]

**NOTE: We could combine the calculation of the 1st and 2nd level segments into one function, but keeping them separate simplifies testing and makes maintenance easier.**

### Profile of each segment for 2015

In [76]:
/**
  * Summarises each segment by its average recency, frequency and amount.
  *
  * @param segmented  frame with "recency", "frequency", "amount" and the segment column
  * @param segColName name of the column to group by
  * @return one row per segment value with "avg_r", "avg_f" and "avg_a"
  *         (each rounded to two decimals), ordered by the segment column
  */
def segmentProfile(segmented: DataFrame, segColName: String) :DataFrame = {
    val segmentCol = col(segColName)
    // Average of the named column, rounded to two decimals.
    def roundedAvg(name: String): Column = round(avg(col(name)), 2)

    segmented
        .groupBy(segmentCol)
        .agg(
            roundedAvg("recency").alias("avg_r"),
            roundedAvg("frequency").alias("avg_f"),
            roundedAvg("amount").alias("avg_a"))
        .orderBy(segmentCol)
}

segmentProfile(segmented, "segment").show(10, truncate=false)

segmentProfile: (segmented: org.apache.spark.sql.DataFrame, segColName: String)org.apache.spark.sql.DataFrame


+-----------------+-------+-----+------+
|segment          |avg_r  |avg_f|avg_a |
+-----------------+-------+-----+------+
|active high value|88.82  |5.89 |240.05|
|active low value |108.36 |5.94 |40.72 |
|active new       |84.99  |1.05 |77.13 |
|cold             |857.78 |2.3  |51.74 |
|inactive         |2178.11|1.81 |48.11 |
|warm high value  |455.13 |4.71 |327.41|
|warm low value   |474.38 |4.53 |38.59 |
|warm new         |509.3  |1.04 |66.6  |
+-----------------+-------+-----+------+



## Segment a Database Retrospectively i.e. 2014
That is, the segmentation of the database as if we were a **year ago**. 

**How did it work?**

The first thing to do is to remember that we are a year ago. Meaning that whatever data we take into account, anything that has happened over the last 365 days should be discarded.

We go back in time and assume that the data generated over the last year (the last period) did not even exist. We adapt how we compute recency, frequency and monetary value accordingly. Then we just apply everything we applied before: same segmentation, same transformations, same analyses, and same tables.

**Why do we need to segment retrospectively?**

From a managerial point of view, it is extremely useful to see not only to what extent each segment contributes to today's revenues, but also to what extent each segment today would likely contribute to tomorrow's revenues.

In [77]:
val customers2015 = segmented

// Keep only purchases made up to 2014 and count days back from 2015-01-01.
val enriched2014 = enrich(data.where(year(col("date_of_purchase")) <= 2014), lit("2015-01-01"))
val customers2014 = calcRFM(enriched2014)

// First-level segments as of the end of 2014.
val first = firstLevelSegmentation(customers2014)
segmentProfile(first, "segment1").show(10, truncate=false)

// Second-level segments as of the end of 2014.
val second = secondLevelSegmentation(first)
segmentProfile(second, "segment2").show(10, truncate=false)

val segmented2014 = segmentation(first, second)

customers2015.printSchema()
customers2014.printSchema()
segmented2014.printSchema()

println(s"# of customers 2015: ${customers2015.count()}")
println(s"# of customers 2014: ${customers2014.count()}")

segmentProfile(segmented2014, "segment").show(10, truncate=false)

+--------+-------+-----+-----+
|segment1|avg_r  |avg_f|avg_a|
+--------+-------+-----+-----+
|active  |106.78 |4.31 |70.37|
|cold    |866.62 |2.25 |51.11|
|inactive|2058.44|1.73 |48.11|
|warm    |484.59 |2.58 |52.14|
+--------+-------+-----+-----+

+-----------------+-------+-----+------+
|segment2         |avg_r  |avg_f|avg_a |
+-----------------+-------+-----+------+
|null             |1792.95|1.85 |48.78 |
|active high value|85.34  |5.7  |261.9 |
|active low value |98.09  |5.63 |40.46 |
|active new       |132.09 |1.07 |69.73 |
|warm high value  |461.2  |4.41 |187.85|
|warm low value   |470.66 |4.36 |37.38 |
|warm new         |497.32 |1.06 |51.37 |
+-----------------+-------+-----+------+

root
 |-- customer_id: string (nullable = true)
 |-- first_purchase: integer (nullable = true)
 |-- recency: integer (nullable = true)
 |-- frequency: long (nullable = false)
 |-- amount: double (nullable = true)
 |-- segment1: string (nullable = false)
 |-- segment2: string (nullable = true)
 |-- 

customers2015 = [customer_id: string, first_purchase: int ... 6 more fields]
enriched2014 = [customer_id: string, purchase_amount: double ... 4 more fields]
customers2014 = [customer_id: string, first_purchase: int ... 3 more fields]
first = [customer_id: string, first_purchase: int ... 4 more fields]
second = [customer_id: string, first_purchase: int ... 5 more fields]
segmented2014 = [customer_id: string, first_purchase: int ... 6 more fields]


[customer_id: string, first_purchase: int ... 6 more fields]

## Revenue Generation Per Segment

In [78]:
// Revenue per customer in 2015.
// Customers without any 2015 purchase have no row here, i.e. only customers
// who were active in 2015 are selected.
val revenue2015 = {
    val purchases2015 = enriched1.where(col("year_of_purchase") === 2015)
    purchases2015
        .groupBy(col("customer_id"))
        .agg(sum(col("purchase_amount")).alias("revenue_2015"))
}
revenue2015.describe("revenue_2015").show()

+-------+------------------+
|summary|      revenue_2015|
+-------+------------------+
|  count|              5398|
|   mean| 88.62432938125232|
| stddev|224.35689735796478|
|    min|               5.0|
|    max|            4500.0|
+-------+------------------+



revenue2015 = [customer_id: string, revenue_2015: double]


[customer_id: string, revenue_2015: double]

### Show avg. revenue per customers and per segment for 2015

In [79]:
// A left join keeps the customers who made no purchase in 2015; their missing
// revenue is then filled with 0.0.
val actuals = {
    val withRevenue = customers2015.join(revenue2015, Seq("customer_id"), "left")
    withRevenue.na.fill(0.0, Seq("revenue_2015"))
}
println(s"No of rows: ${actuals.count()}")

actuals.describe("revenue_2015").show()

// Average 2015 revenue per customer, by 2015 segment.
actuals
    .groupBy("segment")
    .agg(round(avg(col("revenue_2015")), 2).alias("avg_revenue_2015"))
    .orderBy("segment")
    .show()

No of rows: 18417
+-------+------------------+
|summary|      revenue_2015|
+-------+------------------+
|  count|             18417|
|   mean|25.975681707118422|
| stddev| 127.9801632917415|
|    min|               0.0|
|    max|            4500.0|
+-------+------------------+

+-----------------+----------------+
|          segment|avg_revenue_2015|
+-----------------+----------------+
|active high value|          323.57|
| active low value|           52.31|
|       active new|           79.17|
|             cold|             0.0|
|         inactive|             0.0|
|  warm high value|             0.0|
|   warm low value|             0.0|
|         warm new|             0.0|
+-----------------+----------------+



actuals = [customer_id: string, first_purchase: int ... 7 more fields]


[customer_id: string, first_purchase: int ... 7 more fields]

###  Show avg. revenue per customers and per segment for 2014 (FORWARD looking)
How much revenue can you expect next year from your active customers of today (today in this data set is 2015)? We don't know the future and we don't know exactly what's going to happen, but the one thing we can do is go back to the past (2014) and look at how much revenue we got from inactive customers in 2014, going into 2015. That's the next step of this analysis. So what we'll do is merge the revenue generated in 2015, as before, but this time with the customer list of 2014. We are then looking at how much revenue has been generated by each customer, based on the segment they were in a year ago.

And that's why we call it forward: the segment in 2014 will enlighten us about how much revenue was generated in 2015 from these customers.

In [80]:
// Attach 2015 revenue to the 2014 customer list; customers with no 2015
// purchase get a revenue of 0.0.
val forward = {
    val withRevenue = segmented2014.join(revenue2015, Seq("customer_id"), "left")
    withRevenue.na.fill(0.0, Seq("revenue_2015"))
}
forward.describe("revenue_2015").show()

// Average 2015 revenue per customer, by 2014 segment.
forward
    .groupBy("segment")
    .agg(round(avg(col("revenue_2015")), 2).alias("avg_revenue_2015"))
    .orderBy("segment")
    .show()

+-------+------------------+
|summary|      revenue_2015|
+-------+------------------+
|  count|             16905|
|   mean|21.218273883466434|
| stddev|111.24529944791601|
|    min|               0.0|
|    max|            4500.0|
+-------+------------------+

+-----------------+----------------+
|          segment|avg_revenue_2015|
+-----------------+----------------+
|active high value|          254.08|
| active low value|            41.9|
|       active new|           31.05|
|             cold|            6.11|
|         inactive|            2.95|
|  warm high value|          114.46|
|   warm low value|           13.49|
|         warm new|            5.06|
+-----------------+----------------+



forward = [customer_id: string, first_purchase: int ... 7 more fields]


[customer_id: string, first_purchase: int ... 7 more fields]

### Observations

- An "inactive" customer in 2014 generated, on average, about 3 dollars of revenue in 2015
- Many customers ("cold", "inactive", "warm new" & "warm low value") have generated nothing
- A "warm high value" customer generated an average, close to 114 dollars in 2015
- A "warm high value" customer is actually worth a lot more money than, for instance, an "active new" customer, who has just made one purchase and might not remain active a year from now
- Most profitable customers
    - "Active high value" customers come first with 254
    - "warm high value" customers come second with 114, and look at that, 
    - the "active new" customers only come fourth with only an expected revenue of 31 dollars, coming in next year.
- And so, from a managerial perspective, it's really interesting to understand that a customer in the "active high value" segment, going into the next 12 months, is worth roughly eight times more (254 vs. 31 dollars) than, say, an "active new" customer