# Week 1 - Foundations of marketing analytics
https://www.coursera.org/learn/foundations-marketing-analytics/home/week/1

In [1]:
import org.apache.spark.sql.types._

// Explicit schema for the tab-separated purchases file. Supplying it avoids the
// extra pass that schema inference needs and keeps the purchase date typed as a
// date rather than a full timestamp (see the disabled inferSchema option below).
// NOTE: the nullable = false flags are not reflected in the loaded DataFrame;
// printSchema() below reports every column as nullable = true.
val schema = StructType(
  List(
    StructField("customer_id", StringType, nullable = false),
    StructField("purchase_amount", DoubleType, nullable = false),
    StructField("date_of_purchase", DateType, nullable = false)
  )
)

// Load the purchases. FAILFAST aborts the read on the first malformed record
// instead of silently turning it into nulls.
val data = spark.read
  .option("sep", "\t")
  .option("mode", "FAILFAST")
  // `yyyy` is the calendar year. The previous `YYYY` means week-based year in
  // Java date patterns, which can assign dates around New Year to the wrong year.
  .option("dateFormat", "yyyy-MM-dd")
  //.option("inferSchema", true) //requires 1 pass over schema and parses date as full timestamp
  .schema(schema)
  .csv("../../data/foundation-marketing-analytics/purchases.txt")

data.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- purchase_amount: double (nullable = true)
 |-- date_of_purchase: date (nullable = true)



schema = StructType(StructField(customer_id,StringType,false), StructField(purchase_amount,DoubleType,false), StructField(date_of_purchase,DateType,false))
data = [customer_id: string, purchase_amount: double ... 1 more field]


[customer_id: string, purchase_amount: double ... 1 more field]

In [2]:
// Peek at the first few purchases.
data.show(numRows = 5)

// Summary statistics (count, mean, stddev, min, max) for the non-date columns.
// customer_id is a string column, so its min/max in the output below are
// compared as text ("10" .. "99990"), not as numbers.
val desc = data.describe()
desc.show()

+-----------+---------------+----------------+
|customer_id|purchase_amount|date_of_purchase|
+-----------+---------------+----------------+
|        760|           25.0|      2009-11-06|
|        860|           50.0|      2012-09-28|
|       1200|          100.0|      2005-10-25|
|       1420|           50.0|      2009-07-09|
|       1940|           70.0|      2013-01-25|
+-----------+---------------+----------------+
only showing top 5 rows

+-------+------------------+------------------+
|summary|       customer_id|   purchase_amount|
+-------+------------------+------------------+
|  count|             51243|             51243|
|   mean|108934.54793825498| 62.33719532423943|
| stddev| 67650.61013903584|156.60680083783237|
|    min|                10|               5.0|
|    max|             99990|            4500.0|
+-------+------------------+------------------+



desc = [summary: string, customer_id: string ... 1 more field]


[summary: string, customer_id: string ... 1 more field]

In [None]:
import org.apache.spark.sql.functions._
// Derive the calendar year of each purchase once and cache the result, since
// the per-year aggregations that follow all group on this column.
val enriched = data
  .withColumn("year_of_purchase", year(col("date_of_purchase")))
  .cache()

// Preview the enriched rows. This must reference `enriched`; no value named
// `enrich` is defined in this notebook.
enriched.show(5)

**Number of purchases per year**

In [None]:
// Row count per purchase year, listed chronologically.
// RelationalGroupedDataset.count() produces a column named "count".
enriched
    .groupBy("year_of_purchase")
    .count()
    .orderBy("year_of_purchase")
    .show()

**Average purchase amount per year**

In [None]:
// Mean purchase amount for each year, listed chronologically.
enriched
    .groupBy("year_of_purchase")
    .agg(avg(col("purchase_amount")).as("avg_amount"))
    .orderBy(col("year_of_purchase"))
    .show()

**All in one: count, total and average purchase amount per year**

In [None]:
// Per-year summary in a single aggregation: number of purchases, total purchase
// amount and average purchase amount, listed chronologically.
// Columns are referenced with col(...) like the other cells, so this cell does
// not depend on `import spark.implicits._` (needed for the $"..." syntax).
enriched
    .groupBy(col("year_of_purchase"))
    .agg(
        // count("*") counts rows, matching the "Number of purchases per year" cell.
        count("*").alias("count"),
        sum(col("purchase_amount")).alias("sum_amount"),
        avg(col("purchase_amount")).alias("avg_amount"))
    .orderBy(col("year_of_purchase"))
    .show()