# Week 1 - Foundations of marketing analytics
https://www.coursera.org/learn/foundations-marketing-analytics/home/week/1

In [None]:
import org.apache.spark.sql.types._

// Explicit schema for the tab-separated purchases file.
// It is supplied up front instead of using `inferSchema`, which would need an
// extra pass over the data and would parse the date column as a full timestamp.
val schema = StructType(
  List(
    StructField("customer_id", StringType, nullable = false),
    StructField("purchase_amount", DoubleType, nullable = false),
    StructField("date_of_purchase", DateType, nullable = false)
  )
)

// Load the purchases into a DataFrame using the schema above.
val data = spark.read
  .option("sep", "\t")
  // FAILFAST: raise an error on a malformed record instead of silently nulling it.
  .option("mode", "FAILFAST")
  // Pattern letters are case-sensitive: `yyyy` = calendar year, `MM` = month of year.
  // (`YYYY` would mean week-based year and `mm` minute-of-hour.)
  .option("dateFormat", "yyyy-MM-dd")
  .schema(schema)
  .csv("../../data/foundation-marketing-analytics/purchases.txt")
data.printSchema()

In [None]:
// Peek at the first five rows of the loaded purchases.
data.show(numRows = 5)

// Summary statistics produced by `describe()`, bound to `desc` and then printed.
val desc = data.describe()
desc.show()

In [None]:
import org.apache.spark.sql.functions._
// Add the calendar year of each purchase as `year_of_purchase` and cache the
// result, since the per-year aggregations below all read from this DataFrame.
val enriched = data
  .withColumn("year_of_purchase", year(col("date_of_purchase")))
  .cache()

// Preview the first five rows, including the derived year column.
enriched.show(5)

**Number of purchases per year**

In [None]:
// Number of purchase rows per year, listed in ascending year order.
// `groupBy(...).count()` yields a column named "count".
enriched
  .groupBy("year_of_purchase")
  .count()
  .sort("year_of_purchase")
  .show()

**Average purchase amount per year**

In [None]:
// Mean purchase amount per year, listed in ascending year order.
enriched
  .groupBy("year_of_purchase")
  .agg(avg(col("purchase_amount")).as("avg_amount"))
  .sort(col("year_of_purchase"))
  .show()

**All in one**

In [None]:
// Per-year summary in a single aggregation: number of (non-null) year values,
// summed purchase amount, and mean purchase amount, in ascending year order.
enriched
  .groupBy(col("year_of_purchase"))
  .agg(
    count(col("year_of_purchase")).as("count"),
    sum(col("purchase_amount")).as("sum_amount"),
    avg(col("purchase_amount")).as("avg_amount")
  )
  .sort(col("year_of_purchase"))
  .show()