In [1]:
import pyspark
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse, if one already exists) a Spark session that runs locally.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/18 17:46:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql import Row
from datetime import datetime
from pyspark.sql.functions import date_format
from pyspark.sql.functions import to_date

In [4]:
# Sample activity log: one Row per (player, login day).
data = [
    Row(player_id=1, device_id=2, event_date=datetime(2016, 3, 1), games_played=5),
    Row(player_id=1, device_id=2, event_date=datetime(2016, 3, 2), games_played=6),
    Row(player_id=2, device_id=3, event_date=datetime(2017, 6, 25), games_played=1),
    Row(player_id=3, device_id=1, event_date=datetime(2016, 3, 1), games_played=0),
    Row(player_id=3, device_id=4, event_date=datetime(2016, 7, 3), games_played=5),
]

# Build the DataFrame in a single assignment so re-running the cell always
# starts from `data`. The Python datetime values arrive as timestamps;
# to_date() without a format casts them directly to a date, so no intermediate
# string formatting is required.
df = spark.createDataFrame(data).withColumn("event_date", to_date("event_date"))

df.show()
print(df.dtypes)

                                                                                

+---------+---------+----------+------------+
|player_id|device_id|event_date|games_played|
+---------+---------+----------+------------+
|        1|        2|2016-03-01|           5|
|        1|        2|2016-03-02|           6|
|        2|        3|2017-06-25|           1|
|        3|        1|2016-03-01|           0|
|        3|        4|2016-07-03|           5|
+---------+---------+----------+------------+

[('player_id', 'bigint'), ('device_id', 'bigint'), ('event_date', 'date'), ('games_played', 'bigint')]


In [5]:
# Register the DataFrame as the temp view "Activity" so the SQL cells can query it.
df.createOrReplaceTempView(name="Activity")

In [16]:
# Day-1 retention per install date, window-function variant.
#   installation: each player's earliest event_date (the install date).
#   next_play:    per activity row, 1 when the player's next event_date is
#                 exactly one day later, else 0 (a NULL LEAD on the player's
#                 last row falls through to the ELSE branch).
# The final SELECT joins each install to its own install-day row and averages
# the flag. ORDER BY makes the displayed row order deterministic; without it
# the order of grouped rows is unspecified.
result = spark.sql(
    """
    WITH installation AS (
        SELECT
            MIN(event_date) AS install_dt,
            player_id
        FROM
            Activity
        GROUP BY
            player_id
    ),
    next_play AS (
        SELECT
            player_id,
            event_date,
            CASE
                WHEN LEAD(event_date) OVER (PARTITION BY player_id ORDER BY event_date)
                     = DATE_ADD(event_date, 1)
                THEN 1
                ELSE 0
            END AS next_date
        FROM
            Activity
    )
    SELECT
        i.install_dt,
        COUNT(i.player_id) AS installs,
        ROUND(SUM(n.next_date) / COUNT(*), 2) AS Day1_retention
    FROM
        installation i
        LEFT JOIN next_play n
            ON i.player_id = n.player_id
           AND i.install_dt = n.event_date
    GROUP BY
        i.install_dt
    ORDER BY
        i.install_dt
    """
)
result.show()

                                                                                

+----------+--------+--------------+
|install_dt|installs|Day1_retention|
+----------+--------+--------------+
|2016-03-01|       2|           0.5|
|2017-06-25|       1|           0.0|
+----------+--------+--------------+



In [19]:
# Exploratory view of the self-joins used by the retention query:
#   a2 matches any earlier activity of the same player (NULL => a1 is the
#      player's first recorded day),
#   a3 matches activity of the same player exactly one day after a1
#      (NULL => the player did not come back the next day).
# ORDER BY keeps the displayed rows in a stable, readable order; a join result
# has no guaranteed order otherwise.
# NOTE(review): the output has three columns all named `event_date`; alias the
# a2/a3 columns if `result` is ever consumed by column name.
result = spark.sql(
    """
    SELECT
        a1.*,
        a2.event_date,
        a3.event_date
    FROM
        Activity a1
        LEFT JOIN Activity a2
            ON a1.player_id = a2.player_id
           AND a1.event_date > a2.event_date
        LEFT JOIN Activity a3
            ON a1.player_id = a3.player_id
           AND DATEDIFF(a3.event_date, a1.event_date) = 1
    ORDER BY
        a1.player_id,
        a1.event_date
    """
)
result.show()

[Stage 83:>                                                         (0 + 8) / 8]                                                                                

+---------+---------+----------+------------+----------+----------+
|player_id|device_id|event_date|games_played|event_date|event_date|
+---------+---------+----------+------------+----------+----------+
|        1|        2|2016-03-01|           5|      NULL|2016-03-02|
|        1|        2|2016-03-02|           6|2016-03-01|      NULL|
|        2|        3|2017-06-25|           1|      NULL|      NULL|
|        3|        1|2016-03-01|           0|      NULL|      NULL|
|        3|        4|2016-07-03|           5|2016-03-01|      NULL|
+---------+---------+----------+------------+----------+----------+



In [20]:
# Day-1 retention per install date, self-join variant.
#   a2 IS NULL keeps only rows with no earlier activity for the player, i.e.
#      the install-day row.
#   a3 is non-NULL only when the player was also active the following day, so
#      COUNT(a3.player_id) counts retained players.
# The statement is passed without a trailing ';' (unnecessary for spark.sql and
# possibly rejected by older Spark parsers -- NOTE(review): confirm if older
# versions matter), and ORDER BY makes the displayed row order deterministic.
result = spark.sql(
    """
    SELECT
        a1.event_date AS install_dt,
        COUNT(a1.player_id) AS installs,
        ROUND(COUNT(a3.player_id) / COUNT(a1.player_id), 2) AS Day1_retention
    FROM
        Activity a1
        LEFT JOIN Activity a2
            ON a1.player_id = a2.player_id
           AND a1.event_date > a2.event_date
        LEFT JOIN Activity a3
            ON a1.player_id = a3.player_id
           AND DATEDIFF(a3.event_date, a1.event_date) = 1
    WHERE
        a2.event_date IS NULL
    GROUP BY
        a1.event_date
    ORDER BY
        install_dt
    """
)
result.show()

[Stage 89:>                                                         (0 + 8) / 8]                                                                                

+----------+--------+--------------+
|install_dt|installs|Day1_retention|
+----------+--------+--------------+
|2016-03-01|       2|           0.5|
|2017-06-25|       1|           0.0|
+----------+--------+--------------+

