In [1]:
import pyspark
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [2]:
from pyspark.sql import SparkSession

# Build a session (or reuse the one already running in this kernel) against a
# local master using every available core ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/22 22:09:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType

In [27]:
# Row was used below without ever being imported, so this cell raised
# NameError on a fresh kernel; import it here alongside to_date.
from pyspark.sql import Row
from pyspark.sql.functions import to_date

# Schema of the raw Activity records. event_date is ingested as a string and
# converted to a proper date column when the DataFrame is built below.
schema = StructType([
    StructField("player_id", IntegerType(), True),
    StructField("device_id", IntegerType(), True),
    StructField("event_date", StringType(), True),  # Define the event_date as StringType
    StructField("games_played", IntegerType(), True)
])

# Sample activity log: one Row per (player, login day), dates as ISO strings.
data = [
    Row(player_id=1, device_id=2, event_date="2016-03-01", games_played=5),
    Row(player_id=1, device_id=2, event_date="2016-03-02", games_played=6),
    Row(player_id=2, device_id=3, event_date="2017-06-25", games_played=1),
    Row(player_id=3, device_id=1, event_date="2016-03-02", games_played=0),
    Row(player_id=3, device_id=4, event_date="2018-07-03", games_played=5)
]

# Build the DataFrame with the explicit schema and, in the same expression,
# parse event_date from StringType into DateType so `df` is only ever bound to
# the fully converted frame.
df = (
    spark.createDataFrame(data, schema=schema)
    .withColumn("event_date", to_date("event_date", "yyyy-MM-dd"))
)

# Show the DataFrame
df.show()

# Print the data types of the DataFrame
print(df.dtypes)

+---------+---------+----------+------------+
|player_id|device_id|event_date|games_played|
+---------+---------+----------+------------+
|        1|        2|2016-03-01|           5|
|        1|        2|2016-03-02|           6|
|        2|        3|2017-06-25|           1|
|        3|        1|2016-03-02|           0|
|        3|        4|2018-07-03|           5|
+---------+---------+----------+------------+

[('player_id', 'int'), ('device_id', 'int'), ('event_date', 'date'), ('games_played', 'int')]


In [28]:
# Expose df to Spark SQL under the name "Activity"; the spark.sql(...) cells
# below query this view. An existing view of the same name is replaced.
df.createOrReplaceTempView("Activity")

In [29]:
# First-login row per player: keep only the Activity rows whose
# (player_id, event_date) pair matches that player's earliest event_date.
first_login_rows_sql = """
    select *
from Activity
where (player_id, event_date) in (select player_id, min(event_date) from Activity group by 1)
    """
result = spark.sql(first_login_rows_sql)
result.show()



+---------+---------+----------+------------+
|player_id|device_id|event_date|games_played|
+---------+---------+----------+------------+
|        1|        2|2016-03-01|           5|
|        2|        3|2017-06-25|           1|
|        3|        1|2016-03-02|           0|
+---------+---------+----------+------------+



In [30]:
# Fraction of players who logged in again on the day after their first login.
# The inner query pairs each event_date with the player's next event_date via
# lead(); the outer filter keeps only each player's first-login row, and the
# select divides the rows whose next_date is exactly one day later by the
# number of first-login rows, rounded to 2 decimals.
fraction_lead_sql = """
    select round(count(if(datediff(next_date, event_date)=1, 1, null))/count(*),2) as fraction 
    from (select player_id, event_date, lead(event_date) over(partition by player_id order by event_date) as next_date 
        from Activity) r
    where (player_id, event_date) in (select player_id, min(event_date) from Activity group by 1)
    """
result = spark.sql(fraction_lead_sql)
result.show()

+--------+
|fraction|
+--------+
|    0.33|
+--------+





In [31]:
# Same fraction, computed with CTEs:
#   first_login - each player's earliest event_date
#   next_login  - Activity rows dated exactly one day after that first date
# The final select divides the next_login row count by the first_login row
# count and rounds to 2 decimals.
fraction_cte_sql = """
    with first_login as (
    select player_id, min(event_date) as first_date
    from Activity
    group by 1
),
next_login as (
    select a.player_id, a.event_date as next_date
    from Activity a join first_login f
        on a.player_id = f.player_id 
            and datediff(a.event_date, f.first_date) = 1
)
select round((select count(player_id) from next_login)/(select count(player_id) from first_login),2) as fraction
    """
result = spark.sql(fraction_cte_sql)
result.show()

                                                                                

+--------+
|fraction|
+--------+
|    0.33|
+--------+



In [34]:
# Same fraction, third formulation: count the Activity rows whose previous day
# (date_sub(event_date, 1)) equals the player's earliest event_date, then
# divide by the number of distinct players and round to 2 decimals.
fraction_date_sub_sql = """
    select round(count(player_id)/(select count(distinct player_id) from Activity),2) as fraction
    from Activity 
    where (player_id, date_sub(event_date, 1)) in
        (select player_id, min(event_date) from Activity group by 1)
    """
result = spark.sql(fraction_date_sub_sql)
result.show()

                                                                                

+--------+
|fraction|
+--------+
|    0.33|
+--------+

