In [0]:
import calendar
import datetime
import numpy
import random
import uuid

from pyspark.sql import DataFrame, Row
from pyspark.sql import functions as F

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

In [0]:
def union_all(dataframes: list[DataFrame]) -> DataFrame | None:
    if len(dataframes) == 1:
        return dataframes[0]
    else:
        head, *tail = dataframes
        return head.unionByName(union_all(tail))

In [0]:
# Shared NumPy random generator used by later cells for `rng.choice(...)`.
# No seed is passed here, so the generated data differs between kernel runs.
rng = numpy.random.default_rng()

def generate_day_of_month(year: int, month: int) -> int:
    """Pick a random valid day number for the given month.

    Args:
        year: Calendar year (matters for February in leap years).
        month: Month number, 1-12.

    Returns:
        An integer between 1 and the number of days in that month, inclusive.
    """
    # monthrange returns (weekday of the 1st, number of days in the month).
    _, days_in_month = calendar.monthrange(year, month)
    return random.randint(1, days_in_month)

def generate_date(year: int, month: int) -> datetime.date:
    """Return a date in the given year and month with a randomly chosen day."""
    random_day = generate_day_of_month(year, month)
    return datetime.date(year=year, month=month, day=random_day)

In [0]:
def event(
    e_id: uuid.UUID | None = None,
    e_type: str = "OTHER",
    e_date: datetime.date | None = None,
    u_id: int | None = None,
    u_location: str | None = None,
    ua_os: str | None = None,
    ua_version: str | None = None,
    tx_location: int | None = None,
    tx_price: float | None = None,
    err_message: str | None = None
) -> Row:
    """Build one event as a ``Row`` whose field names match ``EVENT_SCHEMA``.

    Args:
        e_id: Event identifier; a fresh ``uuid.uuid4()`` is generated when omitted.
            It is stored in the row as a string.
        e_type: Event type label, stored as ``event_type``.
        e_date: Event date; defaults to today's date, resolved at call time.
        u_id: User identifier, if any.
        u_location: User location, if any.
        ua_os: Operating system of the user agent, if any.
        ua_version: Application version of the user agent, if any.
        tx_location: Transaction location, if any.
        tx_price: Transaction price, if any.
        err_message: Error message, if any.

    Returns:
        A ``Row`` with the fields ``event_id``, ``event_type``, ``event_date``,
        ``u_id``, ``u_location``, ``ua_os``, ``ua_version``, ``tx_location``,
        ``tx_price`` and ``err_message``.
    """
    if e_id is None:
        e_id = uuid.uuid4()

    # Resolve "today" per call. A `datetime.date.today()` default in the
    # signature would be evaluated once, when the cell defining this function
    # runs, and would go stale in a long-lived kernel.
    if e_date is None:
        e_date = datetime.date.today()

    return Row(** {
        "event_id": str(e_id),
        "event_type": e_type,
        "event_date": e_date,
        "u_id": u_id,
        "u_location": u_location,
        "ua_os": ua_os,
        "ua_version": ua_version,
        "tx_location": tx_location,
        "tx_price": tx_price,
        "err_message": err_message
    })


# Explicit Spark schema for event rows. The first three columns are required
# (non-nullable); the user, user-agent, transaction and error columns are
# optional because they are only filled for some event types.
EVENT_SCHEMA = (
    StructType()
    .add("event_id", StringType(), nullable=False)
    .add("event_type", StringType(), nullable=False)
    .add("event_date", DateType(), nullable=False)
    .add("u_id", IntegerType(), nullable=True)
    .add("u_location", StringType(), nullable=True)
    .add("ua_os", StringType(), nullable=True)
    .add("ua_version", StringType(), nullable=True)
    .add("tx_location", IntegerType(), nullable=True)
    .add("tx_price", DoubleType(), nullable=True)
    .add("err_message", StringType(), nullable=True)
)

In [0]:
def generate_events_app_installation() -> DataFrame:
    """Generate the synthetic APP_INSTALLATION events for 2020-2023.

    One batch is produced per (year, OS, app version) and the batches are
    combined with ``union_all``. Installation events carry no user id or
    location, only the user-agent OS and version.

    Returns:
        A DataFrame with schema ``EVENT_SCHEMA`` holding all installation events.
    """

    def _installations(year: int, how_many: int, ua_os: str, ua_version: str, month_min: int = 1, month_max: int = 12) -> DataFrame:
        # Each event gets its own random month within [month_min, month_max]
        # and a random day within that month.
        rows = []
        for _ in range(how_many):
            month = random.randint(month_min, month_max)
            rows.append(event(
                e_type="APP_INSTALLATION",
                e_date=generate_date(year, month),
                ua_version=ua_version,
                ua_os=ua_os,
            ))
        return spark.createDataFrame(rows, EVENT_SCHEMA)

    # NOTE(review): the first four terms of most counts match the per-city batch
    # sizes in generate_events_account_created; the trailing term looks like
    # installations without a created account — confirm.
    ios_batches = [
        _installations(2020,  1200 + 1600 + 1800 + 3200 + 41, "iOS", "1.0"),
        _installations(2021,  1600 + 1200 + 1600 + 3600 + 33, "iOS", "1.1"),
        _installations(2022,  1400 + 1400 + 1600 + 3800 + 92, "iOS", "1.2"),
        _installations(2023,  1500 + 1200 + 1300 + 3600 + 467, "iOS", "1.3", month_max=11),
    ]

    android_batches = [
        _installations(2020, 3200 + 2900 + 3600 + 7900 +  84, "Android", "1.0.0"),
        _installations(2021, 3400 + 3200 + 3200 + 6800 +  30, "Android", "1.1.0"),
        _installations(2022, 3200 + 2800 + 2700 + 4900 +  92, "Android", "1.2.0"),
        _installations(2023, 3600 + 3400 + 3200 + 5200 + 120, "Android", "1.3.0", month_max=10),
        _installations(2023,  350 +  300  + 200  + 850 +   0, "Android", "1.3.1", month_min=11, month_max=11),
    ]

    return union_all(ios_batches + android_batches)

In [0]:
def generate_events_account_created() -> DataFrame:
    """Generate the synthetic ACCOUNT_CREATED events for 2020-2023.

    One batch is produced per (OS, city, year / app version) and the batches
    are combined with ``union_all``. Within a batch, user ids are drawn at
    random from ``offset_uid + 1`` to ``offset_uid + max_uid`` inclusive, so
    the same user id can occur more than once.

    Returns:
        A DataFrame with schema ``EVENT_SCHEMA`` holding all account-creation events.
    """
    def _generate(year: int, how_many: int, max_uid: int, offset_uid: int, u_location: str, ua_os: str, ua_version: str, month_min: int = 1, month_max: int = 12) -> DataFrame:
        # Each event gets a random month within [month_min, month_max], a random
        # day within that month, and a random user id within the batch's range.
        return spark.createDataFrame([event(
            e_type = "ACCOUNT_CREATED",
            e_date = generate_date(year, random.randint(month_min, month_max)),
            u_id = random.randint(1, max_uid) + offset_uid,
            u_location = u_location,
            ua_version = ua_version,
            ua_os = ua_os
        ) for _ in range(how_many)], EVENT_SCHEMA)


    return union_all([
        # iOS - Białystok
        _generate(2020,  1200,  10_000,       0, "Białystok", "iOS", "1.0"),
        _generate(2021,  1600,  10_000,  10_000, "Białystok", "iOS", "1.1"),
        _generate(2022,  1400,  10_000,  20_000, "Białystok", "iOS", "1.2"),
        _generate(2023,  1500,  10_000,  30_000, "Białystok", "iOS", "1.3", month_max = 11),

        # iOS - Koszalin
        _generate(2020,  1600,  10_000,  40_000, "Koszalin", "iOS", "1.0"),
        _generate(2021,  1200,  10_000,  50_000, "Koszalin", "iOS", "1.1"),
        _generate(2022,  1400,  10_000,  60_000, "Koszalin", "iOS", "1.2"),
        _generate(2023,  1200,  10_000,  70_000, "Koszalin", "iOS", "1.3", month_max = 11),

        # iOS - Lublin
        _generate(2020,  1800,  10_000,  80_000, "Lublin", "iOS", "1.0"),
        _generate(2021,  1600,  10_000,  90_000, "Lublin", "iOS", "1.1"),
        _generate(2022,  1600,  10_000, 100_000, "Lublin", "iOS", "1.2"),
        _generate(2023,  1300,  10_000, 110_000, "Lublin", "iOS", "1.3", month_max = 11),

        # iOS - Warszawa
        _generate(2020,  3200,  10_000, 120_000, "Warszawa", "iOS", "1.0"),
        _generate(2021,  3600,  10_000, 130_000, "Warszawa", "iOS", "1.1"),
        _generate(2022,  3800,  10_000, 140_000, "Warszawa", "iOS", "1.2"),
        _generate(2023,  3600,  10_000, 150_000, "Warszawa", "iOS", "1.3", month_max = 11),


        # Android - Białystok
        _generate(2020,  3200,  10_000,  160_000, "Białystok", "Android", "1.0.0"),
        _generate(2021,  3400,  10_000,  170_000, "Białystok", "Android", "1.1.0"),
        _generate(2022,  3200,  10_000,  180_000, "Białystok", "Android", "1.2.0"),
        _generate(2023,  3600,  10_000,  190_000, "Białystok", "Android", "1.3.0", month_max=10),
        # NOTE(review): this batch reuses offset 190_000 from the 1.3.0 batch above,
        # so both draw user ids from the same range; every other batch has its
        # own offset — confirm whether that is intended.
        _generate(2023,  350,   10_000,  190_000, "Białystok", "Android", "1.3.1", month_min=11, month_max=11),

        # Android - Koszalin
        _generate(2020,  2900,  10_000,  200_000, "Koszalin", "Android", "1.0.0"),
        _generate(2021,  3200,  10_000,  210_000, "Koszalin", "Android", "1.1.0"),
        _generate(2022,  2800,  10_000,  220_000, "Koszalin", "Android", "1.2.0"),
        _generate(2023,  3400,  10_000,  230_000, "Koszalin", "Android", "1.3.0", month_max=10),
        _generate(2023,  300,   10_000,  240_000, "Koszalin", "Android", "1.3.1", month_min=11, month_max=11),

        # Android - Lublin
        _generate(2020,   3600,  10_000,  250_000, "Lublin", "Android", "1.0.0"),
        _generate(2021,   3200,  10_000,  260_000, "Lublin", "Android", "1.1.0"),
        _generate(2022,   2700,  10_000,  270_000, "Lublin", "Android", "1.2.0"),
        _generate(2023,   3200,  10_000,  280_000, "Lublin", "Android", "1.3.0", month_max=10),
        _generate(2023,   200,   10_000,  290_000, "Lublin", "Android", "1.3.1", month_min=11, month_max=11),

        # Android - Warszawa
        _generate(2020,  7900,  10_000,  300_000, "Warszawa", "Android", "1.0.0"),
        _generate(2021,  6800,  10_000,  310_000, "Warszawa", "Android", "1.1.0"),
        _generate(2022,  4900,  10_000,  320_000, "Warszawa", "Android", "1.2.0"),
        _generate(2023,  5200,  10_000,  330_000, "Warszawa", "Android", "1.3.0", month_max=10),
        # This batch sits in the Warszawa section with the next Warszawa offset;
        # it was previously labelled "Koszalin", which looks like a copy-paste slip.
        _generate(2023,    60,  10_000,  340_000, "Warszawa", "Android", "1.3.1", month_min=11, month_max=11)
    ])

In [0]:
def generate_events_error() -> DataFrame:
    """Generate the synthetic REGISTRATION_ERROR events for 2020-2023.

    One batch is produced per (OS, city, year / app version) and the batches
    are combined with ``union_all``. Most batches pick a random message from a
    generic pool for every event; a few batches pin one specific message to
    create spikes of a single error.

    Returns:
        A DataFrame with schema ``EVENT_SCHEMA`` holding all registration-error events.
    """

    # Pool of messages sampled when a batch does not pin a specific error.
    all_errors = [
        "Other error",

        "Invalid street name: Jana Pawla 2",
        "Invalid street name: Krakwska",
        "Invalid street name: ",
        "Invalid street name: dworcowa",
        "Invalid street name: CH Sienkiewicza",
        "Invalid street name: targowa",
        "Invalid street name: Poznań",

        "Invalid value: NaN",
        "Invalid street name: name can't be shorter than 90 characters",
    ]

    # Messages pinned to specific batches below.
    famous_error_1 = "Invalid street name: ꯕꯦꯅꯥꯒꯥ ꯂꯣꯌꯅꯅꯥ ꯄꯦꯅꯀꯦꯛ꯫"
    famous_error_2 = "Invalid street name: Al. Niepodległości"
    famous_error_3 = "Invalid street name: Aleja Niepodległości"
    famous_error_4 = "Invalid street name: Aleje Niepodleglosci"

    def _generate(year: int, how_many: int, min_uid: int, max_uid: int, u_location: str, ua_os: str, ua_version: str, error: str | None, month_min: int = 1, month_max: int = 12) -> DataFrame:
        # User ids come from the half-open range [min_uid, max_uid).
        # A falsy `error` (None or "") means "pick a random message per event".
        return spark.createDataFrame([event(
            e_type = "REGISTRATION_ERROR",
            e_date = generate_date(year, random.randint(month_min, month_max)),
            u_id = random.randrange(min_uid, max_uid),
            u_location = u_location,
            ua_version = ua_version,
            ua_os = ua_os,
            err_message = (error or str(rng.choice(all_errors, 1)[0]))
        ) for _ in range(how_many)], EVENT_SCHEMA)

    return union_all([
        # iOS - Białystok
        _generate(2020,    5,      0,   10_000, "Białystok", "iOS", "1.0", None),
        _generate(2021,    3,  10_000,  20_000, "Białystok", "iOS", "1.1", None),
        _generate(2022,    8,  20_000,  30_000, "Białystok", "iOS", "1.2", None),
        _generate(2023,   16,  30_000,  40_000, "Białystok", "iOS", "1.3", None, month_max = 11),

        # iOS - Koszalin
        _generate(2020,    5,  40_000,  50_000, "Koszalin", "iOS", "1.0", None),
        _generate(2021,    3,  50_000,  60_000, "Koszalin", "iOS", "1.1", None),
        _generate(2022,    8,  60_000,  70_000, "Koszalin", "iOS", "1.2", None),
        _generate(2023,   16,  70_000,  80_000, "Koszalin", "iOS", "1.3", None, month_max = 11),

        # iOS - Lublin
        _generate(2020,    4,  80_000,  90_000, "Lublin", "iOS", "1.0", None),
        _generate(2021,    8,  90_000, 100_000, "Lublin", "iOS", "1.1", None),
        _generate(2022,    5, 100_000, 110_000, "Lublin", "iOS", "1.2", None),
        _generate(2023,    3, 110_000, 120_000, "Lublin", "iOS", "1.3", None, month_max = 11),

        # iOS - Warszawa
        _generate(2020,    8, 120_000, 130_000, "Warszawa", "iOS", "1.0", None),
        _generate(2021,   20, 130_000, 140_000, "Warszawa", "iOS", "1.1", None),
        _generate(2022,   17, 140_000, 150_000, "Warszawa", "iOS", "1.2", None),
        _generate(2023,   24, 150_000, 160_000, "Warszawa", "iOS", "1.3", None, month_max = 11),


        # Android - Białystok
        _generate(2020,    9, 160_000,  170_000, "Białystok", "Android", "1.0.0", None),
        _generate(2021,   16, 170_000,  180_000, "Białystok", "Android", "1.1.0", None),
        _generate(2022,    0, 180_000,  190_000, "Białystok", "Android", "1.2.0", None),
        _generate(2023,   13, 190_000,  200_000, "Białystok", "Android", "1.3.0", None, month_max = 11),

        # Android - Koszalin
        _generate(2020,   12, 200_000,  210_000, "Koszalin", "Android", "1.0.0", None),
        _generate(2021,   16, 210_000,  220_000, "Koszalin", "Android", "1.1.0", None),
        _generate(2022,    3, 220_000,  230_000, "Koszalin", "Android", "1.2.0", None),
        _generate(2023,   22, 230_000,  240_000, "Koszalin", "Android", "1.3.0", None, month_max = 11),

        # Android - Lublin
        _generate(2020,   9, 240_000,  250_000, "Lublin", "Android", "1.0.0", None),
        _generate(2021,  15, 250_000,  260_000, "Lublin", "Android", "1.1.0", None),
        _generate(2022,   0, 260_000,  270_000, "Lublin", "Android", "1.2.0", None),
        _generate(2023,  10, 270_000,  280_000, "Lublin", "Android", "1.3.0", None, month_max = 11),

        # Android - Warszawa
        _generate(2020,  35, 280_000, 290_000, "Warszawa", "Android", "1.0.0", None),
        _generate(2021,  33, 290_000, 300_000, "Warszawa", "Android", "1.1.0", None),
        # Spike: one message, a narrow user-id range, all in June 2021.
        _generate(2021, 985, 290_324, 290_564, "Warszawa", "Android", "1.1.0", famous_error_1, month_min = 6, month_max = 6),
        # 2022 batches use app version 1.2.x everywhere else in this notebook;
        # this one previously said "1.3.0", which looks like a copy-paste slip.
        _generate(2022,  44, 300_000, 310_000, "Warszawa", "Android", "1.2.0", None),
        _generate(2023,   6, 310_000, 320_000, "Warszawa", "Android", "1.3.0", None,           month_min = 1, month_max =  10),
        # Spike: three spellings of the same street, all on 1.3.1 in November 2023.
        _generate(2023, 430, 320_000, 330_000, "Warszawa", "Android", "1.3.1", famous_error_2, month_min = 11, month_max = 11),
        _generate(2023, 385, 320_000, 330_000, "Warszawa", "Android", "1.3.1", famous_error_3, month_min = 11, month_max = 11),
        _generate(2023, 550, 320_000, 330_000, "Warszawa", "Android", "1.3.1", famous_error_4, month_min = 11, month_max = 11),
        _generate(2023,  26, 330_000, 340_000, "Warszawa", "Android", "1.3.1", None,           month_min = 11, month_max = 11)
    ])

In [0]:
def generate_events_login() -> DataFrame:
    """Generate the synthetic LOGIN events for 2020-2023.

    Regular batches are produced per (year, OS, app version) with random user
    ids; one extra batch repeats a single iOS user many times in November 2023.
    All batches are combined with ``union_all``.

    Returns:
        A DataFrame with schema ``EVENT_SCHEMA`` holding all login events.
    """

    def _logins(year: int, how_many: int, max_uid: int, offset_uid: int, ua_os: str, ua_version: str, month_min: int = 1, month_max: int = 12) -> DataFrame:
        # Per event: random month in [month_min, month_max], random day in that
        # month, then a random user id in offset_uid + 1 .. offset_uid + max_uid.
        rows = []
        for _ in range(how_many):
            month = random.randint(month_min, month_max)
            login_date = generate_date(year, month)
            user_id = random.randint(1, max_uid) + offset_uid
            rows.append(event(
                e_type="LOGIN",
                e_date=login_date,
                u_id=user_id,
                ua_version=ua_version,
                ua_os=ua_os,
            ))
        return spark.createDataFrame(rows, EVENT_SCHEMA)

    def _login_spam(how_many: int) -> DataFrame:
        # A single fixed user logging in repeatedly on random days of November 2023.
        rows = [
            event(
                e_type="LOGIN",
                e_date=generate_date(2023, 11),
                u_id=49_392,
                ua_version="1.3",
                ua_os="iOS",
            )
            for _ in range(how_many)
        ]
        return spark.createDataFrame(rows, EVENT_SCHEMA)

    ios_batches = [
        _logins(2020,  8_000, 1_000, 0, "iOS", "1.0"),
        _logins(2021, 10_000,   850, 0, "iOS", "1.1"),
        _logins(2022, 15_000, 1_050, 0, "iOS", "1.2"),
        _logins(2023, 20_000, 1_230, 0, "iOS", "1.3", month_max=11),
    ]

    android_batches = [
        _logins(2020, 15_000,  3_000, 50_000, "Android", "1.0.0"),
        _logins(2021, 20_000,  7_000, 50_000, "Android", "1.1.0"),
        _logins(2022,  6_000,  9_500, 50_000, "Android", "1.2.0", month_min=1, month_max=2),
        _logins(2022, 30_000,  9_500, 50_000, "Android", "1.2.1", month_min=3, month_max=12),
        _logins(2023, 35_000, 12_000, 50_000, "Android", "1.3.0", month_max=11),
    ]

    spam_batches = [_login_spam(13_300)]

    return union_all(ios_batches + android_batches + spam_batches)

In [0]:
def generate_events_transaction() -> DataFrame:
    """Generate the synthetic TRANSACTION events for 2020-2023.

    One batch is produced per (year, OS, app version) and the batches are
    combined with ``union_all``. Each batch's requested size is multiplied by
    a random factor of 1-3, so the final row count varies between runs.

    Returns:
        A DataFrame with schema ``EVENT_SCHEMA`` holding all transaction events.
    """

    def _transactions(year: int, how_many: int, max_uid: int, offset_uid: int, ua_os: str, ua_version: str, month_min: int = 1, month_max: int = 12) -> DataFrame:
        # The size multiplier is drawn once per batch, before any row is built.
        total = how_many * random.randint(1, 3)

        # "Warszawa" is listed three times, so it is picked three times as often
        # as each of the other cities.
        cities = ["Warszawa", "Warszawa", "Warszawa", "Koszalin", "Białystok", "Lublin"]

        rows = []
        for _ in range(total):
            tx_date = generate_date(year, random.randint(month_min, month_max))
            user_id = random.randint(1, max_uid) + offset_uid
            city = str(rng.choice(cities, 1)[0])
            location_id = random.randint(1, 10_000)

            # Prices are drawn as integers and divided by 100 to get two decimal
            # places; iOS uses a wider range than every other OS.
            if ua_os == "iOS":
                price = random.randint(100, 500_000) / 100.0
            else:
                price = random.randint(500, 250_000) / 100.0

            rows.append(event(
                e_type="TRANSACTION",
                e_date=tx_date,
                u_id=user_id,
                u_location=city,
                ua_version=ua_version,
                ua_os=ua_os,
                tx_location=location_id,
                tx_price=price,
            ))
        return spark.createDataFrame(rows, EVENT_SCHEMA)

    ios_batches = [
        _transactions(2020,  8_000, 1_000, 0, "iOS", "1.0"),
        _transactions(2021, 10_000,   850, 0, "iOS", "1.1"),
        _transactions(2022, 15_000, 1_050, 0, "iOS", "1.2"),
        _transactions(2023, 20_000, 1_230, 0, "iOS", "1.3", month_max=11),
    ]

    android_batches = [
        _transactions(2020, 15_000,  3_000, 50_000, "Android", "1.0.0"),
        _transactions(2021, 20_000,  7_000, 50_000, "Android", "1.1.0"),
        _transactions(2022,    200,  9_500, 50_000, "Android", "1.2.0", month_min=1, month_max=2),
        _transactions(2022, 30_000,  9_500, 50_000, "Android", "1.2.1", month_min=3, month_max=12),
        _transactions(2023, 35_000, 12_000, 50_000, "Android", "1.3.0", month_max=11),
    ]

    return union_all(ios_batches + android_batches)

In [0]:
# Hand-crafted "story" users: each creates an account in November 2023 and then
# hits the same registration error several times a day on the following days.
USERS_WITH_STORIES = [x + 500_000 for x in [103, 105, 312, 478, 579, 610, 777, 810]]

events = []

for user_id in USERS_WITH_STORIES:
    # Number of consecutive days (after account creation) on which errors occur.
    retries = random.randint(2, 6)
    # Each user keeps hitting one and the same error message.
    error = str(rng.choice(["Invalid street name: Al. Niepodległości", "Invalid street name: Aleja Niepodległości", "Invalid street name: Aleje Niepodleglosci"], 1)[0])
    # day <= 20 and retries <= 6, so the latest error date is November 26th and
    # never runs past the end of the month.
    day = random.randint(1, 20)

    events.append(event(
        e_type = "ACCOUNT_CREATED",
        e_date = datetime.date(2023, 11, day),
        u_id = user_id,
        u_location = "Warszawa",
        ua_version = "1.3.1",
        ua_os = "Android"
    ))

    # Use the `retries` value drawn above; it was previously computed but
    # ignored in favour of a second, independent random draw.
    for r in range(retries):
        # 5-8 failed attempts on each day.
        for _ in range(random.randint(5, 8)):
            events.append(event(
                e_type = "REGISTRATION_ERROR",
                e_date = datetime.date(2023, 11, day + r + 1),
                u_id = user_id,
                u_location = "Warszawa",
                ua_version = "1.3.1",
                ua_os = "Android",
                err_message = error
            ))

In [0]:
# Rebuild the `events` table: the first batch replaces any existing table
# ("overwrite"), every later batch is appended to it.
event_generators = [
    generate_events_app_installation,
    generate_events_account_created,
    generate_events_error,
    generate_events_login,
    generate_events_transaction,
]

for position, generate_batch in enumerate(event_generators):
    write_mode = "overwrite" if position == 0 else "append"
    generate_batch().write.mode(write_mode).saveAsTable("events")

# Finally add the hand-crafted story events collected in the `events` list.
story_events = spark.createDataFrame(events, EVENT_SCHEMA)
story_events.write.mode("append").saveAsTable("events")

In [0]:
# Export the complete `events` table as Parquet, replacing any previous export.
events_parquet_path = "/FileStore/orlen/mobile-application-events/events.parquet"
all_events = spark.sql("SELECT * FROM events")
all_events.write.mode("overwrite").parquet(events_parquet_path)