In [1]:
from datetime import date, timedelta
from typing import List

from abc import ABC
from abc import abstractmethod

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame

from create_data import recreate_databases, create_tables

In [2]:
# Get (or create) the Spark session named "DM"; it is passed to every loader and setup helper below.
spark = SparkSession.builder.appName("DM").getOrCreate()

# Create sample data

In [3]:
# Rebuild the sample data. Per the log output of this cell, recreate_databases
# removes the local 'spark-warehouse' directory and creates the databases, and
# create_tables then creates the tables inside them.
recreate_databases(spark)
create_tables(spark)

2021-11-25 12:05:06.603 | INFO     | create_data:recreate_databases:28 - Removing spark warehouse (path = 'spark-warehouse')
2021-11-25 12:05:06.650 | INFO     | create_data:recreate_databases:32 - Creating database 'standardized_glovo_live'
2021-11-25 12:05:10.775 | INFO     | create_data:recreate_databases:32 - Creating database 'mpcustomer_custom_events'
2021-11-25 12:05:10.871 | INFO     | create_data:recreate_databases:32 - Creating database 'mpcustomer_screen_views'
2021-11-25 12:05:10.957 | INFO     | create_data:recreate_databases:32 - Creating database 'enriched_custom_events'
2021-11-25 12:05:11.010 | INFO     | create_data:recreate_databases:32 - Creating database 'enriched_screen_views'
2021-11-25 12:05:11.082 | INFO     | create_data:create_table:63 - Creating table 'standardized_glovo_live.cities'
2021-11-25 12:05:23.506 | INFO     | create_data:create_table:63 - Creating table 'standardized_glovo_live.devices'
2021-11-25 12:05:32.152 | INFO     | create_data:create_table

# Loaders

In [5]:
class Loader(ABC):
    """Base class for objects that read a Spark DataFrame when they are built.

    Attributes:
        spark: The session handed to the constructor.
        sdf: The DataFrame returned by ``load()``.
    """

    def __init__(self, spark: SparkSession):
        """Store the session and immediately load the DataFrame.

        ``load()`` is invoked from here, so a subclass whose ``load()`` needs
        extra instance state must set that state before calling
        ``super().__init__``.

        Args:
            spark: Spark session used by ``load()``.
        """
        self.spark = spark
        loaded = self.load()
        self.sdf = loaded

    @abstractmethod
    def load(self) -> DataFrame:
        """Return the DataFrame this loader exposes as ``sdf``.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

class LoaderLiveDB(Loader):
    """Loader base that only fixes the input database; ``load`` is still left to subclasses."""

    # Database name that subclasses prefix to the table name when reading.
    database_in = "standardized_glovo_live"
    
class LoaderCustomEvent(Loader):
    """Loader base that only fixes the input database; ``load`` is still left to subclasses."""

    # Database name that subclasses prefix to the table name when reading.
    database_in = "mpcustomer_custom_events"
    
class LoaderScreenView(Loader):
    """Loader base that only fixes the input database; ``load`` is still left to subclasses."""

    # Input database name; no class visible in this notebook reads it yet.
    database_in = "mpcustomer_screen_views"

# Writers

In [6]:
class Writer(ABC):
    """Base class for objects that can persist their data via ``write()``."""

    @abstractmethod
    def write(self):
        """Persist the object's data; concrete subclasses decide where and how.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

class WriterCustomEvent(Writer):
    """Writer base that only fixes the output database; ``write`` is still left to subclasses."""

    # Database name that subclasses prefix to the table name when writing.
    database_out = "enriched_custom_events"

class WriterScreenView(Writer):
    """Writer base that only fixes the output database; ``write`` is still left to subclasses."""

    # Output database name; no class visible in this notebook reads it yet.
    database_out = "enriched_screen_views"

# Ports

In [9]:
class CitiesPort(LoaderLiveDB):
    """Read-only view of the ``cities`` table, narrowed to three columns.

    The class attributes hold the column names so that other code can refer
    to them symbolically (for example ``cities.code`` as a join key).
    """

    code = "code"
    time_zone = "time_zone"
    country_code = "country_code"

    def load(self) -> DataFrame:
        """Return ``<database_in>.cities`` restricted to the declared columns."""
        cities = self.spark.table(f"{self.database_in}.cities")
        wanted_columns = [self.code, self.time_zone, self.country_code]
        return cities.select(*wanted_columns)


class DevicesPort(LoaderLiveDB):
    """Read-only view of the ``devices`` table with its two columns renamed.

    The class attributes hold the output column names: ``id`` is exposed as
    ``device_id`` and ``experiment_score`` as ``experiment_score``.
    """

    device_id = "custom_attributes__device_id"
    experiment_score = "device_experiment_score"

    def load(self) -> DataFrame:
        """Return ``<database_in>.devices`` with ``id`` and ``experiment_score`` aliased."""
        devices = self.spark.table(f"{self.database_in}.devices")
        aliased_columns = (
            f"id AS {self.device_id}",
            f"experiment_score AS {self.experiment_score}",
        )
        return devices.selectExpr(*aliased_columns)

In [10]:
class CustomEventPort(LoaderCustomEvent, WriterCustomEvent):
    """Reads a date window of one custom-event table and writes it back out.

    The table is ``<database_in>.<name>`` on the way in and
    ``<database_out>.<name>`` on the way out; concrete subclasses supply
    ``name``.
    """

    # Column the date-window filter in load() is applied to.
    creation_date = "p_creation_date"
    # Column holding the city value; AddTimezone joins on it.
    city = "custom_attributes__city"

    def __init__(self, spark: SparkSession, exec_date: date, n_days: int):
        """Store the window parameters, then load the filtered events.

        Args:
            spark: Spark session used to read the table.
            exec_date: Last date of the window (inclusive).
            n_days: How many days before ``exec_date`` the window starts.

        Raises:
            ValueError: If ``n_days`` is negative. Such a window starts after
                it ends, so the filter would silently match nothing.
        """
        if n_days < 0:
            raise ValueError(f"n_days must be non-negative, got {n_days}")

        # Loader.__init__ calls load() right away and load() reads both
        # attributes, so they must be set before super().__init__ runs.
        self.exec_date = exec_date
        self.n_days = n_days

        super().__init__(spark)

    @property
    @abstractmethod
    def name(self):
        """Table name shared by the input and output databases."""
        raise NotImplementedError

    def load(self) -> DataFrame:
        """Return the events whose creation date falls inside the window.

        Both BETWEEN bounds are inclusive, so the window spans
        ``exec_date - n_days`` through ``exec_date``.
        NOTE(review): that is ``n_days + 1`` calendar dates; confirm this is
        the intended window size.
        """
        start = self.exec_date - timedelta(days=self.n_days)
        end = self.exec_date

        sdf = self.spark.table(f"{self.database_in}.{self.name}")

        return sdf.filter(
            f"{self.creation_date} BETWEEN '{start:%Y-%m-%d}' AND '{end:%Y-%m-%d}'"
        )

    def write(self):
        """Save ``self.sdf`` as the parquet table ``<database_out>.<name>``.

        NOTE(review): no save mode is set, so Spark's default applies, which
        presumably fails when the table already exists. Confirm that is the
        desired behaviour for re-runs.
        """
        self.sdf.write.format("parquet").saveAsTable(f"{self.database_out}.{self.name}")

In [11]:
class OrderCreatedPort(CustomEventPort):
    """Custom-event port bound to the ``order_created`` table."""

    # Overrides the abstract ``name`` property with a plain class attribute;
    # it is used as the table name for both reading and writing.
    name = "order_created"

# Transformations

In [12]:
class Transformation(ABC):
    """One step of a pipeline: takes a table object and returns a table object."""

    @abstractmethod
    def apply(self, sdf: "Table") -> "Table":
        """Apply this step and return the resulting table object.

        The argument is the table wrapper, not a bare DataFrame: the
        implementer in this notebook reads ``.sdf`` and ``.city`` from it, and
        ``TransformLinearly.run`` feeds each result into the next step and
        finally calls ``.write()`` on it. The parameter keeps the name ``sdf``
        only so the signature stays unchanged.

        Args:
            sdf: Table object to transform.

        Returns:
            The transformed table object.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

class Table(ABC):
    """Type used to annotate the objects that transformations and jobs pass around.

    NOTE(review): ``sdf`` is declared here as an abstract method, but
    ``AddTimezone.apply`` reads and assigns ``table.sdf`` as a plain attribute
    and also reads ``table.city``, and ``TransformLinearly.run`` calls
    ``table.write()``; neither ``city`` nor ``write`` is declared here. No
    class visible in this notebook subclasses ``Table``, so confirm whether
    this interface should be aligned with how it is actually used.
    """

    @abstractmethod
    def sdf(self):
        """Placeholder for the table's Spark DataFrame.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

In [13]:
class AddTimezone(Transformation):
    """Enrich a table with the city columns by joining on the city code."""

    def __init__(self, cities: CitiesPort):
        """Keep the cities port whose ``sdf`` is joined in by ``apply``.

        Args:
            cities: Loaded cities port; ``cities.code`` names the join key.
        """
        self.cities = cities

    def apply(self, table: Table) -> Table:
        """Left-join the cities onto ``table.sdf`` and return the same table.

        The join matches ``table.sdf[table.city]`` against the cities code
        column. The cities DataFrame is wrapped in a broadcast hint. The
        cities-side code column is dropped afterwards, leaving the other
        cities columns (``time_zone`` and ``country_code`` for ``CitiesPort``).

        Args:
            table: Object exposing ``sdf`` (DataFrame) and ``city`` (column
                name). Its ``sdf`` attribute is replaced in place.

        Returns:
            The same ``table`` object, now holding the joined DataFrame.
        """
        cities_sdf = self.cities.sdf
        city_code = cities_sdf[self.cities.code]

        # Drop by Column reference rather than by name: dropping by name
        # would also remove a same-named column coming from the table side.
        table.sdf = table.sdf.join(
            F.broadcast(cities_sdf),
            on=table.sdf[table.city] == city_code,
            how="left",
        ).drop(city_code)

        return table

# Jobs

In [14]:
class TransformLinearly:
    """Job that runs a list of transformations in order, then writes the table."""

    def __init__(self, table: Table, transformations: List[Transformation]):
        """Remember the starting table and the ordered transformation steps.

        Args:
            table: Object the steps are applied to; it must also offer
                ``write()``.
            transformations: Steps to apply, first to last.
        """
        self.table, self.transformations = table, transformations

    def run(self):
        """Apply every step in sequence and write the final table.

        ``self.table`` is rebound to each step's return value, so after a run
        it refers to the last step's result.
        """
        for step in self.transformations:
            self.table = step.apply(self.table)

        self.table.write()

## Order Created

In [15]:
# Load order_created events whose p_creation_date lies between 2021-11-16 and
# 2021-11-19 (exec_date minus n_days, through exec_date, both inclusive).
order_created = OrderCreatedPort(spark, exec_date=date(2021, 11, 19), n_days=3)
# Steps run in list order; the only step here joins the cities columns on.
transformations = [
    AddTimezone(CitiesPort(spark))
]

# Apply the steps, then write the result to enriched_custom_events.order_created.
# NOTE(review): write() sets no save mode, so re-running this cell without
# re-running the sample-data cell presumably fails because the table already
# exists. Confirm.
order_created_job = TransformLinearly(
    table=order_created,
    transformations=transformations,
)
order_created_job.run()

In [16]:
# Read the written table back from the output database to check the job's result.
spark.table(f"{order_created.database_out}.{order_created.name}").show()

+-----------------------+---------------+-------------+------------+
|custom_attributes__city|p_creation_date|    time_zone|country_code|
+-----------------------+---------------+-------------+------------+
|                    BCN|     2021-11-19|Europe/Madrid|          ES|
|                    BCN|     2021-11-18|Europe/Madrid|          ES|
|                    CAG|     2021-11-17|  Europe/Rome|          IT|
+-----------------------+---------------+-------------+------------+

