In [1]:
from datetime import date, timedelta

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Loaders

In [2]:
from abc import ABC
from abc import abstractmethod

class Loader(ABC):
    """Base class for objects that read a Spark DataFrame.

    Loading is eager: constructing an instance immediately calls ``load``
    and keeps the result on ``self.sdf``.
    """

    def __init__(self, spark: SparkSession):
        """Keep the session, then load.

        Args:
            spark: Session made available to ``load`` as ``self.spark``.
        """
        # The session has to be stored first because ``load`` may read it.
        self.spark = spark
        loaded = self.load()
        self.sdf = loaded

    @abstractmethod
    def load(self) -> DataFrame:
        """Return the DataFrame for this loader; concrete subclasses must override."""
        raise NotImplementedError()
        
class LoaderLiveDB(Loader):
    """Loader whose source database is ``standardized_glovo_live``."""

    # Database name that subclasses prefix to the table they read.
    database_in = "standardized_glovo_live"
    
class LoaderCustomEvent(Loader):
    """Loader whose source database is ``mpcustomer_custom_events``."""

    # Database name that subclasses prefix to the table they read.
    database_in = "mpcustomer_custom_events"
    
class LoaderScreenView(Loader):
    """Loader whose source database is ``mpcustomer_screen_views``."""

    # Source database name; no class in this notebook reads from it yet.
    database_in = "mpcustomer_screen_views"

# Writers

In [3]:
class Writer(ABC):
    """Base class for objects that can persist themselves."""

    @abstractmethod
    def save(self):
        """Persist the object's data; concrete subclasses must override."""
        raise NotImplementedError()

class WriterCustomEvent(Writer):
    """Writer whose target database is ``enriched_custom_events``."""

    # Database name that subclasses prefix to the table they write.
    database_out = "enriched_custom_events"

class WriterScreenView(Writer):
    """Writer whose target database is ``enriched_screen_views``."""

    # Target database name; no class in this notebook writes to it yet.
    database_out = "enriched_screen_views"

# Ports

In [4]:
class CitiesPort(LoaderLiveDB):
    """Reads the ``cities`` table of ``database_in``, keeping three columns."""

    # Column names, exposed as attributes so callers can reference them
    # without hard-coding strings.
    code = "code"
    time_zone = "time_zone"
    country_code = "country_code"

    def load(self) -> DataFrame:
        """Return ``cities`` projected to ``code``, ``time_zone`` and ``country_code``."""
        wanted = (self.code, self.time_zone, self.country_code)
        return self.spark.table(f"{self.database_in}.cities").select(*wanted)


class DevicesPort(LoaderLiveDB):
    """Reads the ``devices`` table of ``database_in`` with renamed columns."""

    # Output column names: ``id`` and ``experiment_score`` are renamed to these.
    device_id = "custom_attributes__device_id"
    experiment_score = "device_experiment_score"

    def load(self) -> DataFrame:
        """Return ``devices`` with ``id`` and ``experiment_score`` aliased.

        Only the two aliased columns are kept.
        """
        aliased = (
            f"id AS {self.device_id}",
            f"experiment_score AS {self.experiment_score}",
        )
        devices = self.spark.table(f"{self.database_in}.devices")
        return devices.selectExpr(*aliased)

In [5]:
class CustomEvent(LoaderCustomEvent, WriterCustomEvent):
    """Date-windowed event table that is read from one database and written to another.

    Subclasses provide ``name``; it is used as the table name both under
    ``database_in`` (read) and under ``database_out`` (write).
    """

    # Column the date-window filter in ``load`` is applied to.
    creation_date = "p_creation_date"

    def __init__(self, spark: SparkSession, exec_date: date, n_days: int):
        """Store the window parameters, then let the base loader load the data.

        Args:
            spark: Session used to read the source table.
            exec_date: Last date of the window.
            n_days: How many days the window reaches back from ``exec_date``.
        """
        # These must be set before the base initializer runs: it calls
        # ``load``, which reads both attributes.
        self.exec_date = exec_date
        self.n_days = n_days

        super().__init__(spark)

    @property
    @abstractmethod
    def name(self):
        """Table name shared by the source and the target database."""
        raise NotImplementedError

    def load(self) -> DataFrame:
        """Read ``{database_in}.{name}`` restricted to the execution window.

        Returns:
            Rows whose ``creation_date`` column lies between
            ``exec_date - n_days`` and ``exec_date``.
        """
        start = self.exec_date - timedelta(days=self.n_days)
        end = self.exec_date

        sdf = self.spark.table(f"{self.database_in}.{self.name}")

        # SQL BETWEEN includes both bounds, so the start and end dates
        # themselves are part of the window.
        return sdf.filter(
            f"{self.creation_date} BETWEEN '{start:%Y-%m-%d}' AND '{end:%Y-%m-%d}'"
        )

    def save(self):
        """Persist the current ``sdf`` as the parquet table ``{database_out}.{name}``.

        Implements the abstract ``Writer.save`` so concrete events can be
        instantiated.
        """
        # Write the DataFrame held by this object (a SparkSession has no
        # ``write`` attribute).
        self.sdf.write.format("parquet").saveAsTable(f"{self.database_out}.{self.name}")

    def write(self):
        """Alias of ``save``, kept for callers that use the ``write`` name."""
        self.save()

# Transformations

In [6]:
class Transformation(ABC):
    """Contract for a single transformation step."""

    @abstractmethod
    def apply(self, sdf: DataFrame) -> DataFrame:
        """Transform the input and return the result; concrete subclasses must override."""
        raise NotImplementedError()
        
class Table(ABC):
    """Contract for objects that expose an ``sdf``."""

    # NOTE(review): declared as an abstract *method*, while other code in this
    # notebook reads and assigns ``sdf`` as a plain attribute; confirm which
    # form is intended.
    @abstractmethod
    def sdf(self):
        """Provide the underlying DataFrame; concrete subclasses must override."""
        raise NotImplementedError()

In [7]:
class AddTimezone(Transformation):
    """Enriches a table with the city columns loaded by a ``CitiesPort``."""

    def __init__(self, cities: CitiesPort):
        """Keep the cities lookup used by ``apply``.

        Args:
            cities: Loaded cities port; its ``sdf`` is the join's right side.
        """
        self.cities = cities

    def apply(self, table: Table) -> Table:
        """Left-join the cities lookup onto ``table.sdf`` and return ``table``.

        The table is modified in place: its ``sdf`` attribute is replaced by
        the joined DataFrame. The cities ``code`` column is dropped after the
        join, so the remaining columns of the lookup (``time_zone`` and
        ``country_code`` for a ``CitiesPort``) are what gets added.

        Args:
            table: Object exposing ``sdf`` and a ``city`` attribute that names
                the city column of ``sdf`` (``city`` is not declared on
                ``Table``; confirm concrete tables define it).

        Returns:
            The same ``table`` object, with ``sdf`` replaced.
        """
        # Bind the lookup locally: the bare name ``cities`` is not defined in
        # this scope, only ``self.cities`` is.
        cities = self.cities

        table.sdf = table.sdf.join(
            # Broadcast hint for the lookup side of the join.
            F.broadcast(cities.sdf),
            on=table.sdf[table.city] == cities.sdf[cities.code],
            # A left join keeps rows whose city has no match in the lookup.
            how="left",
        ).drop(cities.code)

        return table

# Jobs

In [8]:
class ApplyLinearly(ABC):
    """Job template: run transformations over one table in order, then write it."""

    def __init__(self, table: Table):
        """Keep the table the job operates on.

        Args:
            table: Object passed through every transformation; it must also
                provide ``write()``, which ``run`` calls at the end.
        """
        self.table = table

    @property
    @abstractmethod
    def transformations(self):
        """Iterable of objects exposing ``apply(table)``, in execution order."""
        raise NotImplementedError

    # NOTE(review): ``run`` is abstract yet has a full body, so every concrete
    # job must override it (an override can reuse this body through
    # ``super().run()``). Confirm that this is intended rather than a stray
    # decorator.
    @abstractmethod
    def run(self):
        """Apply each transformation in turn, then write the resulting table."""
        for transformation in self.transformations:
            # Each step receives the output of the previous one.
            self.table = transformation.apply(self.table)

        self.table.write()