<a href="https://colab.research.google.com/github/tyri0n11/distributed-system/blob/main/7_2_regression_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark import SparkContext
sc = SparkContext(master = 'local[10]')

from pyspark.sql import SparkSession
spark = SparkSession.builder \
          .appName("Python Spark SQL basic example") \
          .config("spark.some.config.option", "some-value") \
          .getOrCreate()

# Linear regression without cross-valiation

In [2]:
ad = spark.read.csv('./Advertising.csv', header=True, inferSchema=True)
ad.show(5)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows


In [3]:
type(ad)

## Transform data structure

In [4]:
from pyspark.ml.linalg import Vectors
ad_df = ad.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['features', 'label'])
ad_df.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows


## Build linear regression model

In [5]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol = 'label')

In [6]:
type(lr)

## Fit the model

In [7]:
lr_model = lr.fit(ad_df)

In [8]:
type(lr_model)

In [9]:
lr_model

LinearRegressionModel: uid=LinearRegression_e73692ae1e31, numFeatures=3

## Prediction

In [10]:
pred = lr_model.transform(ad_df)
pred.show(5)

+-----------------+-----+------------------+
|         features|label|        prediction|
+-----------------+-----+------------------+
|[230.1,37.8,69.2]| 22.1| 20.52397440971517|
| [44.5,39.3,45.1]| 10.4|12.337854820894362|
| [17.2,45.9,69.3]|  9.3|12.307670779994238|
|[151.5,41.3,58.5]| 18.5| 17.59782951168913|
|[180.8,10.8,58.4]| 12.9|13.188671856831299|
+-----------------+-----+------------------+
only showing top 5 rows


## Module evaluation

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
evaluator.setMetricName('r2').evaluate(pred)

0.897210638178952

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
evaluator.setMetricName('mse').evaluate(pred)

2.784126314510938

In [13]:
evaluator.setMetricName('mae').evaluate(pred)

1.2520112296870693

### Exercise

1. Do research in Spark documentaion to study & practice in other regression models.
2. Investigate other metrics for evaluating regression model in Spark.
3. Practice on `Advertising` dataset.
4. Do the same thing, but now you should split into `train` and `test` datasets, so the fitting modelling in the `train`, then do the evaluation in the `test`

# Linear regression with cross-validation in Spark

In [14]:
training, test = ad_df.randomSplit([0.8, 0.2], seed=123)

In [15]:
##=====build cross valiation model======

# estimator
lr = LinearRegression(featuresCol = 'features', labelCol = 'label')

# parameter grid
from pyspark.ml.tuning import ParamGridBuilder
param_grid = ParamGridBuilder().\
    addGrid(lr.regParam, [0, 0.5, 1]).\
    addGrid(lr.elasticNetParam, [0, 0.5, 1]).\
    build()

# evaluator
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='r2')

# cross-validation model
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)

In [16]:
type(cv)

In [17]:
cv_model = cv.fit(training)

In [18]:
type(cv_model)

In [19]:
cv_model

CrossValidatorModel_dbfd95ffd5cd

In [20]:
pred_training_cv = cv_model.transform(training)
pred_test_cv = cv_model.transform(test)

In [21]:
# performance on training data
evaluator.setMetricName('r2').evaluate(pred_training_cv)

0.8952845631627804

In [22]:
# performance on test data
evaluator.setMetricName('r2').evaluate(pred_test_cv)

0.9013819610158472

## Intercept and coefficients

In [23]:
print('Intercept: ', cv_model.bestModel.intercept, "\n",
     'coefficients: ', cv_model.bestModel.coefficients)

Intercept:  2.9592600706772787 
 coefficients:  [0.04613729524909818,0.19200356629524312,-0.006269704193266422]


In [24]:
ad_df.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows


## Get parameter values from the best model

In [25]:
print('best regParam: ' + str(cv_model.bestModel._java_obj.getRegParam()) + "\n" +
     'best ElasticNetParam:' + str(cv_model.bestModel._java_obj.getElasticNetParam()))

best regParam: 0.0
best ElasticNetParam:0.0


### Exercise

Wrap up the code of building cross-validation models in a Python class

### Exercise

Do the regression to forecast the `inside_sale` of this data: https://github.com/maks-p/restaurant_sales_forecasting/blob/master/csv/CSV_for_EDA_NEW.csv

In [78]:
df_sales = spark.read.csv('./CSV_for_EDA_NEW.csv', header=True, inferSchema=True)
df_sales.show(5)
df_sales.printSchema()

+----------+------------+-------------+-------------+--------------+---------------+-------------+---------------+--------------+---------------+--------------------+--------+--------------------+-------------------+-----------+-----------+--------+-------------------+-----------+
|      date|inside_sales|outside_sales|inside_covers|outside_covers|reserved_covers|walkin_covers|waitlist_covers|no_show_covers|no_show_parties|apparent_temperature|humidity|precip_intensity_max|    precip_max_time|precip_prob|precip_type|pressure|            summary|temperature|
+----------+------------+-------------+-------------+--------------+---------------+-------------+---------------+--------------+---------------+--------------------+--------+--------------------+-------------------+-----------+-----------+--------+-------------------+-----------+
|2017-01-02|    13159.84|          0.0|          174|             0|            106|           26|             42|            17|              6|         

In [81]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, dayofweek, month, year
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer,
    OneHotEncoder,
    VectorAssembler
)
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator


class InsideSalesGBTRegression:
    """
    Nonlinear regression pipeline (GBT) for inside_sales forecasting
    """

    def __init__(self):
        self.pipeline = None
        self.model = None

    def _feature_engineering(self, df: DataFrame) -> DataFrame:
        return (
            df
            .withColumn("day_of_week", dayofweek(col("date")))
            .withColumn("month", month(col("date")))
            .withColumn("year", year(col("date")))
        )

    def build_pipeline(self) -> Pipeline:

        categorical_cols = ["precip_type", "summary"]

        numeric_cols = [
            # operational
            "outside_sales",
            "inside_covers",
            "outside_covers",
            "reserved_covers",
            "walkin_covers",
            "waitlist_covers",
            "no_show_covers",
            "no_show_parties",

            # weather
            "apparent_temperature",
            "humidity",
            "precip_intensity_max",
            "precip_prob",
            "pressure",
            "temperature",

            # calendar
            "day_of_week",
            "month",
            "year",

        ]

        indexers = [
            StringIndexer(
                inputCol=c,
                outputCol=f"{c}_idx",
                handleInvalid="keep"
            )
            for c in categorical_cols
        ]

        encoders = [
            OneHotEncoder(
                inputCol=f"{c}_idx",
                outputCol=f"{c}_ohe"
            )
            for c in categorical_cols
        ]

        assembler = VectorAssembler(
            inputCols=numeric_cols + [f"{c}_ohe" for c in categorical_cols],
            outputCol="features",
            handleInvalid="keep"
        )

        gbt = GBTRegressor(
            labelCol="inside_sales",
            featuresCol="features",
            maxDepth=6,
            maxIter=150,
            stepSize=0.05,
            subsamplingRate=0.8,
            seed=42
        )

        self.pipeline = Pipeline(
            stages=indexers + encoders + [assembler, gbt]
        )

        return self.pipeline

    def fit(self, df: DataFrame):
        df = self._feature_engineering(df)
        self.pipeline = self.build_pipeline()
        self.model = self.pipeline.fit(df)
        return self

    def predict(self, df: DataFrame) -> DataFrame:
        df = self._feature_engineering(df)
        return self.model.transform(df)

    def evaluate(self, df: DataFrame) -> dict:
        preds = self.predict(df)

        rmse = RegressionEvaluator(
            labelCol="inside_sales",
            predictionCol="prediction",
            metricName="rmse"
        ).evaluate(preds)

        r2 = RegressionEvaluator(
            labelCol="inside_sales",
            predictionCol="prediction",
            metricName="r2"
        ).evaluate(preds)

        return {"RMSE": rmse, "R2": r2}


In [82]:
# Time-based split (mandatory for forecasting)
train_df = df_sales.filter(col("date") < "2018-01-01")
test_df  = df_sales.filter(col("date") >= "2018-01-01")

model = InsideSalesGBTRegression()
model.fit(train_df)

metrics = model.evaluate(test_df)
print(metrics)

preds = model.predict(test_df)
preds.select("date", "inside_sales", "prediction").show(10)


{'RMSE': 2396.8506247214304, 'R2': 0.0025959482984100335}
+----------+------------+------------------+
|      date|inside_sales|        prediction|
+----------+------------+------------------+
|2018-01-01|       381.0|10466.190034976446|
|2018-01-02|    11591.75| 11601.33193561661|
|2018-01-03|    12052.11|14193.950460682361|
|2018-01-04|    12296.98|13829.138665249837|
|2018-01-05|    15831.14| 15697.66069566717|
|2018-01-06|    17706.94| 17822.87842499601|
|2018-01-07|    12013.59| 13303.11879410541|
|2018-01-08|    10950.13|  13794.3483972079|
|2018-01-09|    13713.21|13667.211944422634|
|2018-01-10|    13153.22|13640.368901588176|
+----------+------------+------------------+
only showing top 10 rows


# Generalized regression

In [40]:
cuse = spark.read.csv('./cuse_binary.csv', header=True, inferSchema=True)
cuse.show(5)

+---+---------+---------+---+
|age|education|wantsMore|  y|
+---+---------+---------+---+
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
+---+---------+---------+---+
only showing top 5 rows


In [41]:
cuse.columns[0:3]
# cuse.select('age').distinct().show()
cuse.select('age').rdd.countByValue()
# cuse.select('education').rdd.countByValue()

defaultdict(int,
            {Row(age='<25'): 397,
             Row(age='25-29'): 404,
             Row(age='30-39'): 612,
             Row(age='40-49'): 194})

In [42]:
# string index each categorical string columns
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
indexers = [StringIndexer(inputCol=column, outputCol="indexed_"+column) for column in ('age', 'education', 'wantsMore')]
pipeline = Pipeline(stages=indexers)
indexed_cuse = pipeline.fit(cuse).transform(cuse)
indexed_cuse.select('age', 'indexed_age').distinct().show(5)

+-----+-----------+
|  age|indexed_age|
+-----+-----------+
|30-39|        0.0|
|  <25|        2.0|
|25-29|        1.0|
|40-49|        3.0|
+-----+-----------+



In [43]:
# onehotencode each indexed categorical columns
from pyspark.ml.feature import OneHotEncoder
columns = indexed_cuse.columns[0:3]
onehoteencoders = [OneHotEncoder(inputCol="indexed_"+column, outputCol="onehotencode_"+column) for column in columns]
pipeline = Pipeline(stages=onehoteencoders)
onehotencode_columns = ['onehotencode_age', 'onehotencode_education', 'onehotencode_wantsMore', 'y']
onehotencode_cuse = pipeline.fit(indexed_cuse).transform(indexed_cuse).select(onehotencode_columns)
onehotencode_cuse.distinct().show(5)

+----------------+----------------------+----------------------+---+
|onehotencode_age|onehotencode_education|onehotencode_wantsMore|  y|
+----------------+----------------------+----------------------+---+
|   (3,[1],[1.0])|             (1,[],[])|         (1,[0],[1.0])|  0|
|   (3,[2],[1.0])|         (1,[0],[1.0])|             (1,[],[])|  1|
|   (3,[0],[1.0])|         (1,[0],[1.0])|         (1,[0],[1.0])|  0|
|       (3,[],[])|         (1,[0],[1.0])|         (1,[0],[1.0])|  1|
|   (3,[2],[1.0])|             (1,[],[])|         (1,[0],[1.0])|  0|
+----------------+----------------------+----------------------+---+
only showing top 5 rows


In [44]:
# assemble all feature columns into on single vector column
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['onehotencode_age', 'onehotencode_education', 'onehotencode_wantsMore'], outputCol='features')
cuse_df_2 = assembler.transform(onehotencode_cuse).withColumnRenamed('y', 'label')
cuse_df_2.show(5)

+----------------+----------------------+----------------------+-----+-------------------+
|onehotencode_age|onehotencode_education|onehotencode_wantsMore|label|           features|
+----------------+----------------------+----------------------+-----+-------------------+
|   (3,[2],[1.0])|             (1,[],[])|         (1,[0],[1.0])|    0|(5,[2,4],[1.0,1.0])|
|   (3,[2],[1.0])|             (1,[],[])|         (1,[0],[1.0])|    0|(5,[2,4],[1.0,1.0])|
|   (3,[2],[1.0])|             (1,[],[])|         (1,[0],[1.0])|    0|(5,[2,4],[1.0,1.0])|
|   (3,[2],[1.0])|             (1,[],[])|         (1,[0],[1.0])|    0|(5,[2,4],[1.0,1.0])|
|   (3,[2],[1.0])|             (1,[],[])|         (1,[0],[1.0])|    0|(5,[2,4],[1.0,1.0])|
+----------------+----------------------+----------------------+-----+-------------------+
only showing top 5 rows


In [45]:
test.show(5)

+----------+------------+---------------+-------------+---------------+--------------+---------------+--------------------+--------+--------------------+-------------------+-----------+-----------+--------+-------------------+-----------+-----------+-----+----------+----------+
|      date|inside_sales|reserved_covers|walkin_covers|waitlist_covers|no_show_covers|no_show_parties|apparent_temperature|humidity|precip_intensity_max|    precip_max_time|precip_prob|precip_type|pressure|            summary|temperature|day_of_week|month|is_weekend|   date_ts|
+----------+------------+---------------+-------------+---------------+--------------+---------------+--------------------+--------+--------------------+-------------------+-----------+-----------+--------+-------------------+-----------+-----------+-----+----------+----------+
|2019-01-02|    12685.24|            132|            0|             51|             4|              1|               31.74|    0.61|                 0.0|2026-01-03

In [46]:
# split data into training and test datasets
training, test = cuse_df_2.randomSplit([0.8, 0.2], seed=1234)
training.show(5)

+----------------+----------------------+----------------------+-----+---------+
|onehotencode_age|onehotencode_education|onehotencode_wantsMore|label| features|
+----------------+----------------------+----------------------+-----+---------+
|       (3,[],[])|             (1,[],[])|             (1,[],[])|    0|(5,[],[])|
|       (3,[],[])|             (1,[],[])|             (1,[],[])|    0|(5,[],[])|
|       (3,[],[])|             (1,[],[])|             (1,[],[])|    0|(5,[],[])|
|       (3,[],[])|             (1,[],[])|             (1,[],[])|    0|(5,[],[])|
|       (3,[],[])|             (1,[],[])|             (1,[],[])|    0|(5,[],[])|
+----------------+----------------------+----------------------+-----+---------+
only showing top 5 rows


In [47]:
## ======= build cross validation model ===========

# estimator
from pyspark.ml.regression import GeneralizedLinearRegression
glm = GeneralizedLinearRegression(featuresCol='features', labelCol='label', family='binomial')

# parameter grid
from pyspark.ml.tuning import ParamGridBuilder
param_grid = ParamGridBuilder().\
    addGrid(glm.regParam, [0, 0.5, 1, 2, 4]).\
    build()

# evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')

# build cross-validation model
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=glm, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)

In [48]:
# fit model
# cv_model = cv.fit(training)
cv_model = cv.fit(cuse_df_2)

In [49]:
type(cv_model)

In [50]:
# prediction
pred_training_cv = cv_model.transform(training)
pred_test_cv = cv_model.transform(test)

pred_training_cv.show(5)
pred_test_cv.show(5, truncate=False)

+----------------+----------------------+----------------------+-----+---------+------------------+
|onehotencode_age|onehotencode_education|onehotencode_wantsMore|label| features|        prediction|
+----------------+----------------------+----------------------+-----+---------+------------------+
|       (3,[],[])|             (1,[],[])|             (1,[],[])|    0|(5,[],[])|0.5140024065151407|
|       (3,[],[])|             (1,[],[])|             (1,[],[])|    0|(5,[],[])|0.5140024065151407|
|       (3,[],[])|             (1,[],[])|             (1,[],[])|    0|(5,[],[])|0.5140024065151407|
|       (3,[],[])|             (1,[],[])|             (1,[],[])|    0|(5,[],[])|0.5140024065151407|
|       (3,[],[])|             (1,[],[])|             (1,[],[])|    0|(5,[],[])|0.5140024065151407|
+----------------+----------------------+----------------------+-----+---------+------------------+
only showing top 5 rows
+----------------+----------------------+----------------------+-----+------

In [51]:
cv_model.bestModel.coefficients

DenseVector([-0.2806, -0.7999, -1.1892, 0.325, -0.833])

In [52]:
cv_model.bestModel.intercept

0.05602427516928616

In [53]:
evaluator.evaluate(pred_training_cv)

0.6716478245974649

In [54]:
evaluator.evaluate(pred_test_cv)

0.6830864197530864

### Exercise

1. Do the generalized regression to forecast the `inside_sale` of this data: https://github.com/maks-p/restaurant_sales_forecasting/blob/master/csv/CSV_for_EDA_NEW.csv

2. Wrap your code in a pipeline as a Python class

In [69]:
df = spark.read.csv('./CSV_for_EDA_NEW.csv', header=True, inferSchema=True)
df.printSchema()
df.show(5)

root
 |-- date: date (nullable = true)
 |-- inside_sales: double (nullable = true)
 |-- outside_sales: double (nullable = true)
 |-- inside_covers: integer (nullable = true)
 |-- outside_covers: integer (nullable = true)
 |-- reserved_covers: integer (nullable = true)
 |-- walkin_covers: integer (nullable = true)
 |-- waitlist_covers: integer (nullable = true)
 |-- no_show_covers: integer (nullable = true)
 |-- no_show_parties: integer (nullable = true)
 |-- apparent_temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- precip_intensity_max: double (nullable = true)
 |-- precip_max_time: timestamp (nullable = true)
 |-- precip_prob: double (nullable = true)
 |-- precip_type: string (nullable = true)
 |-- pressure: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- temperature: double (nullable = true)

+----------+------------+-------------+-------------+--------------+---------------+-------------+---------------+--------------+----------

In [70]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag

w = Window.orderBy("date")

df = (
    df
    .withColumn("inside_sales_lag_1", lag("inside_sales", 1).over(w))
    .withColumn("inside_sales_lag_7", lag("inside_sales", 7).over(w))
    .withColumn("inside_sales_lag_14", lag("inside_sales", 14).over(w))
)

df = df.dropna(subset=[
    "inside_sales_lag_1",
    "inside_sales_lag_7",
    "inside_sales_lag_14"
])


In [71]:
from pyspark.sql.functions import avg
from pyspark.sql.window import Window

w = Window.orderBy("date")

df = (
    df
    .withColumn("sales_ma_7", avg("inside_sales").over(w.rowsBetween(-7, -1)))
    .withColumn("sales_ma_14", avg("inside_sales").over(w.rowsBetween(-14, -1)))
    .withColumn("sales_ma_28", avg("inside_sales").over(w.rowsBetween(-28, -1)))
)

df = df.dropna(subset=["sales_ma_28"])


In [74]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, dayofweek, month, year
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer,
    OneHotEncoder,
    VectorAssembler,
    StandardScaler
)
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


class InsideSalesGLMPipeline:
    """
    Generalized Linear Regression pipeline for forecasting inside_sales.
    """

    def __init__(self):
        self.model = None
        self.pipeline = None

    def _feature_engineering(self, df: DataFrame) -> DataFrame:
        """
        Create time-based features from date.
        """
        return (
            df
            .withColumn("day_of_week", dayofweek(col("date")))
            .withColumn("month", month(col("date")))
            .withColumn("year", year(col("date")))
        )

    def build_pipeline(self) -> Pipeline:
        """
        Define Spark ML Pipeline.
        """

        categorical_cols = ["precip_type", "summary"]
        numeric_cols = [
            "outside_sales",
            "inside_covers",
            "outside_covers",
            "reserved_covers",
            "walkin_covers",
            "waitlist_covers",
            "no_show_covers",
            "no_show_parties",
            "apparent_temperature",
            "humidity",
            "precip_intensity_max",
            "precip_prob",
            "pressure",
            "temperature",
            "day_of_week",
            "month",
            "year",
            "inside_sales_lag_1",
            "inside_sales_lag_7",
            "inside_sales_lag_14",
            "sales_ma_7",
            "sales_ma_14",
            "sales_ma_28"
        ]

        indexers = [
            StringIndexer(
                inputCol=c,
                outputCol=f"{c}_idx",
                handleInvalid="keep"
            )
            for c in categorical_cols
        ]

        encoders = [
            OneHotEncoder(
                inputCol=f"{c}_idx",
                outputCol=f"{c}_ohe"
            )
            for c in categorical_cols
        ]

        assembler = VectorAssembler(
            inputCols=numeric_cols + [f"{c}_ohe" for c in categorical_cols],
            outputCol="raw_features",
            handleInvalid="keep"
        )

        scaler = StandardScaler(
            inputCol="raw_features",
            outputCol="features",
            withMean=True,
            withStd=True
        )

        glm = GeneralizedLinearRegression(
            featuresCol="features",
            labelCol="inside_sales",
            family="gaussian",
            link="identity",
            maxIter=100,
            regParam=0.0
        )

        self.pipeline = Pipeline(stages=indexers + encoders + [
            assembler,
            scaler,
            glm
        ])

        return self.pipeline

    def fit(self, df: DataFrame):
        """
        Train GLM model.
        """
        df = self._feature_engineering(df)
        pipeline = self.build_pipeline()
        self.model = pipeline.fit(df)
        return self

    def predict(self, df: DataFrame) -> DataFrame:
        """
        Generate predictions.
        """
        if not self.model:
            raise ValueError("Model has not been trained")

        df = self._feature_engineering(df)
        return self.model.transform(df)

    def evaluate(self, df: DataFrame) -> dict:
        """
        Evaluate model performance.
        """
        preds = self.predict(df)

        evaluator_rmse = RegressionEvaluator(
            labelCol="inside_sales",
            predictionCol="prediction",
            metricName="rmse"
        )

        evaluator_r2 = RegressionEvaluator(
            labelCol="inside_sales",
            predictionCol="prediction",
            metricName="r2"
        )

        return {
            "RMSE": evaluator_rmse.evaluate(preds),
            "R2": evaluator_r2.evaluate(preds)
        }


In [75]:
# Train / test split
train_df = df.filter(col("date") < "2018-01-01")
test_df  = df.filter(col("date") >= "2018-01-01")

# Train model
glm_pipeline = InsideSalesGLMPipeline()
glm_pipeline.fit(train_df)

# Evaluate
metrics = glm_pipeline.evaluate(test_df)
print(metrics)

# Predict
predictions = glm_pipeline.predict(test_df)
predictions.select("date", "inside_sales", "prediction").show(10)


{'RMSE': 2131.825719290717, 'R2': 0.21097169153599182}
+----------+------------+------------------+
|      date|inside_sales|        prediction|
+----------+------------+------------------+
|2018-01-01|       381.0|6165.9187488975385|
|2018-01-02|    11591.75|11927.472049284357|
|2018-01-03|    12052.11|13228.789365408986|
|2018-01-04|    12296.98|13139.538909152223|
|2018-01-05|    15831.14|15273.228230431521|
|2018-01-06|    17706.94|16612.414951696825|
|2018-01-07|    12013.59|14324.813374743033|
|2018-01-08|    10950.13|13210.739667998238|
|2018-01-09|    13713.21|13488.544072966377|
|2018-01-10|    13153.22|14691.720213903463|
+----------+------------+------------------+
only showing top 10 rows
