In [1]:
import pyspark
from pyspark.sql import functions as f, Window
import pandas as pd
from kedro.pipeline import *
from kedro.io import *
from kedro.runner import *

import pickle
import os
from pyspark.sql import SparkSession, DataFrame, functions as f
from typing import Dict

In [2]:
def create_spark_session() -> "SparkSession":
    """
    Build (or reuse) the Spark session for a run and return it.

    The session is started in ``local[*]`` mode under the application name
    ``comm-analytics`` and carries the memory, S3A, package, Arrow and
    shuffle/codegen options listed in ``options`` below.

    Returns:
        The session produced by ``SparkSession.builder.getOrCreate()``.
        Callers that ignore the return value keep working: the same session
        can still be obtained afterwards with
        ``SparkSession.builder.getOrCreate()``.
    """
    # NOTE(review): the S3A access/secret keys are empty-string placeholders.
    # Real credentials should be injected from the environment or a secrets
    # store, never committed in the notebook -- TODO confirm how they are
    # supplied for non-local runs.
    options = (
        ("spark.driver.memory", "16g"),
        ("spark.executor.memory", "16g"),
        ("spark.driver.maxResultSize", "8g"),
        ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
        (
            "spark.jars.packages",
            "org.apache.hadoop:hadoop-aws:3.1.1,com.crealytics:spark-excel_2.11:0.11.1",
        ),
        ("fs.s3a.access.key", ""),
        ("fs.s3a.secret.key", ""),
        ("fs.s3a.maxConnections", "5000"),
        ("spark.sql.execution.arrow.enabled", "true"),
        ("spark.debug.maxToStringFields", "100"),
        ("fs.s3a.connection.maximum", "5000"),
        ("spark.sql.shuffle.partitions", "8"),
        ("spark.sql.codegen.wholeStage", "false"),
    )

    builder = SparkSession.builder.master("local[*]").appName("comm-analytics")
    for key, value in options:
        builder = builder.config(key, value)

    # Hand the session back instead of discarding it, so callers can bind it
    # directly rather than issuing a second getOrCreate() call.
    return builder.getOrCreate()

In [3]:
# Start the fully configured session first; order matters, because the
# builder call below carries no configuration of its own.
create_spark_session()
# getOrCreate() returns the session that already exists instead of building
# a new one, so `spark` is the session configured by create_spark_session().
spark = SparkSession.builder.getOrCreate()

In [4]:
from pyspark.sql.types import StringType, DateType, StructField, StructType, IntegerType, DoubleType

In [15]:
# Toy long-format table: three rows per id/type pair, one per "date" label,
# each carrying a cost value. Used below to demonstrate groupBy + pivot.
schema = StructType(
    [
        StructField(column_name, column_type)
        for column_name, column_type in (
            ("id", StringType()),
            ("type", StringType()),
            ("date", StringType()),
            ("cost", DoubleType()),
        )
    ]
)

data = [
    ["1", "A", "AAA", 3.1],
    ["1", "A", "BBB", 2.4],
    ["1", "A", "CCC", 10.34],
    ["2", "B", "AAA", 56.45],
    ["2", "B", "BBB", 33.44],
    ["2", "B", "CCC", 99.23],
    ["3", "C", "AAA", 37.56],
    ["3", "C", "BBB", 86.89],
    ["3", "C", "CCC", 23.89],
]

df_data = spark.createDataFrame(data, schema=schema)

In [16]:
# Print the long-format input rows before reshaping them.
df_data.show()

+---+----+----+-----+
| id|type|date| cost|
+---+----+----+-----+
|  1|   A| AAA|  3.1|
|  1|   A| BBB|  2.4|
|  1|   A| CCC|10.34|
|  2|   B| AAA|56.45|
|  2|   B| BBB|33.44|
|  2|   B| CCC|99.23|
|  3|   C| AAA|37.56|
|  3|   C| BBB|86.89|
|  3|   C| CCC|23.89|
+---+----+----+-----+



In [17]:
# Reshape long -> wide: one output row per (id, type), one column per
# distinct "date" label, each cell holding the average cost for that label.
(
    df_data
    .groupBy("id", "type")
    .pivot("date")
    .avg("cost")
    .show()
)

+---+----+-----+-----+-----+
| id|type|  AAA|  BBB|  CCC|
+---+----+-----+-----+-----+
|  2|   B|56.45|33.44|99.23|
|  1|   A|  3.1|  2.4|10.34|
|  3|   C|37.56|86.89|23.89|
+---+----+-----+-----+-----+



In [18]:
# Second toy table: three rows per region_key, one per rabatt_status label,
# each carrying a vos_pct value.
schema1 = StructType(
    [
        StructField(column_name, column_type)
        for column_name, column_type in (
            ("region_key", StringType()),
            ("rabatt_status", StringType()),
            ("vos_pct", DoubleType()),
        )
    ]
)

data1 = [
    ["A", "AAA", 3.1],
    ["A", "BBB", 2.4],
    ["A", "CCC", 10.34],
    ["B", "AAA", 56.45],
    ["B", "BBB", 33.44],
    ["B", "CCC", 99.23],
    ["C", "AAA", 37.56],
    ["C", "BBB", 86.89],
    ["C", "CCC", 23.89],
]

df_data1 = spark.createDataFrame(data1, schema=schema1)

In [20]:
# Pivot rabatt_status into columns, keeping only the two statuses that are
# reported ("AAA" and "BBB"), then give them their reporting names.
#
# The pivot values are listed explicitly rather than inferred from the data:
#   * both columns always exist (filled with null when a status has no rows),
#     so the renames below cannot silently miss an absent column;
#   * Spark skips the extra pass that collects the distinct pivot values;
#   * the unused "CCC" aggregate is never computed.
# With explicit values the result columns are exactly
# region_key, AAA, BBB (in that order), so no separate select() is needed.
(
    df_data1
    .groupBy("region_key")
    .pivot("rabatt_status", ["AAA", "BBB"])
    .avg("vos_pct")
    .withColumnRenamed("AAA", "national_vimpat_sales")
    .withColumnRenamed("BBB", "discounted_sales")
    .show()
)

+----------+---------------------+----------------+
|region_key|national_vimpat_sales|discounted_sales|
+----------+---------------------+----------------+
|         B|                56.45|           33.44|
|         C|                37.56|           86.89|
|         A|                  3.1|             2.4|
+----------+---------------------+----------------+

