# Databricks notebook source

Generates a refined table with processed data for forecast analysis and modeling


## 1. Libraries

In [None]:
import pandas as pd
import numpy as np


## 2. Data Loading

In [None]:
# Location of the raw sales-orders parquet extract.
# NOTE(review): the /dbfs/ prefix is presumably the local mount that pandas needs on Databricks - confirm.
raw_data_path = (
    "/dbfs/mnt/datalake/datascience/raw/sales_forecast/tbt_sales_orders/sales_orders_raw.parquet"
)

# Read the whole extract into an in-memory pandas DataFrame.
df_raw_sales = pd.read_parquet(path=raw_data_path)

In [None]:
# Print column dtypes, non-null counts and memory usage of the freshly loaded frame.
df_raw_sales.info()

In [None]:
# Number of missing values per column, before any cleaning.
df_raw_sales.isna().sum()

In [None]:
# Preview the first rows of the raw data.
df_raw_sales.head()


## 3. Data Cleaning and Transformation

In [None]:
# Discard rows without a GROSS_VALUE (only that column is checked); the
# frame is modified in place.
df_raw_sales.dropna(axis='index', subset=['GROSS_VALUE'], inplace=True)

# Re-count missing values per column to confirm the effect of the drop.
df_raw_sales.isna().sum()

In [None]:
# Remove leading/trailing whitespace from BUSINESS_UNIT so the equality
# comparison used later for NET_VALUE matches reliably.
df_raw_sales['BUSINESS_UNIT'] = df_raw_sales['BUSINESS_UNIT'].str.strip()

# Show the distinct business units that remain after stripping.
df_raw_sales['BUSINESS_UNIT'].unique()

In [None]:
# Cast GROSS_VALUE to floating point so it can be multiplied by the net factors.
gross_as_float = df_raw_sales['GROSS_VALUE'].astype(float)
df_raw_sales['GROSS_VALUE'] = gross_as_float

In [None]:
# Truncate SYSTEM_TIMESTAMP to minute resolution (seconds and sub-seconds dropped).
# Flooring works directly on the datetime values, avoiding the costly
# format-to-string / re-parse round trip; NaT values stay NaT.
# NOTE(review): assumes the column is timezone-naive. For tz-aware data the
# string round trip discarded the timezone, whereas floor() keeps it - confirm.
df_raw_sales['SYSTEM_TIMESTAMP'] = df_raw_sales['SYSTEM_TIMESTAMP'].dt.floor('min')

In [None]:
# Preview the first rows after the cleaning steps above.
df_raw_sales.head()

In [None]:
# PERIOD_DATE is not used by any later step, so remove it in place.
df_raw_sales.drop('PERIOD_DATE', axis='columns', inplace=True)

In [None]:
# Calculating NET_VALUE = GROSS_VALUE * factor, where the factor depends on
# the row's BUSINESS_UNIT.
# Gross-to-net multipliers.
factor_category_a = 0.231  # applied to rows whose BUSINESS_UNIT is 'CATEGORY_A'
factor_brands = 0.826      # applied to every other business unit

# Pick the multiplier row by row, then scale the gross values in one
# vectorised multiplication.
is_category_a = df_raw_sales['BUSINESS_UNIT'] == 'CATEGORY_A'
net_factor = np.where(is_category_a, factor_category_a, factor_brands)
df_raw_sales['NET_VALUE'] = df_raw_sales['GROSS_VALUE'].to_numpy() * net_factor

In [None]:
# Total NET_VALUE per SYSTEM_TIMESTAMP (minute resolution).
# The column is selected *before* aggregating so that only NET_VALUE is
# summed; summing every column first wastes work and, with pandas >= 2.0
# (numeric_only=False by default), concatenates string columns or raises
# TypeError on columns that cannot be summed.
# The result has exactly two columns: SYSTEM_TIMESTAMP and NET_VALUE.
df_raw_sales = (
    df_raw_sales
    .groupby('SYSTEM_TIMESTAMP')[['NET_VALUE']]
    .sum()
    .reset_index()
)

In [None]:
# Checking data: dtypes and non-null counts of the aggregated frame.
df_raw_sales.info()

In [None]:
# Summary statistics of the aggregated frame.
df_raw_sales.describe()

In [None]:
# Preview the last rows of the aggregated frame.
df_raw_sales.tail()

In [None]:
# Preview only: NET_VALUE summed into 30-minute buckets, showing the last
# 500 buckets. df_raw_sales itself is not modified.
# '30min' replaces the '30T' alias, which is deprecated since pandas 2.2.
display(
    df_raw_sales
    .set_index('SYSTEM_TIMESTAMP')
    .resample('30min')
    .sum()
    .reset_index()
    .tail(500)
)


## 4. Saving data to Delta table

In [None]:
# Persist the aggregated frame as a Delta table.
df_spark = spark.createDataFrame(df_raw_sales)

# Write settings.
mode = 'overwrite'  # overwrite or append
overwriteSchema = 'True'  # True or False (passed to the writer as a string option)
table_name = 'analytics.refined_sales_orders_agg'
# NOTE(review): this path starts with /dbfs/, like the pandas read path; Spark
# may resolve it to a different location than the mount - confirm.
# NOTE(review): the location sits under .../raw/ although the table is named
# "refined" - confirm this is intended.
path = '/dbfs/mnt/datalake/datascience/raw/sales_forecast/tbt_sales_orders_processed/'

# The final location is `path` plus a second 'tbt_sales_orders_processed'
# segment, i.e. a directory of the same name nested inside `path`.
delta_writer = df_spark.write.option("overwriteSchema", overwriteSchema)
delta_writer.saveAsTable(
    name=table_name,
    format='delta',
    mode=mode,
    path=f'{path}tbt_sales_orders_processed',
)