# Libraries and Authorisations

## Libraries

In [225]:
import pandas as pd
import numpy as np

# Extract from sources and Prepare data

# 📋 Import `packaging_weight` data from Google Sheet

> The original pipeline extracted this data from a Google Sheet, allowing the stakeholders to interactively update the source of data, otherwise missing from any certified table in the Data Warehouse.

In [226]:
# ---- Load demand data in a dataframe # packaging_weight_pd
file_path = '/Users/fil/Documents/my_projects/packaging_licencing_fee/datasets/packaging_weight_pd_2024_11_20.csv'
packaging_weight_pd = pd.read_csv(file_path) # Execute the query and load the result into a pandas DataFrame

## Analyze dataframe

In [227]:
#describe the dataframe
packaging_weight_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4972 entries, 0 to 4971
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sku                4972 non-null   object 
 1   has_packaging      4971 non-null   object 
 2   category           4504 non-null   object 
 3   unit_weight_grams  4451 non-null   float64
 4   packaging_type     3701 non-null   object 
dtypes: float64(1), object(4)
memory usage: 194.3+ KB


In [228]:
# show the head
display(packaging_weight_pd.head())

Unnamed: 0,sku,has_packaging,category,unit_weight_grams,packaging_type
0,alias-49384,yes,paper,0.6,label
1,alias-49384,yes,plastic,26.0,thermoform
2,alias-79751,yes,paper,0.6,label
3,alias-79751,yes,paper,0.6,label
4,alias-79751,yes,plastic,8.0,flowpack


## 🧹 Clean data

The following operations should happen to consider the dataset clean:
1. We want the `has_packaging` field to be **boolean** type. Also, `NULL` should be converted to **False** values.
2. Coherent behaviour: if `has_packaging` is **False**, then the unit_weight_grams should be zero, '0' (because it means that if an article has no packaging, then its pachaging weight is zero grams).
3. We need categories, such as `paper`, `plastic` or `glass` to be mapped and cleaned
4. Finally we want to delete duplicate entries according to a key

In [229]:
# create a new DataFrame with the converted types
packaging_weight_pd_conv = packaging_weight_pd

### 1. Field `has_packaging` boolean conversion

In [230]:
# Use a mapping to convert text strings to boolean-like values
packaging_weight_pd_conv['has_packaging'] = packaging_weight_pd_conv['has_packaging'].map({
    'yes': True,
    'no': False
})

# Convert the object type to boolean for easier handling
packaging_weight_pd_conv['has_packaging'] = packaging_weight_pd_conv['has_packaging'].astype(bool)

# Fill missing values with False if applicable:
packaging_weight_pd_conv['has_packaging'] = packaging_weight_pd_conv['has_packaging'].fillna(False)

# check types
print(packaging_weight_pd_conv['has_packaging'].dtypes)

print(packaging_weight_pd_conv['has_packaging'].value_counts(normalize = True))

bool
has_packaging
True     0.860619
False    0.139381
Name: proportion, dtype: float64


### 2. Create a new `unit_weight_grams` with zeroes (0) for all the `has_packaging` False lines

In [251]:
# Create a new column 'unit_weight_grams_clean' with the logic: if the has_packaging is False, then the weight is 0
packaging_weight_pd_conv['unit_weight_grams_clean'] = packaging_weight_pd_conv.apply(
    lambda row: 0 if not row['has_packaging'] else row['unit_weight_grams'],
    axis=1
)

### 3. Clean Packaging Categories

In [252]:
# create a new category column that contains only the values (paper, plastic, mixed, tetrapak, metal, glass) else "unmapped"
# List of allowed categories
valid_categories = ['paper', 'plastic', 'mixed', 'tetrapak', 'metal', 'glass']

# Create a new column 'category_clean' with the specified logic
packaging_weight_pd['category_clean'] = packaging_weight_pd['category'].apply(
    lambda x: x if x in valid_categories else 'unmapped'
)

### 4. Drop duplicate keys: `sku`-`has_packaging`-`category_clean`

In [253]:
# Remove duplicates based on the combination of 'sku', 'has_packaging', and 'category'
# Keep the first occurrence of each combination
packaging_weight_unique = packaging_weight_pd_conv.drop_duplicates(subset=['sku', 'has_packaging', 'category_clean'], keep='first')

# Count the dropped lines
print("Dropped lines: " + str(len(packaging_weight_pd_conv) - len(packaging_weight_unique)))
print("Kept lines: " + str(len(packaging_weight_unique)))

Dropped lines: 932
Kept lines: 4040


### -> Create the final dataframe

In [254]:
# define columns of final dataframe
final_packaging_weight = packaging_weight_unique[['sku', 'has_packaging', 'category_clean', 'unit_weight_grams_clean']]
display(final_packaging_weight.head(5))

Unnamed: 0,sku,has_packaging,category_clean,unit_weight_grams_clean
0,alias-49384,True,paper,0.6
1,alias-49384,True,plastic,26.0
2,alias-79751,True,paper,0.6
4,alias-79751,True,plastic,8.0
6,alias-69325,True,paper,0.6


# 📤 Calculate the Bill of Materials for `delivered_materials`

## Get data from PDL source via SQL

> The original pipeline was extracting data from the live tables in the DWH. The SQL query for this extraction has been published [here](https://github.com/werderame/werderame.github.io/blob/main/portfolio-projects/packaging_licencing_fee/notebooks/delivered_skus_df.sql) for reference, and cleaned of all sensitive information.

In [255]:
# ---- Load demand data in a dataframe # delivered_skus_df
file_path = '/Users/fil/Documents/my_projects/packaging_licencing_fee/datasets/delivered_skus_df_2024_11_20.csv'
delivered_skus_df = pd.read_csv(file_path) # Execute the query and load the result into a pandas DataFrame

In [256]:
delivered_skus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64328 entries, 0 to 64327
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   dc                            64328 non-null  object
 1   hf_month                      64328 non-null  object
 2   hf_week                       64328 non-null  object
 3   delivery_date                 64328 non-null  object
 4   source                        64328 non-null  object
 5   destination                   64328 non-null  object
 6   sku                           64328 non-null  object
 7   sku_category                  64328 non-null  object
 8   randomized_delivery_quantity  64328 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 4.4+ MB


## 🧹 Clean and Prepare Data

The following data needs preparation:
1. all **object** columns should replace empty values with **NULL**
2. By knowing the HelloFresh operation, during discovery phase we noticed phase that the original source `delivered_skus_df` is lacking critical information for the final product, in particular: it is missing rows pertaining some of the materials being delivered to the customers. This is a shortcoming of the source which we want to address in the current section of the pipelind. The following items are missing from the source: <ins>Labels</ins> and <ins>Box Lids</ins> and represent secondary and thertiary packaging materials, respectively. We therefore:
   - Add <ins>labels</ins> (also called meal-kit, or MK Labels) using the logic: for every row of 'mk_bag_data' add 1 meal-kit label
   - Add <ins>Box Lids</ins> using the logic: each box has 1 lid. Notice that differently sized boxes have the same lid shape, here.

### 1. Fields conversions

In [257]:
# Replace all empty string values with strings in the dataframe
string_columns = delivered_skus_df.select_dtypes(include=['object', 'string']).columns # Identify all string columns in the DataFrame
delivered_skus_df[string_columns] = delivered_skus_df[string_columns].replace("", np.nan) # Replace empty strings with null

# Convert to date_time the `delivery_date`
delivered_skus_df['delivery_date'] = pd.to_datetime(delivered_skus_df['delivery_date'])

### 2. Add missing lines

#### 2.1 Create MK lable lines

In [258]:
# List of SKUs that require additional lines
mk_label_target_lines = ['mk_bag_data']

# Filter rows with target SKUs
filtered_mk_label_df = delivered_skus_df[delivered_skus_df['source'].isin(mk_label_target_lines)]

# Create the new rows by modifying the filtered rows
mk_label_new_lines = filtered_mk_label_df.assign(
    sku='alias-33207',
    source='later_manipulation'
)

# Filter the DataFrame for the specific SKU
filtered_df = mk_label_new_lines[mk_label_new_lines['sku'] == 'alias-33207']

# Sum the delivery_quantity column, handling nulls automatically
total_quantity = filtered_df['randomized_delivery_quantity'].sum()

# Display the total sum
print(f"Total delivery_quantity for alias-33207: {total_quantity}")

Total delivery_quantity for alias-33207: 5430978


#### 2.2 Create BOX lid lines

In [259]:
# List of SKUs that require additional lines
box_lid_target_lines = ['alias-24862', 'alias-40634', 'alias-38167', 'alias-39686']

# Filter rows with target SKUs
filtered_box_lid_df = delivered_skus_df[delivered_skus_df['sku'].isin(box_lid_target_lines)]

# Create the new rows by modifying the filtered rows
box_lid_new_lines = filtered_box_lid_df.assign(
    sku='alias-98912',
    source='later_manipulation'
)

# Filter the DataFrame for the specific SKU
filtered_df_2 = box_lid_new_lines[box_lid_new_lines['sku'] == 'alias-98912']

# Sum the delivery_quantity column, handling nulls automatically
total_quantity_2 = filtered_df_2['randomized_delivery_quantity'].sum()

# Display the total sum
print(f"Total delivery_quantity for alias-98912: {total_quantity_2}")

Total delivery_quantity for alias-98912: 901250


#### -> Append created lines

In [260]:
# Combine the original DataFrame with the new rows from mk_label_new_lines
manipulated_step1_delivered_skus_df = pd.concat([delivered_skus_df, mk_label_new_lines], ignore_index=True)

# Combine the result with the new rows from box_lid_new_lines
manipulated_step2_delivered_skus_df = pd.concat([manipulated_step1_delivered_skus_df, box_lid_new_lines], ignore_index=True)

In [261]:
# Display schema-like information for the DataFrame
print(manipulated_step2_delivered_skus_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64982 entries, 0 to 64981
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   dc                            64982 non-null  object        
 1   hf_month                      64982 non-null  object        
 2   hf_week                       64982 non-null  object        
 3   delivery_date                 64982 non-null  datetime64[ns]
 4   source                        64982 non-null  object        
 5   destination                   64982 non-null  object        
 6   sku                           64982 non-null  object        
 7   sku_category                  64982 non-null  object        
 8   randomized_delivery_quantity  64982 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 4.5+ MB
None


# 📤 Calculate the Bill of Materials for `disposals`

> The original pipeline was extracting data from the live tables in the DWH. The SQL query for this extraction has been published [here](https://github.com/werderame/werderame.github.io/blob/main/portfolio-projects/packaging_licencing_fee/notebooks/disposed_skus_df.sql) for reference, and cleaned of all sensitive information.

## Get data from DWH source via SQL

In [262]:
# ---- Load demand data in a dataframe # delivered_skus_df
file_path = '/Users/fil/Documents/my_projects/packaging_licencing_fee/datasets/disposed_skus_df_2024_11_20.csv'
disposed_skus_df = pd.read_csv(file_path) # Execute the query and load the result into a pandas DataFrame

In [263]:
disposed_skus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5845 entries, 0 to 5844
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   dc                            5845 non-null   object
 1   hf_month                      5845 non-null   object
 2   hf_week                       5845 non-null   object
 3   delivery_date                 5845 non-null   object
 4   source                        5845 non-null   object
 5   destination                   5845 non-null   object
 6   sku                           5845 non-null   object
 7   sku_category                  5845 non-null   object
 8   randomized_delivery_quantity  5845 non-null   int64 
dtypes: int64(1), object(8)
memory usage: 411.1+ KB


> Apart from the data type of the `delivery_date` this dataset is clean.

## 🧹 Clean and Prepare Data

In [264]:
disposed_skus_df['delivery_date'] = pd.to_datetime(disposed_skus_df['delivery_date'])
print(disposed_skus_df['delivery_date'].dtype)

datetime64[ns]


# 🔀 Final Data Preparation Steps: Merge Dataframes

## Merge the `Delivered` and `Disposed` Materials = `total_deliveries`

In [265]:
# Combine the DataFrames using pandas concat
total_delivered_skus_df = pd.concat([disposed_skus_df, manipulated_step2_delivered_skus_df], ignore_index=True)

## Merge the `total_deliveries` and `weights` dataframes

In [266]:
# Perform a left join on the two DataFrames using the 'sku' column as the key
delivered_weight_pd = total_delivered_skus_df.merge(final_packaging_weight, on='sku', how='left')

# Add the calculated field of `delivered_weight` as the multiplication of quantities and weight per unit
delivered_weight_pd['delivered_weight_grams'] = delivered_weight_pd['unit_weight_grams_clean'] * delivered_weight_pd['randomized_delivery_quantity']

In [267]:
display(delivered_weight_pd.head())

Unnamed: 0,dc,hf_month,hf_week,delivery_date,source,destination,sku,sku_category,randomized_delivery_quantity,has_packaging,category_clean,unit_weight_grams_clean,delivered_weight_grams
0,FI,2024-08,2024-W33,2024-08-10,overkitting_waste,donation,alias-35165,C_05,408,True,plastic,1.35,550.8
1,FI,2024-08,2024-W33,2024-08-10,overkitting_waste,donation,alias-92488,C_08,62,True,mixed,1.2,74.4
2,FI,2024-08,2024-W34,2024-08-17,overkitting_waste,donation,alias-11082,C_06,218,True,plastic,1.0,218.0
3,PI,2024-08,2024-W34,2024-08-17,overkitting_waste,donation,alias-56415,C_08,17,True,paper,1.386,23.562
4,FI,2024-08,2024-W34,2024-08-17,overkitting_waste,donation,alias-62890,C_07,26,True,paper,8.0,208.0


# ☁️ Load the resulting `delivered_weight` to the Data Warehouse

## Push to Databricks

> Finally, in the original pipeline we would push the final dataframe to the Data Warehouse, allowing stakeholders to query the data, analyze it and pull it into Tableau. Here, we are simply leaving the code as reference and showing the final result

In [268]:
# delivered_weight_spark_df.write.mode("overwrite").saveAsTable("<schema_name>.dach_delivered_packaging_weight")

## Push missing skus to file

> in the original pipeline we were pushing the 'missing skus' dataframe to the Google sheet containing the packaging weight, allowing the stakeholders to interactively update the packaging weight list. Such list, which we pull at the start of this pipeline, would thus be captured with the next run of the job and would complete the calculation of delivered packaging weight.

## Calculate the list of SKUs missing from the packaging_weight Google table

In [269]:
# create a df with the missing information
missing_weight_skus = delivered_weight_pd[delivered_weight_pd['delivered_weight_grams'].isnull()]['sku']
missing_weight_skus = missing_weight_skus.drop_duplicates()
missing_weight_skus = missing_weight_skus.sort_values()
print("missing skus: " + str(len(missing_weight_skus)))

missing skus: 42


In [270]:
display(missing_weight_skus.head(10))

25000    alias-11889
4767     alias-14593
15125    alias-16817
11540    alias-18215
11631    alias-24827
12982    alias-26040
6366     alias-33764
11542    alias-35256
22339    alias-39060
15113    alias-39530
Name: sku, dtype: object

end of the pipeline