# Data cleaning

## Libraries

In [1]:
pip install fastexcel

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import polars as pl
import re


  from pandas.core import (


## Global variables

In [3]:
# Input Excel workbooks, one per source dataset.
# NOTE(review): paths are relative — presumably resolved against the notebook's
# working directory; confirm before running from another location.
PATH_DENSITY_REPORT       = 'DensityReports.xlsx'
PATH_HISTORICAL_INCIDENTS = 'HistoricalIncidents.xlsx'
PATH_PRODUCT_ATTRIBUTES   = 'ProductAttributes.xlsx'
PATH_SUPPLIER_SCORECARD   = 'SupplierScorecard.xlsx'

# Output CSV files, one per dataset, written by the export cell at the end of
# the notebook.
EXPORT_DENSITY_REPORT       = 'density_report.csv'
EXPORT_HISTORICAL_INCIDENTS = 'historical_incidents.csv'
EXPORT_PRODUCT_ATTRIBUTES   = 'product_attributes.csv'
EXPORT_SUPPLIER_SCORECARD   = 'supplier_scorecard.csv'

## Global functions

In [5]:
def polars_read_excel(file_name, sheet_name='Sheet1'):
  """Read a single sheet of an Excel workbook with Polars.

  Args:
      file_name: Path of the workbook, forwarded to ``pl.read_excel`` as ``source``.
      sheet_name: Name of the sheet to load; defaults to ``'Sheet1'``.

  Returns:
      Whatever ``pl.read_excel`` returns for the requested sheet.
  """
  loaded_sheet = pl.read_excel(source=file_name, sheet_name=sheet_name)
  return loaded_sheet

In [6]:
# Load each source workbook (default sheet) into its own DataFrame.
(
    df_density_report,
    df_historical_incidents,
    df_product_attributes,
    df_supplier_scorecard,
) = (
    polars_read_excel(workbook_path)
    for workbook_path in (
        PATH_DENSITY_REPORT,
        PATH_HISTORICAL_INCIDENTS,
        PATH_PRODUCT_ATTRIBUTES,
        PATH_SUPPLIER_SCORECARD,
    )
)

## Execution

## 1. Density Report

### `Product Reference` issues

In [7]:
def clean_product_reference(series: pl.Expr) -> pl.Expr:
    """
    Build an expression that cleans and validates a ProductReference column.

    The value is cast to a string (non-strict, so values that cannot be cast
    become null), every trailing 'X' character is stripped, and the result is
    kept only if it matches 'PRD' followed by exactly five digits.

    Note that ``strip_chars_end('X')`` removes *all* trailing 'X' characters,
    not just one, so 'PRD12345XX' is cleaned to 'PRD12345'.

    Args:
        series: Column expression to clean, e.g. ``pl.col("ProductReference")``.

    Returns:
        A Polars expression yielding the cleaned string when valid, otherwise
        null. The result is an expression (built with ``pl.when``), so it must
        be evaluated inside a context such as ``DataFrame.with_columns``.
    """
    series_str = series.cast(pl.Utf8, strict=False)
    cleaned_series = series_str.str.strip_chars_end('X')
    # Anchored on both ends: anything other than exactly PRD + 5 digits is rejected.
    valid_pattern = r"^PRD\d{5}$"
    return (
        pl.when(cleaned_series.str.contains(valid_pattern))
        .then(cleaned_series)
        .otherwise(None)
    )

In [8]:
# The same cleaning expression is applied to both datasets that carry a
# ProductReference column.
product_reference_clean = clean_product_reference(
    pl.col("ProductReference")
).alias("ProductReference")

# Density report: clean, then report how many references were invalidated.
df_density_report = df_density_report.with_columns(product_reference_clean)
null_count_dr = df_density_report['ProductReference'].is_null().sum()
print(f"Null count in df_density_report['ProductReference']: {null_count_dr}")

# Historical incidents: same treatment.
df_historical_incidents = df_historical_incidents.with_columns(product_reference_clean)
null_count_hi = df_historical_incidents['ProductReference'].is_null().sum()
print(f"Null count in df_historical_incidents['ProductReference']: {null_count_hi}")

Null count in df_density_report['ProductReference']: 7652
Null count in df_historical_incidents['ProductReference']: 301


### Naming consistency

We will check whether there are spelling or capitalization inconsistencies in columns like `SupplierName`, `GarmentType`, `Material`, `ProposedFoldingMethod`, `ProposedLayout` and `PackagingQuality`, so that the same category is not recorded under several different names.

In [9]:

# Categorical columns whose distinct values are inspected for typos and
# casing variants.
columns_to_check = [
    'SupplierName',
    'GarmentType',
    'Material',
    'ProposedFoldingMethod',
    'ProposedLayout',
    'PackagingQuality',
]

print("Checking Unique Values for Potential Inconsistencies")

for col_name in columns_to_check:
    # Guard clause: report missing columns and move on.
    if col_name not in df_density_report.columns:
        print(f"Column not found: {col_name}")
        continue

    try:
        unique_values = df_density_report[col_name].unique().sort()

        print(f"Unique values in: {col_name}")
        print(unique_values.to_list())
        print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")
    except Exception as err:
        # Best-effort inspection: one failing column must not stop the others.
        print(f" Could not process column: {col_name} ")
        print(f"Error: {err}")



Checking Unique Values for Potential Inconsistencies
Unique values in: SupplierName
['SPLF', 'SuplA', 'SupllierC', 'SuppB', 'SupplierA', 'SupplierB', 'SupplierC', 'SupplierD', 'SupplierE', 'SupplierF', 'SupplierG', 'SupplierH', 'supplierA', 'supplierh']
Total unique non-null values: 14
Unique values in: GarmentType
['Blouse', 'Coat', 'Dress', 'Hoodie', 'Jacket', 'Pants', 'Shirt', 'Shorts', 'Skirt', 'Suit', 'Sweater', 'T-Shirt']
Total unique non-null values: 12
Unique values in: Material
['Cotton', 'Denim', 'Linen', 'Polyester', 'Silk', 'Wool']
Total unique non-null values: 6
Unique values in: ProposedFoldingMethod
['FoldX', 'Methd1', 'Method1', 'Method2', 'Method3', 'Method_2', 'None']
Total unique non-null values: 7
Unique values in: ProposedLayout
['Box9', 'LayC', 'LayoutA', 'LayoutB', 'LayoutC', 'LayoutD', 'LayoutE', 'LayoutX', 'layouta']
Total unique non-null values: 9
Unique values in: PackagingQuality
['Bad', 'GOOD', 'Good', 'Uncertain', 'bad']
Total unique non-null values: 5


As we can see, we do have some errors in naming in columns `SupplierName`, `ProposedFoldingMethod`, `ProposedLayout` and `PackagingQuality`. So we need to fix these inconsistencies

We need to ensure that the `SupplierName` column is consistent across all relevant datasets (`df_density_report`, `df_historical_incidents`, `df_supplier_scorecard`). This involves correcting typos, standardizing capitalization and removing extra whitespace found during initial checks.

Instead of repeating the cleaning code for each dataframe, we define the cleaning steps once. We then loop through the affected dataframes and apply this standardized cleaning logic to each one that contains the `SupplierName` column.


In [10]:
# 1. Map every known misspelling / casing variant to its canonical supplier name
supplier_mapping = {
    'SuplA': 'SupplierA',
    'supplierA': 'SupplierA',
    'SuppB': 'SupplierB',
    'SupllierC': 'SupplierC',
    'SPLF': 'SupplierF',
    'supplierh': 'SupplierH',
}


# 2. Define the cleaning expression
def clean_supplier_name_expr():
    """Return a Polars expression that standardizes the SupplierName column.

    Whitespace is removed *before* the typo mapping is applied, so a variant
    such as 'Supl A' is first collapsed to 'SuplA' and then mapped to
    'SupplierA'. Values that are not keys of ``supplier_mapping`` pass through
    unchanged (apart from the whitespace removal).
    """
    return (
        pl.col('SupplierName')
          .str.strip_chars()           # leading/trailing whitespace
          .str.replace_all(" ", "")    # internal spaces
          .replace(supplier_mapping)   # known typos -> canonical name
          .alias('SupplierName')
    )


# 3. Apply the cleaning expression to every DataFrame that has the column
dataframe_dict = {
    "df_density_report": df_density_report,
    "df_historical_incidents": df_historical_incidents,
    "df_supplier_scorecard": df_supplier_scorecard,
}

cleaned_dataframes = {}
for df_name, df in dataframe_dict.items():
    if 'SupplierName' in df.columns:
        cleaned_dataframes[df_name] = df.with_columns(clean_supplier_name_expr())
    else:
        # Keep the frame untouched so the re-assignment below still works.
        print(f" -> Skipping {df_name} (no 'SupplierName' column).")
        cleaned_dataframes[df_name] = df

df_density_report = cleaned_dataframes["df_density_report"]
df_historical_incidents = cleaned_dataframes["df_historical_incidents"]
df_supplier_scorecard = cleaned_dataframes["df_supplier_scorecard"]

# Check the unique values again
unique_values = (
    df_density_report['SupplierName']
    .unique()
    .sort()
)
print(f" Unique values in: SupplierName")
print(unique_values.to_list())
print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

 Unique values in: SupplierName
['SupplierA', 'SupplierB', 'SupplierC', 'SupplierD', 'SupplierE', 'SupplierF', 'SupplierG', 'SupplierH']
Total unique non-null values: 8


In [11]:
# Normalize ProposedFoldingMethod: known misspellings -> canonical method name
method_mapping = {'Methd1': 'Method1', 'Method_2': 'Method2'}

# Trim, map the known variants, then drop any remaining spaces.
folding_method_clean = (
    pl.col('ProposedFoldingMethod')
    .str.strip_chars()
    .replace(method_mapping)
    .str.replace_all(" ", "")
    .alias('ProposedFoldingMethod')
)
df_density_report = df_density_report.with_columns(folding_method_clean)

# Re-inspect the distinct values to confirm the fix
unique_values = df_density_report['ProposedFoldingMethod'].unique().sort()
print(" Unique values in: ProposedFoldingMethod")
print(unique_values.to_list())
print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

 Unique values in: ProposedFoldingMethod
['FoldX', 'Method1', 'Method2', 'Method3', 'None']
Total unique non-null values: 5


In [12]:
# Normalize ProposedLayout: casing variant and abbreviation -> canonical layout name
layout_mapping = dict(layouta='LayoutA', LayC='LayoutC')

# Trim, map the known variants, then drop any remaining spaces.
df_density_report = df_density_report.with_columns(
    ProposedLayout=(
        pl.col('ProposedLayout')
        .str.strip_chars()
        .replace(layout_mapping)
        .str.replace_all(" ", "")
    )
)

# Re-inspect the distinct values to confirm the fix
unique_values = df_density_report['ProposedLayout'].unique().sort()
print("Unique values in: ProposedLayout")
print(unique_values.to_list())
print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

Unique values in: ProposedLayout
['Box9', 'LayoutA', 'LayoutB', 'LayoutC', 'LayoutD', 'LayoutE', 'LayoutX']
Total unique non-null values: 7


In [13]:
# Normalize PackagingQuality: casing variants -> canonical label
quality_mapping = {
    'GOOD': 'Good',
    'bad': 'Bad',
}

# Trim, map the known variants, then drop any remaining spaces.
quality_trimmed = pl.col('PackagingQuality').str.strip_chars()
quality_mapped = quality_trimmed.replace(quality_mapping)
quality_clean = quality_mapped.str.replace_all(" ", "")
df_density_report = df_density_report.with_columns(
    quality_clean.alias('PackagingQuality')
)

# Re-inspect the distinct values to confirm the fix
unique_values = df_density_report['PackagingQuality'].unique().sort()
print(" Unique values in: PackagingQuality")
print(unique_values.to_list())
print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

 Unique values in: PackagingQuality
['Bad', 'Good', 'Uncertain']
Total unique non-null values: 3


### `Folding Method` nulls

In [14]:
# Frequency of each folding method, most common first
folding_method_series = df_density_report['ProposedFoldingMethod']
folding_method_counts = folding_method_series.value_counts().sort('count', descending=True)

print("Folding Method Counts:")
print(folding_method_counts)

Folding Method Counts:
shape: (5, 2)
┌───────────────────────┬────────┐
│ ProposedFoldingMethod ┆ count  │
│ ---                   ┆ ---    │
│ str                   ┆ u32    │
╞═══════════════════════╪════════╡
│ Method2               ┆ 220595 │
│ Method1               ┆ 160102 │
│ Method3               ┆ 114363 │
│ None                  ┆ 2514   │
│ FoldX                 ┆ 2426   │
└───────────────────────┴────────┘


In [15]:
# Turn the literal string 'None' into a real null so it is treated as missing
folding_method = pl.col('ProposedFoldingMethod')
df_density_report = df_density_report.with_columns(
    pl.when(folding_method == 'None')
    .then(None)
    .otherwise(folding_method)
    .alias('ProposedFoldingMethod')
)

`None` is counted as a string, so we need to change it to `null`

In [16]:
# The previous cell already replaced the literal string 'None' with real nulls;
# repeating the same replacement here would be a no-op. Verify the result
# instead, so the imputation below never treats 'None' as a valid method.
remaining_none_strings = df_density_report.filter(
    pl.col('ProposedFoldingMethod') == 'None'
).height
assert remaining_none_strings == 0, (
    f"{remaining_none_strings} rows still contain the string 'None' "
    "in ProposedFoldingMethod"
)

Instead of just deleting these rows (and losing data) or picking a random method, we'll make an educated guess based on patterns in the existing data. The most logical assumption is that garments of a similar type, especially when intended for a specific packaging layout are likely folded using the same standard method.

In [17]:
# --- Step 1: Most frequent folding method at each level of specificity ---
# `.mode()` may return several values on a tie, in no guaranteed order;
# sorting before `.first()` makes the tie-break deterministic across runs.
most_common_method = pl.col('ProposedFoldingMethod').mode().sort().first()
has_method = pl.col('ProposedFoldingMethod').is_not_null()

# 1a. Enhanced Mode (GarmentType, Layout, Material)
mode_map_enhanced = (
    df_density_report
    .filter(
        has_method &
        pl.col('GarmentType').is_not_null() &
        pl.col('ProposedLayout').is_not_null() &
        pl.col('Material').is_not_null()
    )
    .group_by(['GarmentType', 'ProposedLayout', 'Material'])
    .agg(most_common_method.alias('Mode_Enhanced'))
)
print("- Enhanced mode calculated.")

# 1b. Combined Mode (GarmentType, Layout)
mode_map_combined = (
    df_density_report
    .filter(
        has_method &
        pl.col('GarmentType').is_not_null() &
        pl.col('ProposedLayout').is_not_null()
    )
    .group_by(['GarmentType', 'ProposedLayout'])
    .agg(most_common_method.alias('Mode_Combined'))
)
print("- Combined mode calculated.")

# 1c. Garment Mode (GarmentType only)
mode_map_garment = (
    df_density_report
    .filter(has_method & pl.col('GarmentType').is_not_null())
    .group_by('GarmentType')
    .agg(most_common_method.alias('Mode_Garment'))
)
print("- Garment mode calculated.")

# 1d. Global Mode Fallback
global_mode_folding = df_density_report.filter(
    has_method
)['ProposedFoldingMethod'].mode().sort().first()
print(f"- Global mode calculated: {global_mode_folding}")

if global_mode_folding is None:
    print("Warning: Global folding mode is None. Setting a default.")
    # Set a reasonable default if necessary (e.g., overall most frequent)
    global_mode_folding = 'Method2' # Example default, adjust if needed

# --- Step 2: Join each mode map to the main DataFrame on ITS OWN keys ---
# Each map must be joined independently. Chaining the fallbacks onto the
# enhanced map first would make Mode_Combined / Mode_Garment available only
# for rows where Mode_Enhanced already exists, so the intermediate fallbacks
# would never be used and such rows would jump straight to the global mode.
print("\nJoining mode lookup tables to main DataFrame...")
df_with_modes = (
    df_density_report
    .join(mode_map_enhanced, on=['GarmentType', 'ProposedLayout', 'Material'], how='left')
    .join(mode_map_combined, on=['GarmentType', 'ProposedLayout'], how='left')
    .join(mode_map_garment, on='GarmentType', how='left')
)
# Keys are unique in every map (they come from group_by), so no row may be duplicated.
assert df_with_modes.height == df_density_report.height, "Join changed the row count"
print("- Join complete.")

# --- Step 3: Apply Coalesce using Joined Columns ---
print("\nApplying coalesce for imputation...")
df_final = df_with_modes.with_columns(
    pl.coalesce(
        pl.col('ProposedFoldingMethod'),   # 1. Original value (keeps non-nulls)
        pl.col('Mode_Enhanced'),           # 2. Mode from (Type, Layout, Material)
        pl.col('Mode_Combined'),           # 3. Mode from (Type, Layout)
        pl.col('Mode_Garment'),            # 4. Mode from (Type)
        pl.lit(global_mode_folding)        # 5. Global Mode
    ).alias('ProposedFoldingMethod') # Overwrite the original column directly
)

# --- Step 4: Drop temporary mode columns ---
# Drop the columns brought in by the joins
df_final = df_final.drop(['Mode_Enhanced', 'Mode_Combined', 'Mode_Garment'])
print("- Imputation complete, temporary columns dropped.")


# --- Verification ---
final_null_count_folding = df_final['ProposedFoldingMethod'].is_null().sum()
print(f"\nNull count in ProposedFoldingMethod AFTER imputation: {final_null_count_folding}")

if final_null_count_folding == 0:
    print("Successfully imputed all original null values in ProposedFoldingMethod.")
    print("Final value counts:")
    print(df_final['ProposedFoldingMethod'].value_counts().sort("count", descending=True))
else:
    print(f"Warning: {final_null_count_folding} null values remain. Review mode calculations/fallbacks.")

# --- Assign back ---
df_density_report = df_final

- Enhanced mode calculated.
- Combined mode calculated.
- Garment mode calculated.
- Global mode calculated: Method2

Combining mode lookup tables...
- Mode lookup tables combined.

Joining combined lookup table to main DataFrame...
- Join complete.

Applying coalesce for imputation...
- Imputation complete, temporary columns dropped.

Null count in ProposedFoldingMethod AFTER imputation: 0
Successfully imputed all original null values in ProposedFoldingMethod.
Final value counts:
shape: (4, 2)
┌───────────────────────┬────────┐
│ ProposedFoldingMethod ┆ count  │
│ ---                   ┆ ---    │
│ str                   ┆ u32    │
╞═══════════════════════╪════════╡
│ Method2               ┆ 222021 │
│ Method1               ┆ 160668 │
│ Method3               ┆ 114885 │
│ FoldX                 ┆ 2426   │
└───────────────────────┴────────┘


### Outliers for `ProposedUnitsPerCarton`

We will check some strange values we found in `ProposedUnitsPerCarton`.

In [18]:
# Summary statistics for ProposedUnitsPerCarton, transposed for readability
units_per_carton_stats = df_density_report.select(
    pl.col('ProposedUnitsPerCarton')
).describe()
units_per_carton_stats.to_pandas().T

Unnamed: 0,0,1,2,3,4,5,6,7,8
statistic,count,null_count,mean,std,min,25%,50%,75%,max
ProposedUnitsPerCarton,500000.0,0.0,99.981055,864.741016,-3.0,16.0,25.0,32.0,9999.0


We can see three very noticeable things in `ProposedUnitsPerCarton`. First, we have negative values, which should be impossible. Second, we have some non-integer values, which should also be impossible for a unit count. Finally, there is a maximum value of 9999. We need to clean this up. These are probably sentinel values, i.e. placeholders used to fill the field when the real value is unknown. We will replace those occurrences with **null**.

In [19]:
# Get original stats BEFORE any code cleaning
original_null_count = df_density_report['ProposedUnitsPerCarton'].is_null().sum()
original_min = df_density_report['ProposedUnitsPerCarton'].min()
original_max = df_density_report['ProposedUnitsPerCarton'].max()
print(f"- Null count before cleaning codes: {original_null_count}")
print(f"- Min value before cleaning codes: {original_min}")
print(f"- Max value before cleaning codes: {original_max}")

# Define the list of ALL known invalid code values to replace
invalid_codes = [-3.0, 9999.0, 12.5]

# Count each code from the data itself (instead of hard-coding numbers taken
# from an earlier run) so the verification below stays valid when the input
# file changes.
code_counts = {
    code: df_density_report.filter(pl.col('ProposedUnitsPerCarton') == code).height
    for code in invalid_codes
}
count_neg_3 = code_counts[-3.0]
count_9999 = code_counts[9999.0]
count_12_5 = code_counts[12.5]
total_expected_increase = sum(code_counts.values())

print(f"- Codes to be replaced with null: {invalid_codes}")
print(f"- Occurrences per code: {code_counts}")


# Replace all specific invalid codes
df_density_report = df_density_report.with_columns(
    pl.when(pl.col('ProposedUnitsPerCarton').is_in(invalid_codes))
    .then(None)
    .otherwise(pl.col('ProposedUnitsPerCarton'))
    .alias('ProposedUnitsPerCarton')
)

# 1. Check the new null count
null_count_after = df_density_report['ProposedUnitsPerCarton'].is_null().sum()
print("\nVerifying replacement:")
print(f"- Null count in ProposedUnitsPerCarton AFTER cleaning codes: {null_count_after}")
if null_count_after == original_null_count + total_expected_increase:
    print(f"  >> Confirmation: Null count increased by {total_expected_increase} as expected.")
else:
    print(f"  >> Warning: Null count change ({null_count_after - original_null_count}) doesn't match expected ({total_expected_increase}). Please review.")

# 2. Check the minimum value (should no longer be negative)
min_after_cleaning = df_density_report['ProposedUnitsPerCarton'].min()
print(f"- Minimum value in ProposedUnitsPerCarton after cleaning: {min_after_cleaning}")

# 3. Check the maximum value (should no longer be 9999.0)
max_after_cleaning = df_density_report['ProposedUnitsPerCarton'].max()
print(f"- Maximum value in ProposedUnitsPerCarton after cleaning: {max_after_cleaning}")

# 4. Check if the specific codes remain (should be none)
remaining_codes = df_density_report.filter(pl.col('ProposedUnitsPerCarton').is_in(invalid_codes)).height
print(f"- Remaining count of codes {invalid_codes}: {remaining_codes}")
if remaining_codes == 0:
     print("  >> Confirmation: Invalid codes successfully removed.")
else:
     print("  >> Warning: Some invalid code values still detected.")

# 5. Check if any other decimals remain (should now be 0)
remaining_decimals = df_density_report.filter(
    (pl.col('ProposedUnitsPerCarton').is_not_null()) &
    (pl.col('ProposedUnitsPerCarton') % 1 != 0)
).height
print(f"- Remaining count of non-integer values: {remaining_decimals}")
if remaining_decimals == 0:
    print("  >> Confirmation: No non-integer values remain.")
else:
    print("  >> Warning: Non-integer values still detected (other than the codes).")

- Null count before cleaning codes: 0
- Min value before cleaning codes: -3.0
- Max value before cleaning codes: 9999.0
- Codes to be replaced with null: [-3.0, 9999.0, 12.5]

Verifying replacement:
- Null count in ProposedUnitsPerCarton AFTER cleaning codes: 11297
  >> Confirmation: Null count increased by 11297 as expected.
- Minimum value in ProposedUnitsPerCarton after cleaning: 0.0
- Maximum value in ProposedUnitsPerCarton after cleaning: 49.0
- Remaining count of codes [-3.0, 9999.0, 12.5]: 0
  >> Confirmation: Invalid codes successfully removed.
- Remaining count of non-integer values: 0
  >> Confirmation: No non-integer values remain.


Now we have **11,297** null values. We will follow a similar approach to the one used before to impute the nulls: each one is filled with the median for its garment type.

In [20]:
# 1. Calculate median per GarmentType (filter out nulls first)
# NOTE(review): a median over an even number of rows can be fractional
# (e.g. 24.5), which would reintroduce non-integer unit counts — confirm
# whether the imputed value should be rounded.
median_map_garment = (
    df_density_report.filter(pl.col('ProposedUnitsPerCarton').is_not_null()) # Use only non-null values for calculation
    .group_by('GarmentType')
    .agg(pl.median('ProposedUnitsPerCarton').alias('MedianUnits_Garment'))
)


# 2. Calculate global median from valid data
global_valid_median = df_density_report.filter(
    pl.col('ProposedUnitsPerCarton').is_not_null() # Use only non-null values
)['ProposedUnitsPerCarton'].median()


# Check if global median calculation was successful
if global_valid_median is None:
    print("Warning: Global median calculation returned None. Please check the data.")


In [21]:
# 3. Attach each row's garment-level median
df_impute_step = df_density_report.join(median_map_garment, how='left', on='GarmentType')

# 4. Fill order: original value -> garment median -> global median
imputed_units = pl.coalesce(
    pl.col('ProposedUnitsPerCarton'),
    pl.col('MedianUnits_Garment'),
    pl.lit(global_valid_median),
)
df_imputed = df_impute_step.with_columns(
    imputed_units.alias('ProposedUnitsPerCarton_Imputed')
)

# 5. Swap the imputed values into the original column and drop the helpers
df_final = (
    df_imputed
    .drop(['MedianUnits_Garment'])
    .with_columns(
        pl.col('ProposedUnitsPerCarton_Imputed').alias('ProposedUnitsPerCarton')
    )
    .drop('ProposedUnitsPerCarton_Imputed')
)

# Verification
final_null_count = df_final['ProposedUnitsPerCarton'].is_null().sum()
print(f"\nNull count in ProposedUnitsPerCarton AFTER imputation: {final_null_count}")

if final_null_count != 0:
    print(f"Warning: {final_null_count} null values remain. Review median calculations and fallback.")
else:
    print("Successfully imputed all null values.")
    print("Describe ProposedUnitsPerCarton after imputation")
    print(df_final.select(pl.col('ProposedUnitsPerCarton')).describe())

df_density_report = df_final


Null count in ProposedUnitsPerCarton AFTER imputation: 0
Successfully imputed all null values.
Describe ProposedUnitsPerCarton after imputation
shape: (9, 2)
┌────────────┬────────────────────────┐
│ statistic  ┆ ProposedUnitsPerCarton │
│ ---        ┆ ---                    │
│ str        ┆ f64                    │
╞════════════╪════════════════════════╡
│ count      ┆ 500000.0               │
│ null_count ┆ 0.0                    │
│ mean       ┆ 24.75393               │
│ std        ┆ 11.169403              │
│ min        ┆ 0.0                    │
│ 25%        ┆ 16.0                   │
│ 50%        ┆ 25.0                   │
│ 75%        ┆ 32.0                   │
│ max        ┆ 49.0                   │
└────────────┴────────────────────────┘


## 2. Historical Incidents

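No dataset-specific cleaning is needed here: the `ProductReference` and `SupplierName` standardization applied in the Density Report section above already covers this dataset.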

## 3. Product Attributes

There is no need to clean this dataset

## 4. Supplier Scorecard

### Data Duplicates

In [22]:
# A scorecard row should be unique per (SupplierName, Month)
scorecard_key = ['SupplierName', 'Month']

# Keys that occur more than once
key_counts = df_supplier_scorecard.group_by(scorecard_key).agg(pl.len().alias('count'))
duplicated_keys = key_counts.filter(pl.col('count') > 1).select(scorecard_key)

# Inner-join back to recover every column of the rows sharing a duplicated key,
# sorted so that duplicates appear next to each other.
view_duplicates = (
    duplicated_keys
    .join(df_supplier_scorecard, on=scorecard_key, how='inner')
    .sort(scorecard_key)
)

print("Viewing rows where (SupplierName, Month) combinations are duplicated (showing first ~10 duplicates):")

# Show enough rows to see a few examples of duplicates
print(view_duplicates.head(10))


Viewing rows where (SupplierName, Month) combinations are duplicated (showing first ~10 duplicates):
shape: (10, 8)
┌────────────┬─────────┬────────────┬────────────┬────────────┬────────────┬───────────┬───────────┐
│ SupplierNa ┆ Month   ┆ PackagesHa ┆ BadPackagi ┆ TotalIncid ┆ AverageCos ┆ OnTimeDel ┆ Anomalies │
│ me         ┆ ---     ┆ ndled      ┆ ngRate (%) ┆ ents       ┆ tPerIncide ┆ iveryRate ┆ Detected  │
│ ---        ┆ str     ┆ ---        ┆ ---        ┆ ---        ┆ nt (€)     ┆ (%)       ┆ ---       │
│ str        ┆         ┆ i64        ┆ f64        ┆ i64        ┆ ---        ┆ ---       ┆ i64       │
│            ┆         ┆            ┆            ┆            ┆ f64        ┆ f64       ┆           │
╞════════════╪═════════╪════════════╪════════════╪════════════╪════════════╪═══════════╪═══════════╡
│ SupplierA  ┆ 2023-01 ┆ 102        ┆ 19.61      ┆ 1          ┆ 261.0      ┆ 74.56     ┆ 0         │
│ SupplierA  ┆ 2023-01 ┆ 76         ┆ 14.47      ┆ 3          ┆ 591.0      ┆

In [23]:
# Step 1: Aggregate duplicated (SupplierName, Month) rows into one row each.
# Counts are summed; rates and averages are first turned back into absolute
# totals so they can be re-derived as correctly weighted values in Step 2.
print(f"Shape before aggregation: {df_supplier_scorecard.shape}")
df_scorecard_agg = (
    df_supplier_scorecard
    # maintain_order=True keeps groups in order of first appearance; without it
    # the row order (and therefore the exported CSV) can differ between runs.
    .group_by(['SupplierName', 'Month'], maintain_order=True)
    .agg([
        pl.sum('PackagesHandled').alias('PackagesHandled'),
        pl.sum('TotalIncidents').alias('TotalIncidents'),
        pl.sum('AnomaliesDetected').alias('AnomaliesDetected'),

        (pl.col('PackagesHandled') * pl.col('BadPackagingRate (%)') / 100.0).sum().alias('TotalBadPackages'),
        (pl.col('PackagesHandled') * pl.col('OnTimeDeliveryRate (%)') / 100.0).sum().alias('TotalOnTimePackages'),
        (pl.col('AverageCostPerIncident (€)') * pl.col('TotalIncidents')).sum().alias('TotalIncidentCost'),
    ])
)

# Step 2: Calculate Final Metrics (weighted rates / averages)
# NOTE(review): the zero-denominator fallbacks are inconsistent — 0.0 for
# BadPackagingRate and AverageCostPerIncident, null for OnTimeDeliveryRate.
# Left unchanged; confirm which convention downstream consumers expect.
df_scorecard_calculated = df_scorecard_agg.with_columns([
    (pl.when(pl.col('PackagesHandled') > 0)
     .then((pl.col('TotalBadPackages') * 100.0) / pl.col('PackagesHandled'))
     .otherwise(0.0)
    ).alias('BadPackagingRate (%)'),

    (pl.when(pl.col('PackagesHandled') > 0)
     .then((pl.col('TotalOnTimePackages') * 100.0) / pl.col('PackagesHandled'))
     .otherwise(None)
    ).alias('OnTimeDeliveryRate (%)'),

    (pl.when(pl.col('TotalIncidents') > 0)
     .then(pl.col('TotalIncidentCost') / pl.col('TotalIncidents'))
     .otherwise(0.0)
    ).alias('AverageCostPerIncident (€)')
])

# Step 3: Optional Rounding
df_scorecard_rounded = df_scorecard_calculated.with_columns([
    pl.col('BadPackagingRate (%)').round(2),
    pl.col('OnTimeDeliveryRate (%)').round(2),
    pl.col('AverageCostPerIncident (€)').round(2)
])

# Step 4: Drop Intermediate Columns
df_scorecard_final = df_scorecard_rounded.drop([
    'TotalBadPackages',
    'TotalOnTimePackages',
    'TotalIncidentCost'
])

# Shape after final aggregation
print(f"Shape after final aggregation: {df_scorecard_final.shape}")

# Step 5: Assign back
df_supplier_scorecard = df_scorecard_final


Shape before aggregation: (252, 8)
Shape after final aggregation: (144, 8)


## Export

In [24]:
# Write every cleaned dataset to its semicolon-separated CSV target
export_targets = (
    (df_density_report, EXPORT_DENSITY_REPORT),
    (df_historical_incidents, EXPORT_HISTORICAL_INCIDENTS),
    (df_product_attributes, EXPORT_PRODUCT_ATTRIBUTES),
    (df_supplier_scorecard, EXPORT_SUPPLIER_SCORECARD),
)
for cleaned_frame, csv_path in export_targets:
    cleaned_frame.write_csv(csv_path, separator=";")