# Data cleaning

## Libraries

In [1]:
pip install fastexcel

Note: you may need to restart the kernel to use updated packages.


Side note: all required libraries (ideally with pinned versions) should be listed in a `requirements.txt` file so the environment can be reproduced.

In [2]:
import pandas as pd
import polars as pl
import re

  from pandas.core import (


## Global variables

In [3]:
# Input Excel workbooks, one per source dataset (paths relative to the notebook).
PATH_DENSITY_REPORT = 'DensityReports.xlsx'
PATH_HISTORICAL_INCIDENTS = 'HistoricalIncidents.xlsx'
PATH_PRODUCT_ATTRIBUTES = 'ProductAttributes.xlsx'
PATH_SUPPLIER_SCORECARD = 'SupplierScorecard.xlsx'

# Output CSV files written by the export cell at the end of the notebook.
EXPORT_DENSITY_REPORT = 'density_report.csv'
EXPORT_HISTORICAL_INCIDENTS = 'historical_incidents.csv'
EXPORT_PRODUCT_ATTRIBUTES = 'product_attributes.csv'
EXPORT_SUPPLIER_SCORECARD = 'supplier_scorecard.csv'

## Global functions

### 1. Read excel files with Polars

In [5]:
def polars_read_excel(file_name, sheet_name='Sheet1'):
  """Load one worksheet of an Excel workbook with Polars.

  Args:
    file_name: Path of the workbook; forwarded to `pl.read_excel` as `source`.
    sheet_name: Name of the worksheet to read. Defaults to 'Sheet1'.

  Returns:
    The result of `pl.read_excel` for the requested sheet.
  """
  worksheet = pl.read_excel(source=file_name, sheet_name=sheet_name)
  return worksheet

### 2. Removing invalid strings in Product Reference

In [6]:
def clean_product_reference(series, valid_pattern = r"^PRD\d{5}$"):
  """Normalise product references and blank out the ones that are malformed.

  Args:
    series: Polars expression (or series) holding the raw references.
    valid_pattern: Regular expression a cleaned reference must match.
      The default accepts 'PRD' followed by exactly five digits.

  Returns:
    An expression yielding the cleaned reference where it matches
    `valid_pattern` and null everywhere else.
  """
  # Work on a text view of the values (non-strict cast, so it does not raise)
  as_text = series.cast(pl.Utf8, strict=False)

  # Drop every trailing "X" character before validating
  without_trailing_x = as_text.str.strip_chars_end('X')

  # Flag the values whose structure matches the expected pattern
  matches_pattern = without_trailing_x.str.contains(valid_pattern)

  # Keep valid references, replace everything else with null
  return (
      pl.when(matches_pattern)
        .then(without_trailing_x)
        .otherwise(None)
  )

### 3. Modify column based on a dictionary

In [7]:
def clean_column_with_mapping(col_name, mapping_dict):
  """Build an expression that normalises a text column and applies a lookup table.

  The column is cast to text, stripped of leading/trailing whitespace, has
  every space character removed, and is then passed through `mapping_dict`:
  values that are keys of the mapping are replaced by the mapped value, all
  other values (including nulls) are left as they are.

  Args:
    col_name: Name of the column to clean; the result keeps the same name.
    mapping_dict: Exact-match replacements, e.g. {'SuplA': 'SupplierA'}.
      Keys are compared against the value *after* spaces were removed.

  Returns:
    A Polars expression to be used inside `with_columns` / `select`.
  """
  normalized = (
      pl.col(col_name)
        .cast(pl.Utf8, strict=False)
        .str.strip_chars()
        # literal=True: a plain space, no need to go through the regex engine
        .str.replace_all(" ", "", literal=True)
  )

  # Native replacement instead of a per-row Python lambda (map_elements):
  # same result for a str -> str lookup, without calling back into Python
  # for every row. An empty mapping means there is nothing to replace.
  if mapping_dict:
    normalized = normalized.replace(mapping_dict)

  return normalized.alias(col_name)

### 4. Most frequent combination of columns

In [8]:
def most_common_combination(df, group_cols, additional_cols=None):
  """Find, for each group, the most frequent combination of extra columns.

  Args:
    df: Polars DataFrame to analyse.
    group_cols: Column name(s) that define a group.
    additional_cols: Column name(s) whose most frequent value combination
      is wanted per group. Defaults to no extra columns.

  Returns:
    A DataFrame with one row per distinct `group_cols` combination, holding
    the winning `additional_cols` values and their frequency in a "count"
    column. Rows are sorted by `group_cols`.
  """
  # Accept a single column name as well as any iterable of names, and avoid
  # a shared mutable default argument.
  group_cols = [group_cols] if isinstance(group_cols, str) else list(group_cols)
  if additional_cols is None:
    additional_cols = []
  elif isinstance(additional_cols, str):
    additional_cols = [additional_cols]
  else:
    additional_cols = list(additional_cols)

  # Highest count first inside each group. The additional columns are used as
  # a final sort key so that ties are always resolved the same way instead of
  # depending on the (unspecified) output order of group_by.
  sort_cols = group_cols + ["count"] + additional_cols
  sort_descending = (
      [False] * len(group_cols) + [True] + [False] * len(additional_cols)
  )

  return (
      df
      .group_by(group_cols + additional_cols)
      .agg(pl.len().alias("count"))
      .sort(sort_cols, descending=sort_descending)
      # maintain_order keeps the sorted order in the output
      .unique(subset=group_cols, keep='first', maintain_order=True)
  )

### 5. Replace invalid values

In [9]:
def replace_invalid_values(df: pl.DataFrame,
                           target_col: str,
                           invalid_values: list[str],
                           group_cols: list[str]):
    """Replace invalid labels in a column by the most common valid label of the same group.

    For every row whose `target_col` is one of `invalid_values`, the value is
    replaced by the valid value that occurs most often among rows sharing the
    same `group_cols`. If no valid row exists for that group (or the group key
    does not match in the left join), the replacement is null.

    Args:
        df: Input DataFrame.
        target_col: Column containing the labels to repair.
        invalid_values: Labels considered invalid.
        group_cols: Columns used to look up the most common valid label.

    Returns:
        A DataFrame with the same columns as `df`: first all rows that were
        not invalid (in their original order), then the repaired rows, one
        chunk per invalid value in the order given.
    """
    # Each invalid value must be processed once; a repeated entry would
    # otherwise append the same repaired rows several times.
    invalid_values = list(dict.fromkeys(invalid_values))
    if not invalid_values:
        return df

    # A null target is not one of the invalid labels. Without fill_null the
    # membership test can evaluate to null for such rows, and filter() drops
    # rows whose predicate is null, silently removing them from the result.
    is_invalid = pl.col(target_col).is_in(invalid_values).fill_null(False)

    # Compute most common valid values (nulls are not candidates)
    df_most_common = most_common_combination(
        df.filter(pl.col(target_col).is_not_null() & ~is_invalid),
        group_cols,
        additional_cols=[target_col]
    )
    replacements = df_most_common.select(group_cols + [target_col])

    # Keep every row that does not need fixing, null targets included
    df_kept = df.filter(~is_invalid)
    column_order = df_kept.columns

    # Replace each invalid value: drop the bad label and pull the group's
    # most common label back in through a left join
    fixed_parts = [
        df.filter(pl.col(target_col) == val)
        .drop(target_col)
        .join(replacements, on=group_cols, how="left")
        .select(column_order)
        for val in invalid_values
    ]

    return pl.concat([df_kept] + fixed_parts, how="vertical")

### 6. Median imputation for `ProposedUnitsPerCarton`

In [10]:
def impute_units_per_carton_median(df_input: pl.DataFrame) -> pl.DataFrame:
    """Fill missing `ProposedUnitsPerCarton` values with medians.

    A missing value is filled with the median of its `GarmentType`; when no
    such median is available, the median over all non-null values is used.
    If the column has no non-null value at all, 0 is used as last resort.

    Args:
        df_input: Frame with `ProposedUnitsPerCarton` and `GarmentType` columns.

    Returns:
        A new frame with the same columns and `ProposedUnitsPerCarton` imputed.
    """
    units = pl.col('ProposedUnitsPerCarton')
    garment = pl.col('GarmentType')

    # Fallback for rows whose garment type has no median of its own
    overall_median = df_input.filter(units.is_not_null())['ProposedUnitsPerCarton'].median()
    if overall_median is None:
        overall_median = 0

    # One median per garment type, computed from fully known rows only
    per_garment = (
        df_input
        .filter(units.is_not_null() & garment.is_not_null())
        .group_by('GarmentType')
        .agg(pl.median('ProposedUnitsPerCarton').alias('MedianUnits_Garment'))
    )

    # Priority: existing value, then garment median, then overall median
    filled_units = pl.coalesce(
        units,
        pl.col('MedianUnits_Garment'),
        pl.lit(overall_median)
    )

    return (
        df_input
        .join(per_garment, on='GarmentType', how='left')
        .with_columns(filled_units.alias('ProposedUnitsPerCarton'))
        .drop(['MedianUnits_Garment'])
    )

### 7. `Supplier Scorecard` aggregation function

In [11]:
def aggregate_supplier_scorecard(df_scorecard: pl.DataFrame) -> pl.DataFrame:
    """Collapse the scorecard to one row per (SupplierName, Month).

    Counts (`PackagesHandled`, `TotalIncidents`, `AnomaliesDetected`) are
    summed. The two rates are recomputed as averages weighted by
    `PackagesHandled`, and `AverageCostPerIncident (€)` as an average weighted
    by `TotalIncidents`. All three are rounded to 2 decimals.

    Rows whose rate/cost is null contribute neither to the numerator nor to
    the weight of that metric, so a missing value does not drag the average
    towards zero.

    Args:
        df_scorecard: Scorecard with the columns named above plus
            `BadPackagingRate (%)` and `OnTimeDeliveryRate (%)`.

    Returns:
        A frame with columns SupplierName, Month, PackagesHandled,
        TotalIncidents, AnomaliesDetected, BadPackagingRate (%),
        OnTimeDeliveryRate (%), AverageCostPerIncident (€). Groups appear in
        order of first occurrence in the input.
    """
    packages = pl.col('PackagesHandled')
    incidents = pl.col('TotalIncidents')
    bad_rate = pl.col('BadPackagingRate (%)')
    on_time_rate = pl.col('OnTimeDeliveryRate (%)')
    cost_per_incident = pl.col('AverageCostPerIncident (€)')

    # Step 1: Aggregate raw sums and the numerator/weight of each weighted average.
    # maintain_order=True makes the row order (and the exported file) reproducible.
    df_aggregated = df_scorecard.group_by(
        ['SupplierName', 'Month'], maintain_order=True
    ).agg(
        packages.sum().alias('PackagesHandled'),
        incidents.sum().alias('TotalIncidents'),
        pl.sum('AnomaliesDetected').alias('AnomaliesDetected'),
        (packages * bad_rate / 100.0).sum().alias('TotalBadPackages'),
        (packages * on_time_rate / 100.0).sum().alias('TotalOnTimePackages'),
        (cost_per_incident * incidents).sum().alias('TotalIncidentCost'),
        # Weights restricted to the rows that actually carry a value
        packages.filter(bad_rate.is_not_null()).sum().alias('_BadRateWeight'),
        packages.filter(on_time_rate.is_not_null()).sum().alias('_OnTimeRateWeight'),
        incidents.filter(cost_per_incident.is_not_null()).sum().alias('_IncidentCostWeight')
    )

    # Step 2: Calculate final rates and average costs, rounded to 2 decimals.
    # NOTE(review): with no usable weight the bad-packaging rate and the
    # average cost default to 0.0 while the on-time rate becomes null; this
    # asymmetry is kept from the original logic - confirm it is intended.
    df_calculated = df_aggregated.with_columns(
        pl.when(pl.col('_BadRateWeight') > 0)
        .then((pl.col('TotalBadPackages') * 100.0) / pl.col('_BadRateWeight'))
        .otherwise(0.0)
        .round(2)
        .alias('BadPackagingRate (%)'),

        pl.when(pl.col('_OnTimeRateWeight') > 0)
        .then((pl.col('TotalOnTimePackages') * 100.0) / pl.col('_OnTimeRateWeight'))
        .otherwise(None)
        .round(2)
        .alias('OnTimeDeliveryRate (%)'),

        pl.when(pl.col('_IncidentCostWeight') > 0)
        .then(pl.col('TotalIncidentCost') / pl.col('_IncidentCostWeight'))
        .otherwise(0.0)
        .round(2)
        .alias('AverageCostPerIncident (€)')
    )

    # Step 3: Drop intermediate columns used for calculation
    return df_calculated.drop([
        'TotalBadPackages',
        'TotalOnTimePackages',
        'TotalIncidentCost',
        '_BadRateWeight',
        '_OnTimeRateWeight',
        '_IncidentCostWeight'
    ])

## File reading

In [12]:
# Load the default sheet ('Sheet1') of each source workbook into its own frame.
df_density_report      = polars_read_excel(PATH_DENSITY_REPORT)
df_historical_incidents = polars_read_excel(PATH_HISTORICAL_INCIDENTS)
df_product_attributes   = polars_read_excel(PATH_PRODUCT_ATTRIBUTES)
df_supplier_scorecard   = polars_read_excel(PATH_SUPPLIER_SCORECARD)

## Execution

## 1. Density Report

### 1.1 Product Reference correction

In [13]:
# Overwrite ProductReference with its cleaned version: trailing "X" characters
# are removed and values not matching the default pattern become null.
df_density_report = df_density_report.with_columns(
    clean_product_reference(pl.col("ProductReference")).alias("ProductReference")
)

In [14]:
# Count rows per ProposedFoldingMethod label to spot misspelled or placeholder values.
df_density_report.group_by(pl.col('ProposedFoldingMethod')).len().sort('ProposedFoldingMethod')

ProposedFoldingMethod,len
str,u32
"""FoldX""",2426
"""Methd1""",2450
"""Method1""",157652
"""Method2""",218201
"""Method3""",114363
"""Method_2""",2394
"""None""",2514


### 1.2 Naming consistency

In [15]:
# Replacement tables: inconsistent spelling -> canonical label.
# Keys are matched after all spaces have been removed from the value.
supplier_mapping = {
    'SuplA': 'SupplierA',
    'supplierA': 'SupplierA',
    'SuppB': 'SupplierB',
    'SupllierC': 'SupplierC',
    'SPLF': 'SupplierF',
    'supplierh': 'SupplierH',
}
method_mapping = {'Methd1': 'Method1', 'Method_2': 'Method2'}
layout_mapping = {'layouta': 'LayoutA', 'LayC': 'LayoutC'}
quality_mapping = {'GOOD': 'Good', 'bad': 'Bad'}

# Density report: supplier, folding method, layout and quality labels are
# independent columns, so they are normalised in a single pass.
df_density_report = df_density_report.with_columns(
    clean_column_with_mapping('SupplierName', supplier_mapping),
    clean_column_with_mapping('ProposedFoldingMethod', method_mapping),
    clean_column_with_mapping('ProposedLayout', layout_mapping),
    clean_column_with_mapping('PackagingQuality', quality_mapping),
)

# The other datasets only share the supplier name with the density report.
df_historical_incidents = df_historical_incidents.with_columns(
    clean_column_with_mapping('SupplierName', supplier_mapping)
)
df_supplier_scorecard = df_supplier_scorecard.with_columns(
    clean_column_with_mapping('SupplierName', supplier_mapping)
)

### 1.3 Correcting invalid input labels

In [16]:
# Folding method: placeholder labels are replaced by the most common valid
# method among rows sharing the same supplier/garment/packaging attributes.
df_density_report = replace_invalid_values(
    df_density_report,
    target_col="ProposedFoldingMethod",
    invalid_values=["FoldX", "None"],
    group_cols=[
        "SupplierName", "GarmentType", "Material", "Weight",
        "ProposedUnitsPerCarton", "ProposedLayout", "PackagingQuality",
    ],
)

# Layout: same approach, now grouping on the (already repaired) folding method.
df_density_report = replace_invalid_values(
    df_density_report,
    target_col="ProposedLayout",
    invalid_values=["Box9", "LayoutX"],
    group_cols=[
        "SupplierName", "GarmentType", "Material", "Weight",
        "ProposedUnitsPerCarton", "ProposedFoldingMethod", "PackagingQuality",
    ],
)

### 1.4 Outliers in `ProposedUnitsPerCarton`

In [17]:
# Sentinel codes found in ProposedUnitsPerCarton that do not represent a real quantity
invalid_codes_units = [-3.0, 9999.0, 12.5] 

# Step 1: Turn every sentinel code into a null so it can be imputed below
df_density_report = df_density_report.with_columns(
    pl.when(pl.col('ProposedUnitsPerCarton').is_in(invalid_codes_units))
    .then(None) 
    .otherwise(pl.col('ProposedUnitsPerCarton'))
    .alias('ProposedUnitsPerCarton')
)

print(f"Null count in ProposedUnitsPerCarton after sentinel removal: {df_density_report['ProposedUnitsPerCarton'].is_null().sum()}")

# Step 2: Impute all nulls in ProposedUnitsPerCarton (garment-type median, then overall median)
df_density_report = impute_units_per_carton_median(df_density_report.clone()) 

# Sanity check: no nulls should remain, and the summary shows the resulting value range
print(f"Null count in ProposedUnitsPerCarton after imputation: {df_density_report['ProposedUnitsPerCarton'].is_null().sum()}")
print(df_density_report.select('ProposedUnitsPerCarton').describe())


Null count in ProposedUnitsPerCarton after sentinel removal: 11297
Null count in ProposedUnitsPerCarton after imputation: 0
shape: (9, 2)
┌────────────┬────────────────────────┐
│ statistic  ┆ ProposedUnitsPerCarton │
│ ---        ┆ ---                    │
│ str        ┆ f64                    │
╞════════════╪════════════════════════╡
│ count      ┆ 500000.0               │
│ null_count ┆ 0.0                    │
│ mean       ┆ 24.75393               │
│ std        ┆ 11.169403              │
│ min        ┆ 0.0                    │
│ 25%        ┆ 16.0                   │
│ 50%        ┆ 25.0                   │
│ 75%        ┆ 32.0                   │
│ max        ┆ 49.0                   │
└────────────┴────────────────────────┘


## 2. Historical Incidents

Supplier names were already normalized in section 1.2; no further cleaning of this dataset is necessary.

## 3. Product Attributes

No further cleaning of this dataset is necessary.

## 4. Supplier Scorecard

### Data Duplicates

In [18]:
# Merge duplicated (SupplierName, Month) rows into one row each: counts are summed
# and the rate/cost metrics are recomputed as weighted averages.
df_supplier_scorecard = aggregate_supplier_scorecard(df_supplier_scorecard.clone()) 

## Export

In [19]:
# Write every dataset as a semicolon-separated CSV file.
# df_product_attributes is exported as loaded: no cell shown above modifies it.
df_density_report.write_csv(EXPORT_DENSITY_REPORT, separator=";")
df_historical_incidents.write_csv(EXPORT_HISTORICAL_INCIDENTS, separator=";")
df_product_attributes.write_csv(EXPORT_PRODUCT_ATTRIBUTES, separator=";")
df_supplier_scorecard.write_csv(EXPORT_SUPPLIER_SCORECARD, separator=";")