# Data cleaning

## Necessary libraries

In [1]:
pip install fastexcel

Collecting fastexcel
  Downloading fastexcel-0.14.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Downloading fastexcel-0.14.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastexcel
Successfully installed fastexcel-0.14.0


## Libraries import

In [2]:
import pandas as pd
import polars as pl

## Global variables

In [3]:
PATH_DENSITY_REPORT       = 'DensityReports.xlsx'
PATH_HISTORICAL_INCIDENTS = 'HistoricalIncidents.xlsx'
PATH_PRODUCT_ATTRIBUTES   = 'ProductAttributes.xlsx'
PATH_SUPPLIER_SCORECARD   = 'SupplierScorecard.xlsx'

EXPORT_DENSITY_REPORT       = 'density_report.csv'
EXPORT_HISTORICAL_INCIDENTS = 'historical_incidents.csv'
EXPORT_PRODUCT_ATTRIBUTES   = 'product_attributes.csv'
EXPORT_SUPPLIER_SCORECARD   = 'supplier_scorecard.csv'

## Global functions

### 1. Read excel file with Polars

In [4]:
def polars_read_excel(file_name, sheet_name='Sheet1'):
  return pl.read_excel(source=file_name, sheet_name=sheet_name)

## Execution

## 1. Density Report

In [6]:
# Read density report file with Polars
df_density_report = polars_read_excel(PATH_DENSITY_REPORT)

# we will check the first rows of the file
df_density_report.head()

ReportID,ProductReference,DateOfReport,SupplierName,GarmentType,Material,Weight,ProposedUnitsPerCarton,ProposedFoldingMethod,ProposedLayout,PackagingQuality
str,str,date,str,str,str,f64,f64,str,str,str
"""RPT0000001""","""PRD07271""",2024-03-04,"""SupplierA""","""Pants""","""Polyester""",0.35,29.0,"""Method2""","""LayoutC""","""Good"""
"""RPT0000002""","""PRD00861""",2024-05-27,"""SupplierC""","""T-Shirt""","""Denim""",0.21,20.0,"""Method2""","""LayoutB""","""Good"""
"""RPT0000003""","""PRD05391""",2023-11-18,"""SupplierA""","""Shirt""","""Cotton""",0.2,31.0,"""Method1""","""LayoutA""","""Good"""
"""RPT0000004""","""PRD05192""",2024-06-13,"""SupplierA""","""Coat""","""Cotton""",1.3,5.0,"""Method1""","""LayoutD""","""Good"""
"""RPT0000005""","""PRD05735""",2023-07-18,"""SupplierA""","""Coat""","""Polyester""",1.11,9.0,"""Method2""","""LayoutD""","""Good"""


In [7]:
df_density_report.describe()

statistic,ReportID,ProductReference,DateOfReport,SupplierName,GarmentType,Material,Weight,ProposedUnitsPerCarton,ProposedFoldingMethod,ProposedLayout,PackagingQuality
str,str,str,str,str,str,str,f64,f64,str,str,str
"""count""","""500000""","""500000""","""500000""","""500000""","""500000""","""500000""",500000.0,500000.0,"""500000""","""500000""","""500000"""
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0""",0.0,0.0,"""0""","""0""","""0"""
"""mean""",,,"""2023-09-30 15:58:47.712000""",,,,0.461021,99.981055,,,
"""std""",,,,,,,0.349694,864.741016,,,
"""min""","""RPT0000001""","""PRD00""","""2023-01-01""","""SPLF""","""Blouse""","""Cotton""",0.08,-3.0,"""FoldX""","""Box9""","""Bad"""
"""25%""",,,"""2023-05-17""",,,,0.21,16.0,,,
"""50%""",,,"""2023-09-30""",,,,0.33,25.0,,,
"""75%""",,,"""2024-02-14""",,,,0.62,32.0,,,
"""max""","""RPT0500000""","""PRD10000X""","""2024-06-30""","""supplierh""","""T-Shirt""","""Wool""",2.32,9999.0,"""None""","""layouta""","""bad"""


In [None]:
# now we will see the structure of the file
df_density_report.schema

Schema([('ReportID', String),
        ('ProductReference', String),
        ('DateOfReport', Datetime(time_unit='ns', time_zone=None)),
        ('SupplierName', String),
        ('GarmentType', String),
        ('Material', String),
        ('Weight', Float64),
        ('ProposedUnitsPerCarton', Float64),
        ('ProposedFoldingMethod', String),
        ('ProposedLayout', String),
        ('PackagingQuality', String)])

### Naming consistency

We will check if there are  grammar errors in columns like `SupplierName`, `GarmentType`, `Material`, `ProposedFoldingMethod`, `ProposedLayout` and `PackagingQuality`to avoid repetition

In [None]:

columns_to_check = [
    'SupplierName',
    'GarmentType',
    'Material',
    'ProposedFoldingMethod',
    'ProposedLayout',
    'PackagingQuality']

print("Checking Unique Values for Potential Inconsistencies")

for col_name in columns_to_check:
    if col_name in df_density_report.columns:
        try:
            unique_values = (
                df_density_report[col_name]
                .unique()
                .sort()
            )

            print(f"Unique values in: {col_name}")
            print(unique_values.to_list())
            print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

        except Exception as e:
            print(f" Could not process column: {col_name} ")
            print(f"Error: {e}")
    else:
        print(f"Column not found: {col_name}")



Checking Unique Values for Potential Inconsistencies
Unique values in: SupplierName
['SPLF', 'SuplA', 'SupllierC', 'SuppB', 'SupplierA', 'SupplierB', 'SupplierC', 'SupplierD', 'SupplierE', 'SupplierF', 'SupplierG', 'SupplierH', 'supplierA', 'supplierh']
Total unique non-null values: 14
Unique values in: GarmentType
['Blouse', 'Coat', 'Dress', 'Hoodie', 'Jacket', 'Pants', 'Shirt', 'Shorts', 'Skirt', 'Suit', 'Sweater', 'T-Shirt']
Total unique non-null values: 12
Unique values in: Material
['Cotton', 'Denim', 'Linen', 'Polyester', 'Silk', 'Wool']
Total unique non-null values: 6
Unique values in: ProposedFoldingMethod
[None, 'FoldX', 'Methd1', 'Method1', 'Method2', 'Method3', 'Method_2']
Total unique non-null values: 6
Unique values in: ProposedLayout
['Box9', 'LayC', 'LayoutA', 'LayoutB', 'LayoutC', 'LayoutD', 'LayoutE', 'LayoutX', 'layouta']
Total unique non-null values: 9
Unique values in: PackagingQuality
['Bad', 'GOOD', 'Good', 'Uncertain', 'bad']
Total unique non-null values: 5


As we can see, we do have some errors in naming in columns `SupplierName`, `ProposedFoldingMethod`, `ProposedLayout` and `PackagingQuality`. So we need to fix these inconsistencies

In [None]:
#fixing SupplierName
supplier_mapping = {
    'SuplA': 'SupplierA',
    'supplierA': 'SupplierA',
    'SuppB': 'SupplierB',
    'SupllierC': 'SupplierC',
    'SPLF': 'SupplierF',
    'supplierh': 'SupplierH',
}

# Apply the mapping
df_density_report = df_density_report.with_columns(
    pl.col('SupplierName')
      .str.strip_chars()
      .replace(supplier_mapping)
      .str.replace_all(" ", "")
      .alias('SupplierName')
)

# Check the unique values again
unique_values = (
    df_density_report['SupplierName']
    .unique()
    .sort()
)
print(f" Unique values in: SupplierName")
print(unique_values.to_list())
print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

 Unique values in: SupplierName
['SupplierA', 'SupplierB', 'SupplierC', 'SupplierD', 'SupplierE', 'SupplierF', 'SupplierG', 'SupplierH']
Total unique non-null values: 8


In [None]:
#fixing ProposedFoldingMethod
method_mapping = {
    'Methd1': 'Method1',
    'Method_2': 'Method2',
}

# Apply the mapping
df_density_report = df_density_report.with_columns(
    pl.col('ProposedFoldingMethod')
      .str.strip_chars()
      .replace(method_mapping)
      .str.replace_all(" ", "")
      .alias('ProposedFoldingMethod')
)

# Check the unique values again
unique_values = (
    df_density_report['ProposedFoldingMethod']
    .unique()
    .sort()
)
print(f" Unique values in: ProposedFoldingMethod")
print(unique_values.to_list())
print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

 Unique values in: ProposedFoldingMethod
[None, 'FoldX', 'Method1', 'Method2', 'Method3']
Total unique non-null values: 4


In [None]:
#fixing ProposedLayout
layout_mapping = {
    'layouta': 'LayoutA',
    'LayC': 'LayoutC',
}

# Apply the mapping
df_density_report = df_density_report.with_columns(
    pl.col('ProposedLayout')
      .str.strip_chars()
      .replace(layout_mapping)
      .str.replace_all(" ", "")
      .alias('ProposedLayout')
)

# Check the unique values again
unique_values = (
    df_density_report['ProposedLayout']
    .unique()
    .sort()
)
print(f"Unique values in: ProposedLayout")
print(unique_values.to_list())
print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

Unique values in: ProposedLayout
['Box9', 'LayoutA', 'LayoutB', 'LayoutC', 'LayoutD', 'LayoutE', 'LayoutX']
Total unique non-null values: 7


In [None]:
#fixing PackagingQuality
quality_mapping = {
    'GOOD': 'Good',
    'bad': 'Bad',
}

# Apply the mapping
df_density_report = df_density_report.with_columns(
    pl.col('PackagingQuality')
      .str.strip_chars()
      .replace(quality_mapping)
      .str.replace_all(" ", "")
      .alias('PackagingQuality')
)

# Check the unique values again
unique_values = (
    df_density_report['PackagingQuality']
    .unique()
    .sort()
)
print(f" Unique values in: PackagingQuality")
print(unique_values.to_list())
print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

 Unique values in: PackagingQuality
['Bad', 'Good', 'Uncertain']
Total unique non-null values: 3


### Nulls

In [None]:
# do we have null values?
df_density_report.null_count()

ReportID,ProductReference,DateOfReport,SupplierName,GarmentType,Material,Weight,ProposedUnitsPerCarton,ProposedFoldingMethod,ProposedLayout,PackagingQuality
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,2514,0,0


We only have missing values in 1 column `ProposedFoldingMethod` with **2514** out of 500,000 (around **0.5%** of the data).

In [None]:
#check the column with missing values ProposedFoldingMethod
df_density_report.filter(pl.col('ProposedFoldingMethod').is_null()).select(
    ['DateOfReport','SupplierName','GarmentType','Material','Weight','ProposedUnitsPerCarton','ProposedFoldingMethod','ProposedLayout']
).head(10)

DateOfReport,SupplierName,GarmentType,Material,Weight,ProposedUnitsPerCarton,ProposedFoldingMethod,ProposedLayout
datetime[ns],str,str,str,f64,f64,str,str
2023-01-06 00:00:00,"""SupplierB""","""Suit""","""Polyester""",0.88,13.0,,"""LayoutC"""
2023-10-04 00:00:00,"""SupplierC""","""Pants""","""Linen""",0.44,22.0,,"""LayoutC"""
2024-01-10 00:00:00,"""SupplierB""","""Pants""","""Silk""",0.32,32.0,,"""LayoutC"""
2023-09-10 00:00:00,"""SupplierE""","""Shirt""","""Cotton""",0.18,49.0,,"""LayoutB"""
2024-04-28 00:00:00,"""SupplierA""","""Shorts""","""Cotton""",0.33,33.0,,"""LayoutB"""
2023-10-10 00:00:00,"""SupplierC""","""Coat""","""Denim""",1.88,8.0,,"""LayoutE"""
2023-09-14 00:00:00,"""SupplierE""","""Shorts""","""Cotton""",0.33,31.0,,"""LayoutC"""
2023-05-13 00:00:00,"""SupplierE""","""Blouse""","""Polyester""",0.18,43.0,,"""LayoutB"""
2023-04-26 00:00:00,"""SupplierB""","""Jacket""","""Polyester""",0.67,19.0,,"""LayoutC"""
2024-04-17 00:00:00,"""SupplierA""","""Dress""","""Silk""",0.39,25.0,,"""LayoutB"""


Instead of just deleting these rows (and losing data) or picking a random method, we'll make an educated guess based on patterns in the existing data. The most logical assumption is that garments of a similar type, especially when intended for a specific packaging layout are likely folded using the same standard method.

In [None]:
mode_map_combined = (
    df_density_report
    .filter(pl.col('ProposedFoldingMethod').is_not_null())
    .group_by(['GarmentType', 'ProposedLayout'])
    .agg(
        pl.col('ProposedFoldingMethod').mode().first().alias('ModeFoldingMethod_Combined')
    )
)

print("--- Mode Folding Method per (GarmentType, ProposedLayout) ---")
print(mode_map_combined)
print(f"Number of combined groups found: {len(mode_map_combined)}")


# --- Calculate Mode based on GarmentType alone
mode_map_garment = (
    df_density_report
    .filter(pl.col('ProposedFoldingMethod').is_not_null())
    .group_by('GarmentType')
    .agg(
        pl.col('ProposedFoldingMethod').mode().first().alias('ModeFoldingMethod_Garment')
    )
)
print("\n--- Mode Folding Method per GarmentType ---")
print(mode_map_garment)

# --- Calculate Global Mode
global_mode = df_density_report.filter(
    pl.col('ProposedFoldingMethod').is_not_null()
)['ProposedFoldingMethod'].mode().first()

print(f"\nGlobal Mode Folding Method: {global_mode}")

--- Mode Folding Method per (GarmentType, ProposedLayout) ---
shape: (71, 3)
┌─────────────┬────────────────┬────────────────────────────┐
│ GarmentType ┆ ProposedLayout ┆ ModeFoldingMethod_Combined │
│ ---         ┆ ---            ┆ ---                        │
│ str         ┆ str            ┆ str                        │
╞═════════════╪════════════════╪════════════════════════════╡
│ Skirt       ┆ LayoutX        ┆ Method2                    │
│ T-Shirt     ┆ LayoutB        ┆ Method2                    │
│ Coat        ┆ LayoutC        ┆ Method3                    │
│ Hoodie      ┆ LayoutX        ┆ Method1                    │
│ Sweater     ┆ LayoutC        ┆ Method2                    │
│ …           ┆ …              ┆ …                          │
│ Dress       ┆ LayoutB        ┆ Method2                    │
│ Suit        ┆ LayoutB        ┆ Method3                    │
│ Skirt       ┆ LayoutA        ┆ Method3                    │
│ Skirt       ┆ LayoutC        ┆ Method2               

We can see the typical method used for specific `GarmentType` / `ProposedLayout` pairs. We can see for example 'Method1' was common for `Sweater`/`LayoutX`, while many others used 'Method2'.
*   We also see the dominant method for each `GarmentType` individually.
*   The overall most frequent method across all known entries was also determined  `Method2`.

Now that we've identified these 'best guess' methods based on context, we'll use them to fill in the missing values in the original dataset.

In [None]:
# Join the calculated modes back to the main dataframe
df_impute_step1 = df_density_report.join(
    mode_map_combined,
    on=['GarmentType', 'ProposedLayout'],
    how='left'  # Keep all original rows
)

df_impute_step2 = df_impute_step1.join(
    mode_map_garment,
    on='GarmentType',
    how='left'
)

# Impute using coalesce, prioritizing combined, then garment, then global
df_imputed = df_impute_step2.with_columns(
    pl.coalesce(
        pl.col('ProposedFoldingMethod'),
        pl.col('ModeFoldingMethod_Combined'),
        pl.col('ModeFoldingMethod_Garment'),
        pl.lit(global_mode)
    ).alias('ProposedFoldingMethod_Imputed')
)

# Clean up temporary columns
df_final_imputed = df_imputed.drop([
    'ModeFoldingMethod_Combined',
    'ModeFoldingMethod_Garment'
])

df_final_imputed = df_final_imputed.with_columns(
    pl.col('ProposedFoldingMethod_Imputed').alias('ProposedFoldingMethod')
).drop('ProposedFoldingMethod_Imputed')


# Verify the imputation
null_count_after = df_final_imputed['ProposedFoldingMethod'].is_null().sum()
print(f"\nNull count in ProposedFoldingMethod AFTER imputation: {null_count_after}")

if null_count_after == 0:
    print("Successfully imputed all null values.")
else:
    print(f"Warning: {null_count_after} null values remain. Global mode might not have been defined or some groups had no data.")

# Inspect some previously null rows
print("\nExample rows that were previously null:")
print(
    df_density_report
    .filter(pl.col('ProposedFoldingMethod').is_null())
    .select(['DateOfReport', 'GarmentType', 'ProposedLayout'])
    .head(10)
    .join(
        df_final_imputed.select(['DateOfReport', 'GarmentType', 'ProposedLayout', 'ProposedFoldingMethod']),
        on=['DateOfReport', 'GarmentType', 'ProposedLayout'], #
        how='left'
    )
)

df_density_report = df_final_imputed


Null count in ProposedFoldingMethod AFTER imputation: 0
Successfully imputed all null values.

Example rows that were previously null:
shape: (433, 4)
┌─────────────────────┬─────────────┬────────────────┬───────────────────────┐
│ DateOfReport        ┆ GarmentType ┆ ProposedLayout ┆ ProposedFoldingMethod │
│ ---                 ┆ ---         ┆ ---            ┆ ---                   │
│ datetime[ns]        ┆ str         ┆ str            ┆ str                   │
╞═════════════════════╪═════════════╪════════════════╪═══════════════════════╡
│ 2023-01-06 00:00:00 ┆ Suit        ┆ LayoutC        ┆ Method2               │
│ 2023-01-06 00:00:00 ┆ Suit        ┆ LayoutC        ┆ Method2               │
│ 2023-01-06 00:00:00 ┆ Suit        ┆ LayoutC        ┆ Method1               │
│ 2023-01-06 00:00:00 ┆ Suit        ┆ LayoutC        ┆ Method1               │
│ 2023-01-06 00:00:00 ┆ Suit        ┆ LayoutC        ┆ Method1               │
│ …                   ┆ …           ┆ …              ┆ …  

The process of filling the missing `ProposedFoldingMethod` values is complete. The logic prioritized using the most specific contextual information available (combined group mode > garment type mode > global mode) to make the imputations as reasonable as possible.

Now, we will erform a final check to confirm that all the missing values have indeed been filled!

In [None]:
# do we have null values?
df_density_report.null_count()

ReportID,ProductReference,DateOfReport,SupplierName,GarmentType,Material,Weight,ProposedUnitsPerCarton,ProposedFoldingMethod,ProposedLayout,PackagingQuality
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Do we have duplicate rows
num_duplicates = df_density_report.is_duplicated().sum()
print(f"Number of duplicate rows found: {num_duplicates}")

Number of duplicate rows found: 0


### Time of the day

Looking at the data we saw that `DateOfReport` seemed to have only the hours 00:00:00, we will find out if thats the case. If that happens across the dataset, we will remove it because of redundancy.

In [None]:
# we will check if we have different hours than midnight for DateOfReport or if everything is at midnight
from datetime import time

# Check the current data type of the column
current_dtype = df_density_report['DateOfReport'].dtype

if current_dtype == pl.Date:
    print("The 'DateOfReport' column is already of type Date. Time component has been removed.")
    has_non_midnight = False
elif current_dtype == pl.Datetime:
    print("Checking for non-midnight times in Datetime column...")
    has_non_midnight = df_density_report.select(
        (pl.col('DateOfReport').dt.time() != time(0, 0, 0)).any()
    ).item()
    print(f"Has non-midnight times (using time object): {has_non_midnight}")
else:
    print(f"Unexpected data type for 'DateOfReport': {current_dtype}")
    has_non_midnight = None

Checking for non-midnight times in Datetime column...
Has non-midnight times (using time object): False


Because we have the same hour for all of our entries(midnight) we will remove it and just leave the date

In [None]:
if not has_non_midnight:
    print("All times are midnight. Converting 'DateOfReport' to Date type.")
    df_density_report = df_density_report.with_columns(
        pl.col('DateOfReport').cast(pl.Date)
    )
    # Verify the change
    print (df_density_report.head())
else:
    print("Non-midnight times found. Keeping 'DateOfReport' as Datetime.")

All times are midnight. Converting 'DateOfReport' to Date type.
shape: (5, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ ReportID  ┆ ProductRe ┆ DateOfRep ┆ SupplierN ┆ … ┆ ProposedU ┆ ProposedF ┆ ProposedL ┆ Packagin │
│ ---       ┆ ference   ┆ ort       ┆ ame       ┆   ┆ nitsPerCa ┆ oldingMet ┆ ayout     ┆ gQuality │
│ str       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ rton      ┆ hod       ┆ ---       ┆ ---      │
│           ┆ str       ┆ date      ┆ str       ┆   ┆ ---       ┆ ---       ┆ str       ┆ str      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆ str       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ RPT000000 ┆ PRD07271  ┆ 2024-03-0 ┆ SupplierA ┆ … ┆ 29.0      ┆ Method2   ┆ LayoutC   ┆ Good     │
│ 1         ┆           ┆ 4         ┆           ┆   ┆           ┆           ┆           ┆          │
│ RPT000000 

### Numerical values

We will check if theres some strange values in the numerical columns: `Weight` and `ProposedUnitsPerCarton`

In [None]:
# check value distribution for Weight
df_density_report.select(
    pl.col('Weight')
).describe().to_pandas().T

Unnamed: 0,0,1,2,3,4,5,6,7,8
statistic,count,null_count,mean,std,min,25%,50%,75%,max
Weight,500000.0,0.0,0.461021,0.349694,0.08,0.21,0.33,0.62,2.32


In [None]:
# check value distribution for Weight
df_density_report.select(
    pl.col('ProposedUnitsPerCarton')
).describe().to_pandas().T

Unnamed: 0,0,1,2,3,4,5,6,7,8
statistic,count,null_count,mean,std,min,25%,50%,75%,max
ProposedUnitsPerCarton,500000.0,0.0,99.981055,864.741016,-3.0,16.0,25.0,32.0,9999.0


We can see two very noticeable things in `ProposedUnitsPerCarton`. First, we have negative values, which should be impossible and also a maximum value of 9999. We need to clean this situation

In [None]:
count_9999 = df_density_report.filter(pl.col('ProposedUnitsPerCarton') == 9999.0).height
print(f"Number of rows with ProposedUnitsPerCarton == 9999.0: {count_9999}")

Number of rows with ProposedUnitsPerCarton == 9999.0: 3786


In [None]:
#check how many negative values we have in proposed units per carton(count)
negative_count = df_density_report.filter(pl.col('ProposedUnitsPerCarton') < 0).height
print(f"Number of negative values in ProposedUnitsPerCarton: {negative_count}")

# See the unique negative values
negative_values = df_density_report.filter(
    pl.col('ProposedUnitsPerCarton') < 0
)['ProposedUnitsPerCarton'].unique().sort()

print("Unique negative values found:")
print(negative_values)


Number of negative values in ProposedUnitsPerCarton: 3754
Unique negative values found:
shape: (1,)
Series: 'ProposedUnitsPerCarton' [f64]
[
	-3.0
]


We have **3754** negative values, about **0.75%** of 500,000 rows. `9999.0` occurred **3,786** times (approx. **0.76%**). The thing is, Both `-3.0` and `9999.0` seem to be codes or sentinel values indicating missing, unavailable or erroneous data, rather than actual quantities. We will replace all occurrences of *both* `-3.0` and `9999.0` with `None` (null).

In [None]:
original_null_count = df_density_report['ProposedUnitsPerCarton'].is_null().sum()
original_min = df_density_report['ProposedUnitsPerCarton'].min()
original_max = df_density_report['ProposedUnitsPerCarton'].max()
print(f"- Null count before cleaning codes: {original_null_count}")
print(f"- Min value before cleaning codes: {original_min}")
print(f"- Max value before cleaning codes: {original_max}")

# Define the list of invalid code values to replace
invalid_codes = [-3.0, 9999.0]
num_negatives_to_replace = 3754 # Count of -3.0
num_9999_to_replace = 3786    # Count of 9999.0
total_expected_increase = num_negatives_to_replace + num_9999_to_replace

# Replace both specific invalid codes with null using .is_in()
df_density_report = df_density_report.with_columns(
    pl.when(pl.col('ProposedUnitsPerCarton').is_in(invalid_codes))
    .then(None)
    .otherwise(pl.col('ProposedUnitsPerCarton'))
    .alias('ProposedUnitsPerCarton') )

# Verification
# 1. Check the new null count (should have increased by total_expected_increase)
null_count_after = df_density_report['ProposedUnitsPerCarton'].is_null().sum()
print("\nVerifying replacement:")
print(f"- Null count in ProposedUnitsPerCarton AFTER cleaning: {null_count_after}")
if null_count_after == original_null_count + total_expected_increase:
    print(f"  >> Confirmation: Null count increased by {total_expected_increase} as expected.")
else:
    print(f"  >> Warning: Null count change ({null_count_after - original_null_count}) doesn't match expected ({total_expected_increase}). Please review.")

# 2. Check the minimum value (should no longer be negative)
min_after_cleaning = df_density_report['ProposedUnitsPerCarton'].min()
print(f"- Minimum value in ProposedUnitsPerCarton after cleaning: {min_after_cleaning}")

# 3. Check the maximum value (should no longer be 9999.0)
max_after_cleaning = df_density_report['ProposedUnitsPerCarton'].max()
print(f"- Maximum value in ProposedUnitsPerCarton after cleaning: {max_after_cleaning}")

# 4. Check if the specific codes remain (should be none)
remaining_negatives = df_density_report.filter(pl.col('ProposedUnitsPerCarton') == -3.0).height
remaining_9999 = df_density_report.filter(pl.col('ProposedUnitsPerCarton') == 9999.0).height
print(f"- Remaining count of -3.0: {remaining_negatives}")
print(f"- Remaining count of 9999.0: {remaining_9999}")
if remaining_negatives == 0 and remaining_9999 == 0:
     print("  >> Confirmation: Invalid codes successfully removed.")
else:
     print("  >> Warning: Some invalid code values still detected.")

- Null count before cleaning codes: 0
- Min value before cleaning codes: -3.0
- Max value before cleaning codes: 9999.0

Verifying replacement:
- Null count in ProposedUnitsPerCarton AFTER cleaning: 7540
  >> Confirmation: Null count increased by 7540 as expected.
- Minimum value in ProposedUnitsPerCarton after cleaning: 0.0
- Maximum value in ProposedUnitsPerCarton after cleaning: 49.0
- Remaining count of -3.0: 0
- Remaining count of 9999.0: 0
  >> Confirmation: Invalid codes successfully removed.


Now we have **7,540** null values, aprox **1.5%** of the data. We will follow a similar approach to inpute the nulls like we did before, finding the median for each garment type.

In [None]:
1. #Calculate median per GarmentType (filter out nulls first)
median_map_garment = (
    df_density_report.filter(pl.col('ProposedUnitsPerCarton').is_not_null()) # Use only non-null values for calculation
    .group_by('GarmentType')
    .agg(pl.median('ProposedUnitsPerCarton').alias('MedianUnits_Garment'))
)
print("\n--- Median Units per GarmentType (calculated from valid data) ---")
print(median_map_garment)

# 2. Calculate global median from valid data
global_valid_median = df_density_report.filter(
    pl.col('ProposedUnitsPerCarton').is_not_null() # Use only non-null values
)['ProposedUnitsPerCarton'].median()

print(f"\nGlobal valid median (for fallback): {global_valid_median}")

# Check if global median calculation was successful
if global_valid_median is None:
    print("Warning: Could not calculate global median. Imputation might fail.")



--- Median Units per GarmentType (calculated from valid data) ---
shape: (12, 2)
┌─────────────┬─────────────────────┐
│ GarmentType ┆ MedianUnits_Garment │
│ ---         ┆ ---                 │
│ str         ┆ f64                 │
╞═════════════╪═════════════════════╡
│ Skirt       ┆ 27.0                │
│ Pants       ┆ 26.0                │
│ Suit        ┆ 11.0                │
│ Coat        ┆ 7.0                 │
│ Sweater     ┆ 15.0                │
│ …           ┆ …                   │
│ Shorts      ┆ 27.0                │
│ Hoodie      ┆ 13.0                │
│ Jacket      ┆ 14.0                │
│ Shirt       ┆ 33.0                │
│ Dress       ┆ 22.0                │
└─────────────┴─────────────────────┘

Global valid median (for fallback): 25.0


In [None]:
# 3. Join the calculated group medians back
df_impute_step = df_density_report.join(
    median_map_garment, on='GarmentType', how='left'
)

# 4. Impute using coalesce (Original -> Group Median -> Global Median)
df_imputed = df_impute_step.with_columns(
    pl.coalesce(
        pl.col('ProposedUnitsPerCarton'),
        pl.col('MedianUnits_Garment'),
        pl.lit(global_valid_median)
    ).alias('ProposedUnitsPerCarton_Imputed')
)

# 5. Clean up temporary column and overwrite original
df_final = df_imputed.drop(['MedianUnits_Garment'])
df_final = df_final.with_columns(
    pl.col('ProposedUnitsPerCarton_Imputed').alias('ProposedUnitsPerCarton')
).drop('ProposedUnitsPerCarton_Imputed')


# Verification
final_null_count = df_final['ProposedUnitsPerCarton'].is_null().sum()
print(f"\nNull count in ProposedUnitsPerCarton AFTER imputation: {final_null_count}")

if final_null_count == 0:
    print("Successfully imputed all null values.")
    print("Describe ProposedUnitsPerCarton after imputation")
    print(df_final.select(pl.col('ProposedUnitsPerCarton')).describe())
else:
    print(f"Warning: {final_null_count} null values remain. Review median calculations and fallback.")

df_density_report = df_final


Null count in ProposedUnitsPerCarton AFTER imputation: 0
Successfully imputed all null values.
Describe ProposedUnitsPerCarton after imputation
shape: (9, 2)
┌────────────┬────────────────────────┐
│ statistic  ┆ ProposedUnitsPerCarton │
│ ---        ┆ ---                    │
│ str        ┆ f64                    │
╞════════════╪════════════════════════╡
│ count      ┆ 500000.0               │
│ null_count ┆ 0.0                    │
│ mean       ┆ 24.663351              │
│ std        ┆ 11.1913                │
│ min        ┆ 0.0                    │
│ 25%        ┆ 16.0                   │
│ 50%        ┆ 25.0                   │
│ 75%        ┆ 32.0                   │
│ max        ┆ 49.0                   │
└────────────┴────────────────────────┘


In [None]:
df_density_report

ReportID,ProductReference,DateOfReport,SupplierName,GarmentType,Material,Weight,ProposedUnitsPerCarton,ProposedFoldingMethod,ProposedLayout,PackagingQuality
str,str,date,str,str,str,f64,f64,str,str,str
"""RPT0000001""","""PRD07271""",2024-03-04,"""SupplierA""","""Pants""","""Polyester""",0.35,29.0,"""Method2""","""LayoutC""","""Good"""
"""RPT0000002""","""PRD00861""",2024-05-27,"""SupplierC""","""T-Shirt""","""Denim""",0.21,20.0,"""Method2""","""LayoutB""","""Good"""
"""RPT0000003""","""PRD05391""",2023-11-18,"""SupplierA""","""Shirt""","""Cotton""",0.2,31.0,"""Method1""","""LayoutA""","""Good"""
"""RPT0000004""","""PRD05192""",2024-06-13,"""SupplierA""","""Coat""","""Cotton""",1.3,5.0,"""Method1""","""LayoutD""","""Good"""
"""RPT0000005""","""PRD05735""",2023-07-18,"""SupplierA""","""Coat""","""Polyester""",1.11,9.0,"""Method2""","""LayoutD""","""Good"""
…,…,…,…,…,…,…,…,…,…,…
"""RPT0499996""","""PRD06239""",2023-03-26,"""SupplierB""","""T-Shirt""","""Polyester""",0.13,43.0,"""Method1""","""LayoutB""","""Bad"""
"""RPT0499997""","""PRD02248""",2023-11-16,"""SupplierB""","""T-Shirt""","""Cotton""",0.14,37.0,"""Method1""","""LayoutA""","""Good"""
"""RPT0499998""","""PRD07434""",2024-05-17,"""SupplierE""","""Pants""","""Cotton""",0.42,12.5,"""Method1""","""LayoutB""","""Good"""
"""RPT0499999""","""PRD04320""",2024-01-11,"""SupplierB""","""Dress""","""Cotton""",0.51,19.0,"""Method1""","""LayoutB""","""Good"""


### Uncertain in `PackagingQuality`

In the instructions we have this: PackagingQuality: Operational label indicating the packaging quality ("Good" or "Bad"),
based on predefined criteria. We shouldnt have `Uncertain` as an option, so we need to analyze this

In [None]:
# how many uncertain values with 'Uncertain' are there in PackagingQuality
uncertain_count = df_density_report.filter(pl.col('PackagingQuality') == 'Uncertain').shape[0]
print(f"Total uncertain values in PackagingQuality: {uncertain_count}")

#lets check if they are recent or have older dates
# Ensure 'DateOfReport' is of Date type if it's not already (it should be from cell 35)
if df_density_report['DateOfReport'].dtype != pl.Date:
     df_density_report = df_density_report.with_columns(
         pl.col('DateOfReport').cast(pl.Date)
     )

# Filter for 'Uncertain' quality and check date distribution
uncertain_dates = df_density_report.filter(pl.col('PackagingQuality') == 'Uncertain')['DateOfReport']

print(f"Date statistics for 'Uncertain' PackagingQuality")
if uncertain_dates.len() > 0:
    print(f"Minimum Date: {uncertain_dates.min()}")
    print(f"Maximum Date: {uncertain_dates.max()}")
    print(f"Number of unique dates: {uncertain_dates.n_unique()}")
    # You can add more analysis like plotting the distribution if needed
    # Example: uncertain_dates.value_counts().sort('DateOfReport').head()
else:
    print("No rows found with 'Uncertain' PackagingQuality.")


# Check the overall date range in the dataframe
min_date = df_density_report['DateOfReport'].min()
max_date = df_density_report['DateOfReport'].max()
print(f"Overall Date Range in df_density_report")
print(f"Minimum Date: {min_date}")
print(f"Maximum Date: {max_date}")
if min_date is not None and max_date is not None:
    date_range = max_date - min_date
    print(f"Total Date Range Span: {date_range}")
else:
     print("Could not calculate date range (min or max date is null).")


Total uncertain values in PackagingQuality: 1863
Date statistics for 'Uncertain' PackagingQuality
Minimum Date: 2023-01-01
Maximum Date: 2024-06-30
Number of unique dates: 526
Overall Date Range in df_density_report
Minimum Date: 2023-01-01
Maximum Date: 2024-06-30
Total Date Range Span: 546 days, 0:00:00


The dates are not an issue, its not the more recent packages that are `Uncertain`. Because of the importance of the column, we will leave them as  `null` for now.

## 2. Historical Incidents

In [None]:
# Read excel file of historical incidents
df_historical_incidents = pl.from_pandas(pd.read_excel(PATH_HISTORICAL_INCIDENTS))

In [None]:
df_historical_incidents.head()

ProductReference,SupplierName,DateOfIncident,IssueDescription,ResolutionStatus,CostImpact (€)
str,str,datetime[ns],str,str,f64
"""PRD08586""","""SupplierC""",2023-10-25 00:00:00,"""Other""","""Resolved""",69.0
"""PRD06004""","""SupplierA""",2024-03-07 00:00:00,"""Packaging Damage""","""Resolved""",1912.0
"""PRD04841""","""SupplierC""",2023-01-19 00:00:00,"""Missing Items""","""Resolved""",379.0
"""PRD02036""","""SupplierC""",2024-05-28 00:00:00,"""Other""","""In Progress""",327.0
"""PRD02537""","""SupplierE""",2023-08-11 00:00:00,"""Incorrect Folding""","""Not Resolved""",560.0


In [None]:
# now we will see the structure of the file
df_density_report.schema

Schema([('ReportID', String),
        ('ProductReference', String),
        ('DateOfReport', Date),
        ('SupplierName', String),
        ('GarmentType', String),
        ('Material', String),
        ('Weight', Float64),
        ('ProposedUnitsPerCarton', Float64),
        ('ProposedFoldingMethod', String),
        ('ProposedLayout', String),
        ('PackagingQuality', String)])

In [None]:
# do we have null values?
df_historical_incidents.null_count()

ProductReference,SupplierName,DateOfIncident,IssueDescription,ResolutionStatus,CostImpact (€)
u32,u32,u32,u32,u32,u32
0,0,0,0,0,0


In [None]:
# do we have repeated values?
num_duplicates = df_historical_incidents.is_duplicated().sum()
print(f"Number of duplicate rows found: {num_duplicates}")

Number of duplicate rows found: 0


We dont have nulls or duplicates, we will repeat the same process as before. We will start with the hour in `DateOfIncident`.

### Time of the day

In [None]:
# Check if any non-midnight times exist
has_non_midnight_incident = df_historical_incidents.select(
    (pl.col('DateOfIncident').dt.time() != time(0, 0, 0)).any()
).item()

print(f"Does 'DateOfIncident' have non-midnight times? {has_non_midnight_incident}")

if not has_non_midnight_incident:
    print("All times are midnight.")
    # Corrected line: Use df_historical_incidents instead of x
    df_historical_incidents = df_historical_incidents.with_columns(
        pl.col('DateOfIncident').cast(pl.Date)
    )
    # Optional: Verify the change
    print(f"New dtype of DateOfIncident: {df_historical_incidents['DateOfIncident'].dtype}")
else:
    print("Non-midnight times found. Keeping 'DateOfIncident' as Datetime.")

Does 'DateOfIncident' have non-midnight times? False
All times are midnight.
New dtype of DateOfIncident: Date


### Naming consistency

In [None]:
columns_to_check = [
    'SupplierName',
    'IssueDescription',
    'ResolutionStatus']

print("Checking Unique Values for Potential Inconsistencies in Historical Incidents")

for col_name in columns_to_check:
    if col_name in df_historical_incidents.columns:
        try:
            unique_values = (
                df_historical_incidents[col_name]
                .unique()
                .sort()
            )

            print(f"Unique values in: {col_name}")
            print(unique_values.to_list())
            print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

        except Exception as e:
            print(f"Could not process column: {col_name}")
            print(f"Error: {e}")
    else:
        print(f"Column not found: {col_name}")



Checking Unique Values for Potential Inconsistencies in Historical Incidents
Unique values in: SupplierName
['SPLF', 'SuplA', 'SupllierC', 'SuppB', 'SupplierA', 'SupplierB', 'SupplierC', 'SupplierD', 'SupplierE', 'SupplierF', 'SupplierG', 'SupplierH', 'supplierA', 'supplierh']
Total unique non-null values: 14
Unique values in: IssueDescription
['Incorrect Folding', 'Labeling Error', 'Missing Items', 'Other', 'Packaging Damage', 'Product Wrinkled', 'Transportation Damage']
Total unique non-null values: 7
Unique values in: ResolutionStatus
['In Progress', 'Not Resolved', 'Resolved']
Total unique non-null values: 3


Good, we only have wrong names in `SupplierName`, we will do the same as with the first dataset.

In [None]:
#fixing SupplierName
supplier_mapping_hi = {
    'SuplA': 'SupplierA',
    'supplierA': 'SupplierA',
    'SuppB': 'SupplierB',
    'SupllierC': 'SupplierC',
    'SPLF': 'SupplierF',
    'supplierh': 'SupplierH',
}

# Apply the mapping
df_historical_incidents = df_historical_incidents.with_columns(
    pl.col('SupplierName')
      .str.strip_chars()
      .replace(supplier_mapping_hi)
      .str.replace_all(" ", "")
      .alias('SupplierName')
)

# Check the unique values again
unique_values = (
    df_historical_incidents['SupplierName']
    .unique()
    .sort()
)
print(f"Unique values in: SupplierName")
print(unique_values.to_list())
print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

Unique values in: SupplierName
['SupplierA', 'SupplierB', 'SupplierC', 'SupplierD', 'SupplierE', 'SupplierF', 'SupplierG', 'SupplierH']
Total unique non-null values: 8


In [None]:
print("\nChecking statistics for 'CostImpact (€)':")
# Handle potential column name issues if the ' (€)' causes problems later, but describe should work.
try:
    cost_stats = df_historical_incidents.select(pl.col('CostImpact (€)')).describe()
    print(cost_stats) # .to_pandas().T might be easier to read if preferred

    # Specifically check min value
    min_cost = df_historical_incidents['CostImpact (€)'].min()
    print(f"\nMinimum CostImpact (€): {min_cost}")
    if min_cost is not None and min_cost < 0:
         print("Warning: Negative CostImpact found!")

except Exception as e:
    print(f"Error describing 'CostImpact (€)': {e}")
    print("Check if the column name is exactly 'CostImpact (€)'")


Checking statistics for 'CostImpact (€)':
shape: (9, 2)
┌────────────┬────────────────┐
│ statistic  ┆ CostImpact (€) │
│ ---        ┆ ---            │
│ str        ┆ f64            │
╞════════════╪════════════════╡
│ count      ┆ 18000.0        │
│ null_count ┆ 0.0            │
│ mean       ┆ 555.215833     │
│ std        ┆ 492.948178     │
│ min        ┆ 50.0           │
│ 25%        ┆ 224.0          │
│ 50%        ┆ 365.0          │
│ 75%        ┆ 693.75         │
│ max        ┆ 2500.0         │
└────────────┴────────────────┘

Minimum CostImpact (€): 50.0


## 3. Product Attributes

There is no need to clean this dataset

In [None]:
# Read excel file of product attributes
df_product_attributes = pl.from_pandas(pd.read_excel(PATH_PRODUCT_ATTRIBUTES))

In [None]:
df_product_attributes.head()

ProductReference,ProductName,GarmentType,Material,Size,Collection,Weight
str,str,str,str,str,str,f64
"""PRD00001""","""Jacket Cotton L""","""Jacket""","""Cotton""","""L""","""Summer""",0.84
"""PRD00002""","""Shorts Cotton S""","""Shorts""","""Cotton""","""S""","""Summer""",0.35
"""PRD00003""","""Sweater Cotton M""","""Sweater""","""Cotton""","""M""","""Spring""",0.85
"""PRD00004""","""Skirt Polyester L""","""Skirt""","""Polyester""","""L""","""Winter""",0.25
"""PRD00005""","""Shirt Polyester M""","""Shirt""","""Polyester""","""M""","""Spring""",0.16


In [None]:
# now we will see the structure of the file
df_product_attributes.schema

Schema([('ProductReference', String),
        ('ProductName', String),
        ('GarmentType', String),
        ('Material', String),
        ('Size', String),
        ('Collection', String),
        ('Weight', Float64)])

In [None]:
# do we have duplicate rows
num_duplicates = df_product_attributes.is_duplicated().sum()
print(f"Number of duplicate rows found: {num_duplicates}")
# do we have null values?
num_nulls = df_product_attributes.null_count()
print(f"Number of null values found: {num_nulls}")

Number of duplicate rows found: 0
Number of null values found: shape: (1, 7)
┌──────────────────┬─────────────┬─────────────┬──────────┬──────┬────────────┬────────┐
│ ProductReference ┆ ProductName ┆ GarmentType ┆ Material ┆ Size ┆ Collection ┆ Weight │
│ ---              ┆ ---         ┆ ---         ┆ ---      ┆ ---  ┆ ---        ┆ ---    │
│ u32              ┆ u32         ┆ u32         ┆ u32      ┆ u32  ┆ u32        ┆ u32    │
╞══════════════════╪═════════════╪═════════════╪══════════╪══════╪════════════╪════════╡
│ 0                ┆ 0           ┆ 0           ┆ 0        ┆ 0    ┆ 0          ┆ 0      │
└──────────────────┴─────────────┴─────────────┴──────────┴──────┴────────────┴────────┘


In [None]:
# Check if ProductReference is unique
is_prod_ref_unique = df_product_attributes['ProductReference'].is_unique().all()
print(f"Is ProductReference unique across all rows? {is_prod_ref_unique}")
if not is_prod_ref_unique:
     non_unique_prod_refs = df_product_attributes.group_by('ProductReference').agg(pl.count()).filter(pl.col('count') > 1)
     print(f"Number of non-unique ProductReferences: {non_unique_prod_refs.height}")


Is ProductReference unique across all rows? True


In [None]:
columns_to_check = [
    'GarmentType',
    'Material',
    'Size',
    'Collection']

print("Checking Unique Values for Potential Inconsistencies in Historical Incidents")

for col_name in columns_to_check:
    if col_name in df_product_attributes.columns:
        try:
            unique_values = (
                df_product_attributes[col_name]
                .unique()
                .sort()
            )

            print(f"Unique values in: {col_name}")
            print(unique_values.to_list())
            print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

        except Exception as e:
            print(f"Could not process column: {col_name}")
            print(f"Error: {e}")
    else:
        print(f"Column not found: {col_name}")



Checking Unique Values for Potential Inconsistencies in Historical Incidents
Unique values in: GarmentType
['Blouse', 'Coat', 'Dress', 'Hoodie', 'Jacket', 'Pants', 'Shirt', 'Shorts', 'Skirt', 'Suit', 'Sweater', 'T-Shirt']
Total unique non-null values: 12
Unique values in: Material
['Cotton', 'Denim', 'Linen', 'Polyester', 'Silk', 'Wool']
Total unique non-null values: 6
Unique values in: Size
['L', 'M', 'S', 'XL', 'XS']
Total unique non-null values: 5
Unique values in: Collection
['Autumn', 'Spring', 'Summer', 'Winter']
Total unique non-null values: 4


## 4. Supplier Scorecard

In [None]:
# Read excel file of supplier scorecard
df_supplier_scorecard = pl.from_pandas(pd.read_excel(PATH_SUPPLIER_SCORECARD))

In [None]:
df_supplier_scorecard.head()

SupplierName,Month,PackagesHandled,BadPackagingRate (%),TotalIncidents,AverageCostPerIncident (€),OnTimeDeliveryRate (%),AnomaliesDetected
str,str,i64,f64,i64,f64,f64,i64
"""SupplierA""","""2023-01""",7841,8.46,133,538.23,86.01,23
"""SupplierA""","""2023-02""",7196,7.78,153,572.14,88.09,16
"""SupplierA""","""2023-03""",7842,7.94,163,547.51,84.74,21
"""SupplierA""","""2023-04""",7587,7.7,158,588.33,91.58,18
"""SupplierA""","""2023-05""",8010,7.77,166,618.34,96.87,18


In [None]:
df_supplier_scorecard.describe()

statistic,SupplierName,Month,PackagesHandled,BadPackagingRate (%),TotalIncidents,AverageCostPerIncident (€),OnTimeDeliveryRate (%),AnomaliesDetected
str,str,str,f64,f64,f64,f64,f64,f64
"""count""","""252""","""252""",252.0,252.0,252.0,252.0,252.0,252.0
"""null_count""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,,1984.126984,23.04754,70.436508,552.654524,75.397857,11.698413
"""std""",,,2382.57589,9.242888,71.666892,215.370064,8.923621,12.036644
"""min""","""SPLF""","""2023-01""",66.0,7.29,0.0,0.0,60.0,0.0
"""25%""",,,95.0,16.67,3.0,484.78,68.88,1.0
"""50%""",,,1343.0,21.05,46.0,544.72,76.46,9.0
"""75%""",,,2957.0,27.46,149.0,611.59,81.84,20.0
"""max""","""supplierh""","""2024-06""",8019.0,45.85,210.0,1969.0,96.87,43.0


In [None]:
#now we will see the structure of the file
df_supplier_scorecard.schema

Schema([('SupplierName', String),
        ('Month', String),
        ('PackagesHandled', Int64),
        ('BadPackagingRate (%)', Float64),
        ('TotalIncidents', Int64),
        ('AverageCostPerIncident (€)', Float64),
        ('OnTimeDeliveryRate (%)', Float64),
        ('AnomaliesDetected', Int64)])

In [None]:
# do we have duplicate rows
num_duplicates = df_supplier_scorecard.is_duplicated().sum()
print(f"Number of duplicate rows found: {num_duplicates}")
# do we have null values?
num_nulls = df_supplier_scorecard.null_count()
print(f"Number of null values found: {num_nulls}")


Number of duplicate rows found: 0
Number of null values found: shape: (1, 8)
┌────────────┬───────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ SupplierNa ┆ Month ┆ PackagesHa ┆ BadPackagi ┆ TotalIncid ┆ AverageCos ┆ OnTimeDeli ┆ AnomaliesD │
│ me         ┆ ---   ┆ ndled      ┆ ngRate (%) ┆ ents       ┆ tPerIncide ┆ veryRate   ┆ etected    │
│ ---        ┆ u32   ┆ ---        ┆ ---        ┆ ---        ┆ nt (€)     ┆ (%)        ┆ ---        │
│ u32        ┆       ┆ u32        ┆ u32        ┆ u32        ┆ ---        ┆ ---        ┆ u32        │
│            ┆       ┆            ┆            ┆            ┆ u32        ┆ u32        ┆            │
╞════════════╪═══════╪════════════╪════════════╪════════════╪════════════╪════════════╪════════════╡
│ 0          ┆ 0     ┆ 0          ┆ 0          ┆ 0          ┆ 0          ┆ 0          ┆ 0          │
└────────────┴───────┴────────────┴────────────┴────────────┴────────────┴────────────┴────────────┘


In [None]:

# Identify the combinations that are duplicated
duplicated_keys = (
    df_supplier_scorecard
    .group_by(['SupplierName', 'Month'])
    .agg(pl.len().alias('count'))
    .filter(pl.col('count') > 1)
    .select(['SupplierName', 'Month']) # Select only the key columns
)

# Join back to the original data to get ALL columns for the duplicated keys
# Use an inner join to only get rows matching the duplicated keys
view_duplicates = duplicated_keys.join(
    df_supplier_scorecard,
    on=['SupplierName', 'Month'],
    how='inner'
).sort(['SupplierName', 'Month']) # Sort to see duplicates grouped together

print(f"Viewing rows where (SupplierName, Month) combinations are duplicated (showing first ~10 duplicates):")

# Show enough rows to see a few examples of duplicates
print(view_duplicates.head(10))

# You might want to look at a specific example found earlier, e.g., SupplierA / 2023-02-01
# print("\nExample: SupplierA for 2023-02-01")
# print(df_supplier_scorecard.filter(
#     (pl.col('SupplierName') == 'SupplierA') & (pl.col('Month') == pl.lit(date(2023, 2, 1))) # Need lit() and date object if comparing directly
# ))

Viewing rows where (SupplierName, Month) combinations are duplicated (showing first ~10 duplicates):
shape: (0, 8)
┌────────────┬───────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ SupplierNa ┆ Month ┆ PackagesHa ┆ BadPackagi ┆ TotalIncid ┆ AverageCos ┆ OnTimeDeli ┆ AnomaliesD │
│ me         ┆ ---   ┆ ndled      ┆ ngRate (%) ┆ ents       ┆ tPerIncide ┆ veryRate   ┆ etected    │
│ ---        ┆ str   ┆ ---        ┆ ---        ┆ ---        ┆ nt (€)     ┆ (%)        ┆ ---        │
│ str        ┆       ┆ i64        ┆ f64        ┆ i64        ┆ ---        ┆ ---        ┆ i64        │
│            ┆       ┆            ┆            ┆            ┆ f64        ┆ f64        ┆            │
╞════════════╪═══════╪════════════╪════════════╪════════════╪════════════╪════════════╪════════════╡
└────────────┴───────┴────────────┴────────────┴────────────┴────────────┴────────────┴────────────┘


### Time of day

In [None]:
print(f"Original dtype of Month: {df_supplier_scorecard['Month'].dtype}")

try:
    # Convert 'YYYY-MM' string to Date (defaults to 1st of the month)
    df_supplier_scorecard = df_supplier_scorecard.with_columns(
        pl.col("Month").str.to_date("%Y-%m").alias("Month")
    )
    # Verify the change
    print(f"New dtype of Month: {df_supplier_scorecard['Month'].dtype}")
    print("Successfully converted Month column to Date type.")
    print("\nSample data after Month conversion:")
    print(df_supplier_scorecard.head(3))

except Exception as e:
    print(f"Error converting Month column: {e}")
    print("Please check if all values strictly follow the 'YYYY-MM' format.")

Original dtype of Month: String
New dtype of Month: Date
Successfully converted Month column to Date type.

Sample data after Month conversion:
shape: (3, 8)
┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ SupplierNa ┆ Month      ┆ PackagesHa ┆ BadPackag ┆ TotalInci ┆ AverageCo ┆ OnTimeDel ┆ Anomalies │
│ me         ┆ ---        ┆ ndled      ┆ ingRate   ┆ dents     ┆ stPerInci ┆ iveryRate ┆ Detected  │
│ ---        ┆ date       ┆ ---        ┆ (%)       ┆ ---       ┆ dent (€)  ┆ (%)       ┆ ---       │
│ str        ┆            ┆ i64        ┆ ---       ┆ i64       ┆ ---       ┆ ---       ┆ i64       │
│            ┆            ┆            ┆ f64       ┆           ┆ f64       ┆ f64       ┆           │
╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ SupplierA  ┆ 2023-01-01 ┆ 7841       ┆ 8.46      ┆ 133       ┆ 538.23    ┆ 86.01     ┆ 23        │
│ SupplierA  ┆ 2023-02-01 ┆ 7196  

### Naming consistency

In [None]:
columns_to_check = [
    'SupplierName']

print("Checking Unique Values for Potential Inconsistencies in Historical Incidents")

for col_name in columns_to_check:
    if col_name in df_supplier_scorecard.columns:
        try:
            unique_values = (
                df_supplier_scorecard[col_name]
                .unique()
                .sort()
            )

            print(f"Unique values in: {col_name}")
            print(unique_values.to_list())
            print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

        except Exception as e:
            print(f"Could not process column: {col_name}")
            print(f"Error: {e}")
    else:
        print(f"Column not found: {col_name}")


Checking Unique Values for Potential Inconsistencies in Historical Incidents
Unique values in: SupplierName
['SPLF', 'SuplA', 'SupllierC', 'SuppB', 'SupplierA', 'SupplierB', 'SupplierC', 'SupplierD', 'SupplierE', 'SupplierF', 'SupplierG', 'SupplierH', 'supplierA', 'supplierh']
Total unique non-null values: 14


In [None]:
#fixing SupplierName
supplier_mapping_ss = {
    'SuplA': 'SupplierA',
    'supplierA': 'SupplierA',
    'SuppB': 'SupplierB',
    'SupllierC': 'SupplierC',
    'SPLF': 'SupplierF',
    'supplierh': 'SupplierH',
}

# Apply the mapping
df_supplier_scorecard = df_supplier_scorecard.with_columns(
    pl.col('SupplierName')
      .str.strip_chars()
      .replace(supplier_mapping_ss)
      .str.replace_all(" ", "")
      .alias('SupplierName')
)

# Check the unique values again
unique_values = (
    df_supplier_scorecard['SupplierName']
    .unique()
    .sort()
)
print(f"Unique values in: SupplierName")
print(unique_values.to_list())
print(f"Total unique non-null values: {len(unique_values.drop_nulls())}")

Unique values in: SupplierName
['SupplierA', 'SupplierB', 'SupplierC', 'SupplierD', 'SupplierE', 'SupplierF', 'SupplierG', 'SupplierH']
Total unique non-null values: 8


### Data Duplicates

In [None]:

# Identify the combinations that are duplicated
duplicated_keys = (
    df_supplier_scorecard
    .group_by(['SupplierName', 'Month'])
    .agg(pl.len().alias('count'))
    .filter(pl.col('count') > 1)
    .select(['SupplierName', 'Month']) # Select only the key columns
)

# Join back to the original data to get ALL columns for the duplicated keys
# Use an inner join to only get rows matching the duplicated keys
view_duplicates = duplicated_keys.join(
    df_supplier_scorecard,
    on=['SupplierName', 'Month'],
    how='inner'
).sort(['SupplierName', 'Month']) # Sort to see duplicates grouped together

print(f"Viewing rows where (SupplierName, Month) combinations are duplicated (showing first ~10 duplicates):")

# Show enough rows to see a few examples of duplicates
print(view_duplicates.head(10))


Viewing rows where (SupplierName, Month) combinations are duplicated (showing first ~10 duplicates):
shape: (10, 8)
┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ SupplierNa ┆ Month      ┆ PackagesHa ┆ BadPackag ┆ TotalInci ┆ AverageCo ┆ OnTimeDel ┆ Anomalies │
│ me         ┆ ---        ┆ ndled      ┆ ingRate   ┆ dents     ┆ stPerInci ┆ iveryRate ┆ Detected  │
│ ---        ┆ date       ┆ ---        ┆ (%)       ┆ ---       ┆ dent (€)  ┆ (%)       ┆ ---       │
│ str        ┆            ┆ i64        ┆ ---       ┆ i64       ┆ ---       ┆ ---       ┆ i64       │
│            ┆            ┆            ┆ f64       ┆           ┆ f64       ┆ f64       ┆           │
╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ SupplierA  ┆ 2023-01-01 ┆ 102        ┆ 19.61     ┆ 1         ┆ 261.0     ┆ 74.56     ┆ 0         │
│ SupplierA  ┆ 2023-01-01 ┆ 76         ┆ 14.47     ┆ 3         ┆ 591.0     ┆

## Export

In [None]:
df_density_report.write_csv(EXPORT_DENSITY_REPORT, separator=";")
df_historical_incidents.write_csv(EXPORT_HISTORICAL_INCIDENTS, separator=";")
df_product_attributes.write_csv(EXPORT_PRODUCT_ATTRIBUTES, separator=";")
df_supplier_scorecard.write_csv(EXPORT_SUPPLIER_SCORECARD, separator=";")