# Data cleaning

This code is specifically designed to clean the data provided by Capgemini for the Capstone Project 2025.

## Libraries import

In [1]:
import pandas as pd
import polars as pl

## Global variables

In [14]:
# Input workbooks (Excel), one per dataset.
PATH_DENSITY_REPORT = "DensityReports_500k.xlsx"
PATH_HISTORICAL_INCIDENTS = "HistoricalIncidents.xlsx"
PATH_PRODUCT_ATTRIBUTES = "ProductAttributes.xlsx"
PATH_SUPPLIER_SCORECARD = "SupplierScorecard.xlsx"

# Output files (CSV), one per dataset, written by the export cell.
EXPORT_DENSITY_REPORT = "density_report.csv"
EXPORT_HISTORICAL_INCIDENTS = "historical_incidents.csv"
EXPORT_PRODUCT_ATTRIBUTES = "product_attributes.csv"
EXPORT_SUPPLIER_SCORECARD = "supplier_scorecard.csv"

## Execution

### 1. Density Report

In [3]:
# Load the density report workbook with pandas, then convert it to a polars frame.
df_density_report = pd.read_excel(PATH_DENSITY_REPORT).pipe(pl.from_pandas)

# Misspelled supplier names mapped to their corrected spelling.
corrections = dict(
    SuplierA="SupplierA",
    SuplierB="SupplierB",
    SuplierC="SupplierC",
)

# All three column fixes touch different columns, so they are applied in a single
# pass; each keyword names the column that the expression overwrites.
df_density_report = (
    df_density_report
    .with_columns(
        # Fix the misspelled supplier names.
        SupplierName=pl.col("SupplierName").replace(corrections),
        # Null out unit counts that are <= -5 or >= 50.
        # NOTE(review): values from -4 to 0 are kept by this rule — confirm the
        # lower bound is intended.
        ProposedUnitsPerCarton=pl.when(
            pl.col("ProposedUnitsPerCarton").le(-5)
            | pl.col("ProposedUnitsPerCarton").ge(50)
        )
        .then(None)
        .otherwise(pl.col("ProposedUnitsPerCarton")),
        # Turn the "Unknown" placeholder into a null folding method.
        ProposedFoldingMethod=pl.when(pl.col("ProposedFoldingMethod").eq("Unknown"))
        .then(None)
        .otherwise(pl.col("ProposedFoldingMethod")),
    )
    # Remove DataLabeled; per the original note it is the last column and holds
    # the same value on every record.
    .drop("DataLabeled")
)

### 2. Historical Incidents

No cleaning is needed for this dataset; it is only loaded here and exported unchanged.

In [10]:
# Load the historical incidents workbook with pandas, then convert it to polars.
df_historical_incidents = pd.read_excel(PATH_HISTORICAL_INCIDENTS).pipe(pl.from_pandas)

### 3. Product Attributes

No cleaning is needed for this dataset; it is only loaded here and exported unchanged.

In [12]:
# Load the product attributes workbook with pandas, then convert it to polars.
df_product_attributes = pd.read_excel(PATH_PRODUCT_ATTRIBUTES).pipe(pl.from_pandas)

### 4. Supplier Scorecard

In [16]:
# Load the supplier scorecard workbook with pandas, then convert it to polars.
df_supplier_scorecard = pd.read_excel(PATH_SUPPLIER_SCORECARD).pipe(pl.from_pandas)

## Export

In [17]:
# Write every dataset to its CSV target, in the same order as the sections above.
# All files use ";" as the field separator.
for frame, target in (
    (df_density_report, EXPORT_DENSITY_REPORT),
    (df_historical_incidents, EXPORT_HISTORICAL_INCIDENTS),
    (df_product_attributes, EXPORT_PRODUCT_ATTRIBUTES),
    (df_supplier_scorecard, EXPORT_SUPPLIER_SCORECARD),
):
    frame.write_csv(target, separator=";")