# Data Integration

This notebook is specifically designed to integrate the data into a single dataframe so that it can be used in a model.

## Libraries import

In [1]:
import pandas as pd
import polars as pl
import polars.selectors as cs


  from pandas.core import (


In [2]:
# Point the working directory at the project's csv folder so the relative
# file names used in the rest of the notebook resolve.
import os

current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

# Derive <project>/csv from the current location instead of hard-coding an
# absolute, machine-specific path. Going through the parent directory gives the
# same target on a first run (cwd = <project>/notebooks) and on a re-run of
# this cell (cwd = <project>/csv).
# NOTE(review): assumes the kernel starts in a direct sub-folder of the project
# root (e.g. notebooks/) — confirm for other launch setups.
csv_directory = os.path.join(os.path.dirname(current_directory), 'csv')
if not os.path.isdir(csv_directory):
    # Fail with an explicit message rather than a bare chdir error.
    raise FileNotFoundError(
        f"Expected the csv folder at {csv_directory}, but it does not exist"
    )

os.chdir(csv_directory)
print(f"Changed working directory to: {os.getcwd()}")

Current working directory: /Users/ulisesgordillo/Downloads/capstone/notebooks
Changed working directory to: /Users/ulisesgordillo/Downloads/capstone/csv


## Global variables

In [3]:
# Input tables (file names are resolved against the current working directory)
PATH_DENSITY_REPORT = "density_report.csv"
PATH_HISTORICAL_INCIDENTS = "historical_incidents.csv"
PATH_PRODUCT_ATTRIBUTES = "product_attributes.csv"
PATH_SUPPLIER_SCORECARD = "supplier_scorecard.csv"

# Output file for the integrated master table
EXPORT_TOTAL_DATAFRAME = "total_dataframe.csv"

## Execution

### 1. Read files

In [4]:
# Load the four source tables. Each file is read with ";" as the separator,
# in the same order as before: density report, incidents, products, scorecard.
(
    df_density_report,
    df_historical_incidents,
    df_product_attributes,
    df_supplier_scorecard,
) = (
    pl.read_csv(csv_path, separator=";")
    for csv_path in (
        PATH_DENSITY_REPORT,
        PATH_HISTORICAL_INCIDENTS,
        PATH_PRODUCT_ATTRIBUTES,
        PATH_SUPPLIER_SCORECARD,
    )
)

### 2. Join files

In [5]:
print("Step 1 & 2: Initialize Master and Join ProductAttributes...")

# The product table's "Weight" column is carried as "ProductWeight" in the master table.
df_product_attributes_renamed = df_product_attributes.rename({'Weight': 'ProductWeight'})

# Keep only the product columns needed downstream, and discard product rows
# that have no ProductReference to join on.
product_lookup = df_product_attributes_renamed.select(
    "ProductReference", "ProductName", "Size", "Collection", "ProductWeight"
)
product_lookup = product_lookup.drop_nulls(subset=['ProductReference'])

# The master table starts as a copy of the density report; the left join keeps
# every density-report row and attaches the product columns where a match exists.
df_master = df_density_report.clone()
df_master = df_master.join(product_lookup, on='ProductReference', how='left')

print(f"   Shape after joining ProductAttributes: {df_master.shape}")

Step 1 & 2: Initialize Master and Join ProductAttributes...
   Shape after joining ProductAttributes: (500000, 15)


In [6]:
print("\nStep 3: Joining SupplierScorecard...")

# DateOfReport must be a temporal column before the YYYY-MM join key can be
# derived from it. Try Date first and fall back to Datetime. If neither cast
# works, stop here: the `.dt` call below cannot run on a non-temporal column,
# so continuing would only produce a later, less informative failure.
date_cast_error = None
for temporal_dtype in (pl.Date, pl.Datetime):
    try:
        df_master = df_master.with_columns(pl.col("DateOfReport").cast(temporal_dtype))
    except Exception as cast_error:
        # Remember the failure; the name bound by `except` is cleared on exit.
        date_cast_error = cast_error
    else:
        date_cast_error = None
        break
if date_cast_error is not None:
    raise ValueError(
        f"Failed to cast DateOfReport to Date or Datetime: {date_cast_error}"
    ) from date_cast_error

# Create YYYY-MM key
df_master = df_master.with_columns(
    pl.col("DateOfReport").dt.strftime("%Y-%m").alias("ReportYearMonth")
)

# Prefix every scorecard column except the two join keys with "SC_" so the
# scorecard metrics are identifiable in the master table.
scorecard_cols = df_supplier_scorecard.columns
scorecard_cols_prefixed = {
    col: f"SC_{col}" for col in scorecard_cols if col not in ['SupplierName', 'Month']
}
df_supplier_scorecard_prefixed = df_supplier_scorecard.rename(scorecard_cols_prefixed)

# Left join on supplier and month.
# NOTE(review): assumes the scorecard's Month column is formatted as YYYY-MM,
# matching ReportYearMonth — TODO confirm against the CSV.
df_master = df_master.join(
    df_supplier_scorecard_prefixed,
    left_on=['SupplierName', 'ReportYearMonth'],
    right_on=['SupplierName', 'Month'],
    how='left'
)

# Simple null handling for scorecard columns: fill numeric nulls with 0.
# NOTE(review): rows without a matching scorecard entry also end up as 0 here,
# which cannot be told apart from a genuine value of 0 — confirm this is intended.
sc_numeric_cols = df_master.select(cs.starts_with("SC_") & (cs.numeric())).columns
if sc_numeric_cols:
    df_master = df_master.with_columns(pl.col(sc_numeric_cols).fill_null(0))
    print("   (Filled numeric SC_ column nulls with 0)")

# Drop intermediate key
df_master = df_master.drop("ReportYearMonth")
print(f"   Shape after joining SupplierScorecard: {df_master.shape}")


Step 3: Joining SupplierScorecard...
   (Filled numeric SC_ column nulls with 0)
   Shape after joining SupplierScorecard: (500000, 21)


In [7]:
print("\nStep 4: Simple Aggregation and Join of HistoricalIncidents...")

# One row per ProductReference: number of incidents and summed cost impact
# over the whole incidents table (no date restriction is applied here).
incident_count = pl.len().alias("HistInc_TotalCount_Product")
incident_cost = pl.col("CostImpact (€)").sum().fill_null(0).alias("HistInc_TotalCost_Product")

df_incidents_agg_simple = df_historical_incidents.group_by("ProductReference").agg(
    incident_count,
    incident_cost,
)
# Discard the group formed by incidents that have no ProductReference.
df_incidents_agg_simple = df_incidents_agg_simple.drop_nulls(subset=["ProductReference"])

# Attach the per-product aggregates to the master table.
df_master = df_master.join(df_incidents_agg_simple, on="ProductReference", how="left")

# Products without any recorded incident come out of the left join as null;
# represent them as 0 incidents / 0 cost.
hist_inc_cols = ["HistInc_TotalCount_Product", "HistInc_TotalCost_Product"]
zero_fills = [
    pl.col(name).fill_null(0)
    for name in hist_inc_cols
    if name in df_master.columns
]
df_master = df_master.with_columns(zero_fills)
print("   (Filled HistInc_ column nulls with 0)")

# Report the resulting table size for this step.
print(f"   Shape after joining Aggregated Incidents: {df_master.shape}")




Step 4: Simple Aggregation and Join of HistoricalIncidents...
   (Filled HistInc_ column nulls with 0)
   Shape after joining Aggregated Incidents: (500000, 23)


## Export

In [8]:
# Step 5: write the integrated master table to disk as a ";"-separated CSV.
print(f"\nStep 5: Exporting Final Master Table to {EXPORT_TOTAL_DATAFRAME}...")

try:
    df_master.write_csv(EXPORT_TOTAL_DATAFRAME, separator=";")
except Exception as export_error:
    # Best-effort export: report the problem instead of stopping the notebook.
    print(f"   Error exporting master table: {export_error}")
else:
    print(f"   Successfully exported master table with shape {df_master.shape}.")


Step 5: Exporting Final Master Table to total_dataframe.csv...
   Successfully exported master table with shape (500000, 23).
