# Data Integration

This code is specically desgined to integrate the data in a unique dataframe and be able to use it in a model

## Libraries import

In [1]:
import pandas as pd
import polars as pl

  from pandas.core import (


## Global variables

In [3]:
PATH_DENSITY_REPORT       = 'density_report.csv'
PATH_HISTORICAL_INCIDENTS = 'historical_incidents.csv'
PATH_PRODUCT_ATTRIBUTES   = 'product_attributes.csv'
PATH_SUPPLIER_SCORECARD   = 'supplier_scorecard.csv'

EXPORT_TOTAL_DATAFRAME    = 'total_dataframe.csv'

## Execution

### 1. Read files

In [4]:
# Reading csv files
df_density_report = pl.read_csv(PATH_DENSITY_REPORT, separator=";")
df_historical_incidents = pl.read_csv(PATH_HISTORICAL_INCIDENTS, separator=";")
df_product_attributes = pl.read_csv(PATH_PRODUCT_ATTRIBUTES, separator=";")
df_supplier_scorecard = pl.read_csv(PATH_SUPPLIER_SCORECARD, separator=";")

# Adjusting date values
df_density_report = df_density_report.with_columns([
    pl.col("DateOfReport").str.strptime(pl.Date, format="%Y-%m-%dT%H:%M:%S.%f")
])

df_historical_incidents = df_historical_incidents.with_columns([
    pl.col("DateOfIncident").str.strptime(pl.Date, format="%Y-%m-%dT%H:%M:%S.%f")
])



  pl.col("DateOfReport").str.strptime(pl.Date, format="%Y-%m-%dT%H:%M:%S.%f")


InvalidOperationError: conversion from `str` to `date` failed in column 'DateOfReport' for 3125 out of 3125 values: ["2023-01-30", "2023-01-14", … "2024-02-02"]

You might want to try:
- setting `strict=False` to set values that cannot be converted to `null`
- using `str.strptime`, `str.to_date`, or `str.to_datetime` and providing a format string

In [None]:
#  Read CSV files 
df_density_report = pl.read_csv(PATH_DENSITY_REPORT, separator=";")
df_historical_incidents = pl.read_csv(PATH_HISTORICAL_INCIDENTS, separator=";")
df_product_attributes = pl.read_csv(PATH_PRODUCT_ATTRIBUTES, separator=";")
df_supplier_scorecard = pl.read_csv(PATH_SUPPLIER_SCORECARD, separator=";")


# --- Adjust date values for df_density_report ---
print("Adjusting DateOfReport in df_density_report...")
df_density_report = df_density_report.with_columns(
    pl.col("DateOfReport")
      .str.strptime(pl.Date, format="%Y-%m-%d", strict=False) # Parse string directly to Date with the correct format
      .alias("DateOfReport")                                  # Overwrite column (optional)
)
print(f"New dtype: {df_density_report['DateOfReport'].dtype}")
print(df_density_report.head(2)) # Show result


# --- Adjust date values for df_historical_incidents ---
print("\nAdjusting DateOfIncident in df_historical_incidents...")
df_historical_incidents = df_historical_incidents.with_columns(
    pl.col("DateOfIncident")
      .str.strptime(pl.Date, format="%Y-%m-%d", strict=False) # Parse string directly to Date with the correct format
      .alias("DateOfIncident")                                # Overwrite column
)
print(f"New dtype: {df_historical_incidents['DateOfIncident'].dtype}")
print(df_historical_incidents.head(2)) # Show result



Adjusting DateOfReport in df_density_report...
New dtype: Date
shape: (2, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ ReportID  ┆ ProductRe ┆ DateOfRep ┆ SupplierN ┆ … ┆ ProposedU ┆ ProposedF ┆ ProposedL ┆ Packagin │
│ ---       ┆ ference   ┆ ort       ┆ ame       ┆   ┆ nitsPerCa ┆ oldingMet ┆ ayout     ┆ gQuality │
│ str       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ rton      ┆ hod       ┆ ---       ┆ ---      │
│           ┆ str       ┆ date      ┆ str       ┆   ┆ ---       ┆ ---       ┆ str       ┆ str      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆ str       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ RPT000000 ┆ PRD07271  ┆ 2024-03-0 ┆ SupplierA ┆ … ┆ 29.0      ┆ Method2   ┆ LayoutC   ┆ Good     │
│ 1         ┆           ┆ 4         ┆           ┆   ┆           ┆           ┆           ┆          │
│ RPT000000 ┆

### 2. Join files

In [7]:
# Put the same name on the date columns
df_density_report       = df_density_report.rename({"DateOfReport": "Date"})
df_historical_incidents = df_historical_incidents.rename({"DateOfIncident": "Date"})

# Drop repeated columns
df_product_attributes = df_product_attributes.drop(["GarmentType","Material"])

# Join the dataframes
df_joined = df_density_report.join(
    df_historical_incidents,
    on=["Date", "SupplierName", "ProductReference"],
    how="left"
)

df_joined = df_joined.join(
    df_product_attributes,
    on=["ProductReference"],
    how="left"
)

## Export

In [8]:
df_joined.write_csv(EXPORT_TOTAL_DATAFRAME, separator=";")