# Data Preprocessing 

### Dynamic file path updation

In [1]:
# Read location_variables.txt and extract file paths
location_file = "location_variables.txt"

# Read the file and store variables in a dictionary
with open(location_file, "r") as file:
    locations = {line.split(" = ")[0]: line.split(" = ")[1].strip().strip('"') for line in file}

# Extract relevant paths
data_filepath = locations["data_filepath"]  # Latest input data path
preprocessed_path = locations["preprocessed_path"]  # Where to save preprocessed data

print("Using Data Path:", data_filepath)
print("Saving Preprocessed Data To:", preprocessed_path)


Using Data Path: ./Input_Data/04. April 2025/10_04_2025/2025_04_10_13_03_07 input data/
Saving Preprocessed Data To: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/


#### Load Datasets

In [2]:
# Step 2: Define Required Columns & Load Data
import pandas as pd
import os
required_columns = {
    "input_Actual_production.csv": ["plant_sk", "plant_code", "material_sk", "material_code", "actual_quantity"],
    "input_Inventory_policy.csv": ["material_sk", "plant_sk", "plant_code", "material_code", "target_opening_stock", "maximum_stock"],
    "input_lcp_data.csv": ["material_sk", "lcp_rank", "material_code", "origin_location_code", "destination_location_code","release_week_id","ins_gmt_ts","origin_plant_sk","destination_plant_sk"],
    "input_Planned_loads.csv": ["delivery_sk", "actual_quantity", "unit_of_measure_code", "plant_code",
                                "transportation_planning_priority", "material_sk", "material_code", "load_id", 
                                "origin_slot_arrival", "origin_slot_departure", "destination_slot_arrival", 
                                "destination_slot_departure", "RFRC_NUM12", "origin", "destination", "movement_type"],
    "input_Planned_production.csv": ["duration", "end_outflow_ts", "material_sk", "material_code", "plant_sk", 
                                     "plant_code", "product_unit", "quantity", "start_inflow_ts", "status"],
    "input_stock.csv": ["material_code", "material_sk", "storage_location_code", "plant_sk", "plant_code", "valuated_stock_hl","shelf_life_expiration_ts"],
    "input_UOM_full.csv": ["material_code", "material_sk", "target_unit_of_measure_code", "conversion_numerator", "conversion_denumerator"],
    "input_UOM_weight.csv": ["material_code", "pal_weight_kg"]
}

# Load datasets
dataframes = {}
for file, columns in required_columns.items():
    file_path = os.path.join(data_filepath, file)
    
    if os.path.exists(file_path):
        dataframes[file] = pd.read_csv(file_path, usecols=columns, dtype=str, low_memory=False)
        print(f"Loaded {file} - Shape: {dataframes[file].shape}")
    else:
        print(f"Warning: {file} not found!")

# Display sample output
for file, df in dataframes.items():
    print(f"\n{file} - First 3 rows:")
    display(df.head(3))


Loaded input_Actual_production.csv - Shape: (66, 5)
Loaded input_Inventory_policy.csv - Shape: (5277, 6)


Loaded input_lcp_data.csv - Shape: (80898, 9)


Loaded input_Planned_loads.csv - Shape: (2890, 16)
Loaded input_Planned_production.csv - Shape: (637, 10)


Loaded input_stock.csv - Shape: (69032, 7)


Loaded input_UOM_full.csv - Shape: (126813, 5)
Loaded input_UOM_weight.csv - Shape: (12684, 2)

input_Actual_production.csv - First 3 rows:


Unnamed: 0,plant_sk,plant_code,material_sk,material_code,actual_quantity
0,573,DE05,1814,5687,5840.0
1,16369,CI01,16135682,9950043,8400.0
2,1692,DE08,2364,6085,5520.0



input_Inventory_policy.csv - First 3 rows:


Unnamed: 0,material_sk,plant_sk,plant_code,material_code,target_opening_stock,maximum_stock
0,1876258,1941,NL03,107494,300.0,810.210226
1,3480,2566,DE02,6365,47.968987,240.031882
2,59933,2641,NL08,94885,0.414761,1.121469



input_lcp_data.csv - First 3 rows:


Unnamed: 0,material_sk,origin_plant_sk,destination_plant_sk,lcp_rank,release_week_id,material_code,origin_location_code,destination_location_code,ins_gmt_ts
0,3769,573,2566,2,202451,11524,DE05,DE02,20241227124946
1,3480,573,520,1,202451,6365,DE05,DE12,20241227124946
2,1899,573,4766,1,202451,6590,DE05,DE06,20241227124946



input_Planned_loads.csv - First 3 rows:


Unnamed: 0,delivery_sk,material_sk,actual_quantity,unit_of_measure_code,plant_code,transportation_planning_priority,material_code,load_id,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,RFRC_NUM12,origin,destination,movement_type
0,81728721,1977870,4680.0,PC,GB01,1,99150,34718714,2025-04-10 08:00:00,2025-04-10 08:30:00,,,21820530,GB01,29585614,SO
1,81649724,1869080,26.0,PC,GB67,1,108741,34714811,2025-04-10 16:00:00,2025-04-10 17:00:00,,,21806479,GB67,29586301,SO
2,81697215,1978473,450.0,PC,GB02,1,99148,34716589,2025-04-10 15:00:00,2025-04-10 15:30:00,,,21816790,GB02,29586299,SO



input_Planned_production.csv - First 3 rows:


Unnamed: 0,duration,end_outflow_ts,material_sk,material_code,plant_sk,plant_code,product_unit,quantity,start_inflow_ts,status
0,600.0,2025-04-10 09:00:00,49997,62275,482,GB01,PCE,39480.0,2025-04-09 23:00:00,processorder
1,116.0,2025-04-11 02:17:00,16116094,99787_RTL,85,BE02,HLT,580.0,2025-04-11 00:21:00,NO STATUS
2,1407.0,2025-04-12 05:27:00,1895487,104115,2566,DE02,PCE,30500.0,2025-04-11 06:00:00,processorder



input_stock.csv - First 3 rows:


Unnamed: 0,material_code,material_sk,storage_location_code,plant_sk,plant_code,valuated_stock_hl,shelf_life_expiration_ts
0,66638,41950,4014,13688,IT12,9.0,2025-04-05 00:00:00
1,7587279,130491,1010,133,BE06,1.42,1900-01-01 00:00:00
2,66638,41950,4015,13688,IT12,21.6,2025-04-05 00:00:00



input_UOM_full.csv - First 3 rows:


Unnamed: 0,material_code,material_sk,target_unit_of_measure_code,conversion_numerator,conversion_denumerator
0,185,92,PAL,70.0,1.0
1,80,10,PAL,8.0,1.0
2,26,6,PAL,70.0,1.0



input_UOM_weight.csv - First 3 rows:


Unnamed: 0,material_code,pal_weight_kg
0,100298,1005.76
1,100584,936.0
2,100787,928.0


#### Cleaning column names and standardization

In [3]:
# Step 3: Data Cleaning & Standardization

# Function to clean column names & values
def clean_dataframe(df):
    df.columns = df.columns.str.strip()  # Remove leading/trailing spaces in column names
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Strip spaces in string values
    return df

# Apply cleaning to all datasets
for file, df in dataframes.items():
    dataframes[file] = clean_dataframe(df)
    print(f"Cleaned {file}")

# Verify updates
for file, df in dataframes.items():
    print(f"\n{file} - Sample Data After Cleaning:")
    display(df.head(3))


Cleaned input_Actual_production.csv
Cleaned input_Inventory_policy.csv


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Strip spaces in string values
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Strip spaces in string values
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Strip spaces in string values


Cleaned input_lcp_data.csv
Cleaned input_Planned_loads.csv
Cleaned input_Planned_production.csv


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Strip spaces in string values
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Strip spaces in string values
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Strip spaces in string values


Cleaned input_stock.csv


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Strip spaces in string values


Cleaned input_UOM_full.csv
Cleaned input_UOM_weight.csv

input_Actual_production.csv - Sample Data After Cleaning:


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Strip spaces in string values


Unnamed: 0,plant_sk,plant_code,material_sk,material_code,actual_quantity
0,573,DE05,1814,5687,5840.0
1,16369,CI01,16135682,9950043,8400.0
2,1692,DE08,2364,6085,5520.0



input_Inventory_policy.csv - Sample Data After Cleaning:


Unnamed: 0,material_sk,plant_sk,plant_code,material_code,target_opening_stock,maximum_stock
0,1876258,1941,NL03,107494,300.0,810.210226
1,3480,2566,DE02,6365,47.968987,240.031882
2,59933,2641,NL08,94885,0.414761,1.121469



input_lcp_data.csv - Sample Data After Cleaning:


Unnamed: 0,material_sk,origin_plant_sk,destination_plant_sk,lcp_rank,release_week_id,material_code,origin_location_code,destination_location_code,ins_gmt_ts
0,3769,573,2566,2,202451,11524,DE05,DE02,20241227124946
1,3480,573,520,1,202451,6365,DE05,DE12,20241227124946
2,1899,573,4766,1,202451,6590,DE05,DE06,20241227124946



input_Planned_loads.csv - Sample Data After Cleaning:


Unnamed: 0,delivery_sk,material_sk,actual_quantity,unit_of_measure_code,plant_code,transportation_planning_priority,material_code,load_id,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,RFRC_NUM12,origin,destination,movement_type
0,81728721,1977870,4680.0,PC,GB01,1,99150,34718714,2025-04-10 08:00:00,2025-04-10 08:30:00,,,21820530,GB01,29585614,SO
1,81649724,1869080,26.0,PC,GB67,1,108741,34714811,2025-04-10 16:00:00,2025-04-10 17:00:00,,,21806479,GB67,29586301,SO
2,81697215,1978473,450.0,PC,GB02,1,99148,34716589,2025-04-10 15:00:00,2025-04-10 15:30:00,,,21816790,GB02,29586299,SO



input_Planned_production.csv - Sample Data After Cleaning:


Unnamed: 0,duration,end_outflow_ts,material_sk,material_code,plant_sk,plant_code,product_unit,quantity,start_inflow_ts,status
0,600.0,2025-04-10 09:00:00,49997,62275,482,GB01,PCE,39480.0,2025-04-09 23:00:00,processorder
1,116.0,2025-04-11 02:17:00,16116094,99787_RTL,85,BE02,HLT,580.0,2025-04-11 00:21:00,NO STATUS
2,1407.0,2025-04-12 05:27:00,1895487,104115,2566,DE02,PCE,30500.0,2025-04-11 06:00:00,processorder



input_stock.csv - Sample Data After Cleaning:


Unnamed: 0,material_code,material_sk,storage_location_code,plant_sk,plant_code,valuated_stock_hl,shelf_life_expiration_ts
0,66638,41950,4014,13688,IT12,9.0,2025-04-05 00:00:00
1,7587279,130491,1010,133,BE06,1.42,1900-01-01 00:00:00
2,66638,41950,4015,13688,IT12,21.6,2025-04-05 00:00:00



input_UOM_full.csv - Sample Data After Cleaning:


Unnamed: 0,material_code,material_sk,target_unit_of_measure_code,conversion_numerator,conversion_denumerator
0,185,92,PAL,70.0,1.0
1,80,10,PAL,8.0,1.0
2,26,6,PAL,70.0,1.0



input_UOM_weight.csv - Sample Data After Cleaning:


Unnamed: 0,material_code,pal_weight_kg
0,100298,1005.76
1,100584,936.0
2,100787,928.0


In [4]:
# Identify Missing Values in Each Dataset

print("Checking Missing Values in Important Columns...\n")

missing_summary = {}

for file, df in dataframes.items():
    missing_count = df.isnull().sum()  # Count missing values
    missing_percentage = (missing_count / len(df)) * 100 # Convert to percentage
    missing_percentage = missing_percentage.apply(lambda x: f"{x:.2f}%")
    
    # Filter columns with missing values
    missing_data = pd.DataFrame({'Missing_Count': missing_count, 'Missing_Percentage': missing_percentage})
    missing_data = missing_data[missing_data['Missing_Count'] > 0]
    
    if not missing_data.empty:
        missing_summary[file] = missing_data
        print(f"\n{file} - Missing Values Found:")
        display(missing_data)
    else:
        print(f"{file} - No Missing Values")

# Store summary for reference
missing_summary


Checking Missing Values in Important Columns...

input_Actual_production.csv - No Missing Values
input_Inventory_policy.csv - No Missing Values


input_lcp_data.csv - No Missing Values

input_Planned_loads.csv - Missing Values Found:


Unnamed: 0,Missing_Count,Missing_Percentage
destination_slot_arrival,2679,92.70%
destination_slot_departure,2679,92.70%
RFRC_NUM12,185,6.40%


input_Planned_production.csv - No Missing Values

input_stock.csv - Missing Values Found:


Unnamed: 0,Missing_Count,Missing_Percentage
shelf_life_expiration_ts,24431,35.39%


input_UOM_full.csv - No Missing Values
input_UOM_weight.csv - No Missing Values


{'input_Planned_loads.csv':                             Missing_Count Missing_Percentage
 destination_slot_arrival             2679             92.70%
 destination_slot_departure           2679             92.70%
 RFRC_NUM12                            185              6.40%,
 'input_stock.csv':                           Missing_Count Missing_Percentage
 shelf_life_expiration_ts          24431             35.39%}

### Input Stock Preprocessing

In [5]:
# Load input_stock.csv and Select Required Columns

# Define the dataset name
file_name = "input_stock.csv"
df_stock = dataframes[file_name]
required_columns_stock = ["material_code", "material_sk", "storage_location_code", "plant_sk", "plant_code", "valuated_stock_hl","shelf_life_expiration_ts"]
df_stock = df_stock[required_columns_stock]

# Store updated DataFrame
dataframes[file_name] = df_stock

# Display confirmation
print(f"Loaded and retained required columns for {file_name}:")
print(list(df_stock.columns))


Loaded and retained required columns for input_stock.csv:


['material_code', 'material_sk', 'storage_location_code', 'plant_sk', 'plant_code', 'valuated_stock_hl', 'shelf_life_expiration_ts']


#### Data Type Validation

In [6]:
# Detect Data Type Mismatches in input_stock.csv

# Define expected data types
expected_dtypes = {
    "material_code": "Int64",
    "material_sk": "Int64",
    "plant_sk": "Int64",
    "plant_code": "object",
    "valuated_stock_hl": "float64",
    "shelf_life_expiration_ts": "datetime64[ns]"
}

# Convert 'shelf_life_expiration_ts' to datetime format
df_stock["shelf_life_expiration_ts"] = pd.to_datetime(df_stock["shelf_life_expiration_ts"], errors='coerce')

# Check actual dtypes
dtype_mismatches = {}

for col, expected_dtype in expected_dtypes.items():
    actual_dtype = df_stock[col].dtype
    if actual_dtype != expected_dtype:
        dtype_mismatches[col] = (actual_dtype, expected_dtype)

# Display results
if dtype_mismatches:
    print("Data Type Mismatches Found:")
    for col, (actual, expected) in dtype_mismatches.items():
        print(f"   - Column: {col}, Found: {actual}, Expected: {expected}")
else:
    print("All Columns Have Correct Data Types.")


Data Type Mismatches Found:
   - Column: material_code, Found: object, Expected: Int64
   - Column: material_sk, Found: object, Expected: Int64
   - Column: plant_sk, Found: object, Expected: Int64
   - Column: valuated_stock_hl, Found: object, Expected: float64


#### Data type mismatches are handled

In [7]:
# Handle Data Type Mismatches in input_stock.csv

if dtype_mismatches:
    # Convert columns to correct types
    for col, (_, expected_dtype) in dtype_mismatches.items():
        df_stock[col] = df_stock[col].astype(expected_dtype, errors="ignore")
    
    print("Data Types Fixed.")

# Store updated DataFrame
dataframes[file_name] = df_stock


Data Types Fixed.


#### Filtering based on Expiration date

In [8]:
from datetime import datetime

# Step: Detect Expired Stock Before Filtering
print("Detecting Expired Stock in input_stock.csv")

# Convert expiration date column to datetime format
df_stock['shelf_life_expiration_ts'] = pd.to_datetime(df_stock['shelf_life_expiration_ts'], errors='coerce')

# Get the current run date (assuming today)
run_date = datetime.today()

# Count expired stock
expired_stock_count = (df_stock['shelf_life_expiration_ts'] < run_date).sum()
expired_stock_percentage = (expired_stock_count / len(df_stock)) * 100

print(f"Expired Stock Found: {expired_stock_count} rows ({expired_stock_percentage:.2f}%)")


Detecting Expired Stock in input_stock.csv
Expired Stock Found: 15068 rows (21.83%)


In [9]:
from datetime import datetime

# Convert expiration date column to datetime format
df_stock['shelf_life_expiration_ts'] = pd.to_datetime(df_stock['shelf_life_expiration_ts'], errors='coerce')

# Get the current run date (assuming today)
run_date = datetime.today()

# Filter out expired stock and update df_stock directly
df_stock = df_stock[df_stock['shelf_life_expiration_ts'] >= run_date]

print(f"After filtering non-expired stock: {len(df_stock)} rows kept.")


After filtering non-expired stock: 28087 rows kept.


#### Missing Values

In [10]:
# Detect Missing Values in input_stock.csv

# Check for missing values
missing_count = df_stock.isnull().sum()
missing_percentage = (missing_count / len(df_stock)) * 100  
missing_data = pd.DataFrame({'Missing_Count': missing_count, 'Missing_Percentage': missing_percentage})
missing_data = missing_data[missing_data['Missing_Count'] > 0]

# Display results
if not missing_data.empty:
    print(f"\nMissing Values Found in {file_name}:\n")
    missing_data["Missing_Percentage"] = missing_data["Missing_Percentage"].apply(lambda x: f"{x:.2f}%")
    
    display(missing_data)
else:
    print(f"\nNo Missing Values Found in {file_name}.")



No Missing Values Found in input_stock.csv.




#### Duplicates

In [11]:
# Detect Duplicates in input_stock.csv

# Check for duplicate rows
duplicate_count = df_stock.duplicated().sum()
duplicate_percentage = (duplicate_count / len(df_stock)) * 100  

# Display results
print(f"Duplicate Rows in {file_name}: {duplicate_count} ({duplicate_percentage:.2f}%)")


Duplicate Rows in input_stock.csv: 551 (1.96%)


#### Duplicates grouping Material_sk and Plant_sk

In [12]:
# Detect Duplicates Based on (plant_sk, material_sk)
duplicate_stock = df_stock.duplicated(subset=['plant_sk', 'material_sk'], keep=False)

# Count duplicate rows
duplicate_count = duplicate_stock.sum()
duplicate_percentage = (duplicate_count / len(df_stock)) * 100

print(f"Duplicate (plant_sk, material_sk) pairs found in input_stock.csv: {duplicate_count} ({duplicate_percentage:.2f}%)")

# Display duplicate records if found
if duplicate_count > 0:
    display(df_stock[duplicate_stock].sort_values(by=['plant_sk', 'material_sk']))
else:
    print("No duplicate (plant_sk, material_sk) pairs found.")


Duplicate (plant_sk, material_sk) pairs found in input_stock.csv: 20211 (71.96%)


Unnamed: 0,material_code,material_sk,storage_location_code,plant_sk,plant_code,valuated_stock_hl,shelf_life_expiration_ts
16626,3410,745,1000,15,AT02,21.6000,2026-02-10
18261,3410,745,1000,15,AT02,24.0000,2026-03-05
33333,3410,745,4003,15,AT02,38.4000,2025-04-17
68220,3410,745,4000,15,AT02,0.3000,2025-12-04
2962,3493,1667,1000,15,AT02,68.0000,2026-03-31
...,...,...,...,...,...,...,...
68891,99154,1978110,1010,16202,IE06,28.5120,2026-01-31
65759,99661,2043526,1010,16202,IE06,5.9400,2026-06-30
65770,99661,2043526,1010,16202,IE06,5.9400,2026-06-30
68758,99662,2043586,1010,16202,IE06,5.7024,2026-09-30


##### These duplicates caused by differnt valuated_stock_hl

In [13]:
# Analyze Duplicate Stock Entries

# Group by plant_sk and material_sk to check stock variations
stock_variation = df_stock[df_stock.duplicated(subset=['plant_sk', 'material_sk'], keep=False)] \
    .groupby(['plant_sk', 'material_sk'])['valuated_stock_hl'].nunique().reset_index()

# Count cases where the stock values differ
stock_value_mismatch = stock_variation[stock_variation['valuated_stock_hl'] > 1]

# Display summary
print(f"Stock value differences found in {len(stock_value_mismatch)} (plant_sk, material_sk) groups.")

# Show a few mismatches for analysis
display(stock_value_mismatch.head(10))


Stock value differences found in 2778 (plant_sk, material_sk) groups.




Unnamed: 0,plant_sk,material_sk,valuated_stock_hl
0,15,745,4
1,15,1667,16
2,15,1797,2
3,15,2130,3
5,15,3448,5
6,15,4802,2
7,15,4977,4
8,15,5091,9
10,15,7264,3
11,15,9937,3


#### Handling these duplicates by aggregating based on code and sk and saving to Stock.csv

In [14]:
import os

# Group by relevant keys and sum opening stock
df_opening_stock = df_stock.groupby(
    ["material_code", "material_sk", "plant_sk", "plant_code"], as_index=False
)["valuated_stock_hl"].sum()

# Rename column for clarity
df_opening_stock.rename(columns={"valuated_stock_hl": "Opening_Stock_hl"}, inplace=True)

# Define the path to save
stock_file_path = os.path.join(preprocessed_path, "stock.csv")

# Save to CSV
df_opening_stock.to_csv(stock_file_path, index=False)

print(f"Opening Stock aggregated and saved to {stock_file_path}")


Opening Stock aggregated and saved to ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/stock.csv


In [15]:
df_opening_stock.describe()

Unnamed: 0,material_code,material_sk,plant_sk,Opening_Stock_hl
count,10692.0,10692.0,10692.0,10692.0
mean,8277140.708941,1093245.112888,2689.199682,316.040139
std,70032311.485315,2874738.748599,3578.828733,1796.564605
min,3337.0,685.0,15.0,0.0035
25%,75952.25,47934.75,1002.0,1.305
50%,105193.0,87469.0,1810.0,7.44
75%,7545058.0,979021.0,2641.0,68.400607
max,999009541.0,17356819.0,16202.0,79400.816


#### Aggregation Verification

In [16]:
# Verify aggregation consistency
check_aggregation = df_stock.groupby(["material_code", "material_sk", "plant_sk", "plant_code"])["valuated_stock_hl"].sum()

# Load the saved stock.csv
df_saved_stock = pd.read_csv(stock_file_path)

# Compare summed values
mismatch = (df_saved_stock.set_index(["material_code", "material_sk", "plant_sk", "plant_code"])["Opening_Stock_hl"] - check_aggregation).abs().sum()

if mismatch == 0:
    print("Aggregated stock values match perfectly with stock.csv!")
else:
    print(f"Mismatch detected in summed values: {mismatch}")


Mismatch detected in summed values: 2.798805631698542e-11


#### Stock at Risk based on expiration date

In [17]:
from datetime import datetime, timedelta

# Define stock-at-risk threshold (Run Date + 60 Days)
stock_risk_threshold = run_date + timedelta(days=60)

# Detect stock at risk
at_risk_stock = df_stock[df_stock['shelf_life_expiration_ts'] < stock_risk_threshold]

# Calculate percentage
at_risk_count = len(at_risk_stock)
at_risk_percentage = (at_risk_count / len(df_stock)) * 100

print(f"Stock at Risk Found: {at_risk_count} rows ({at_risk_percentage:.2f}%)")


Stock at Risk Found: 1416 rows (5.04%)


##### Even though we filtered out based on expiration date, still there are stock which are about to expire in next 60 days 

#### Storing stock at risk to obs_df.csv

In [18]:
import os

# Aggregate stock at risk
df_obs_stock = at_risk_stock.groupby(
    ["material_code", "plant_code", "material_sk", "plant_sk"], as_index=False
)["valuated_stock_hl"].sum()

# Rename column for clarity
df_obs_stock.rename(columns={"valuated_stock_hl": "Stock_At_Risk_hl"}, inplace=True)

# Define file path & save
obs_file_path = os.path.join(preprocessed_path, "obs_df.csv")
df_obs_stock.to_csv(obs_file_path, index=False)

print(f"Obsolescence Data aggregated and saved to {obs_file_path}")


Obsolescence Data aggregated and saved to ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/obs_df.csv




### Input Inventory Policy Preprocessing 

In [19]:
#loading the file 
file_name = "input_Inventory_policy.csv"
df_inventory_policy = dataframes[file_name]

# Important columns
required_columns_inventory = ["plant_sk","material_sk", "plant_code", "material_code","target_opening_stock","maximum_stock"]
df_inventory_policy = df_inventory_policy[required_columns_inventory]
dataframes[file_name] = df_inventory_policy

# display results 
display(df_inventory_policy.head(10))
print(list(df_inventory_policy.columns))

Unnamed: 0,plant_sk,material_sk,plant_code,material_code,target_opening_stock,maximum_stock
0,1941,1876258,NL03,107494,300.0,810.210226
1,2566,3480,DE02,6365,47.968987,240.031882
2,2641,59933,NL08,94885,0.414761,1.121469
3,1975,3649,BE07,11385,419.86467,1533.959265
4,1130,4764,NL17,17495,0.700671,3.35605
5,573,64192,DE05,83748,1.7985,2.644726
6,1941,6437,NL03,11606,1621.291513,8143.826021
7,3712,813,LU05,3372,1.012,3.462858
8,133,16450,BE06,34808,2.2248,2.675128
9,1332,52369,BE11,91708,0.0,10.105715


['plant_sk', 'material_sk', 'plant_code', 'material_code', 'target_opening_stock', 'maximum_stock']


#### Missing values

In [20]:
# missing values in inventory policy
missing_count = df_inventory_policy.isnull().sum()
missing_percentage = (missing_count/len(df_inventory_policy))*100

# displaying the results
missing_data = pd.DataFrame({"missing_count":missing_count, "missing_percentage":missing_percentage})
missing_data = missing_data[missing_data['missing_count'] > 0]

if not missing_data.empty:
    print(f"\nMissing Values Found in {file_name}:\n")
    
    missing_data["Missing_Percentage"] = missing_data["Missing_Percentage"].apply(lambda x: f"{x:.2f}%")
    
    display(missing_data)
else:
    print(f"\nNo Missing Values Found in {file_name}")


No Missing Values Found in input_Inventory_policy.csv


#### Duplicates

In [21]:
# Step 3: Detect Duplicates in input_Inventory_policy.csv

# Identify exact duplicate rows
duplicate_count = df_inventory_policy.duplicated().sum()
duplicate_percentage = (duplicate_count / len(df_inventory_policy)) * 100  

print(f"Duplicate Rows in {file_name}: {duplicate_count} ({duplicate_percentage:.2f}%)")

# Display duplicate records if found
if duplicate_count > 0:
    display(df_inventory_policy[df_inventory_policy.duplicated(keep=False)].sort_values(by=df_inventory_policy.columns.tolist()))
else:
    print("No duplicate rows found.")


Duplicate Rows in input_Inventory_policy.csv: 0 (0.00%)
No duplicate rows found.


#### Data Type Validation

In [22]:
# Data Type Validation for input_Inventory_policy.csv

# Define expected data types
expected_dtypes = {
    "plant_sk": "Int64",
    "material_sk": "Int64",
    "plant_code": "object",
    "material_code": "Int64",
    "target_opening_stock": "float64",
    "maximum_stock": "float64"
}

# Check actual dtypes
dtype_mismatches = {}

for col, expected_dtype in expected_dtypes.items():
    actual_dtype = df_inventory_policy[col].dtype
    if actual_dtype != expected_dtype:
        dtype_mismatches[col] = (actual_dtype, expected_dtype)

# Display results
if dtype_mismatches:
    print("Data Type Mismatches Found:")
    for col, (actual, expected) in dtype_mismatches.items():
        print(f"   - Column: {col}, Found: {actual}, Expected: {expected}")

    # Convert to correct types
    df_inventory_policy = df_inventory_policy.astype(expected_dtypes)
    print("Data Types Fixed.")
else:
    print("All Columns Have Correct Data Types.")

# Store updated DataFrame
dataframes["input_Inventory_policy.csv"] = df_inventory_policy


Data Type Mismatches Found:
   - Column: plant_sk, Found: object, Expected: Int64
   - Column: material_sk, Found: object, Expected: Int64
   - Column: material_code, Found: object, Expected: Int64
   - Column: target_opening_stock, Found: object, Expected: float64
   - Column: maximum_stock, Found: object, Expected: float64
Data Types Fixed.


#### Non Negative 

In [23]:
# Detect Negative Values in input_Inventory_policy.csv

# Check for negative values in relevant columns
negative_values = df_inventory_policy[
    (df_inventory_policy["target_opening_stock"] <= 0) |
    (df_inventory_policy["maximum_stock"] <= 0)
]

# Display results
if not negative_values.empty:
    negative_count = len(negative_values)
    negative_percentage = (negative_count / len(df_inventory_policy)) * 100
    print(f"Found {negative_count} ({negative_percentage:.2f}%) rows with negative values in target_opening_stock or maximum_stock.")
    
    # Show sample
    display(negative_values.head(10))
else:
    print("No Negative Values Found in target_opening_stock or maximum_stock.")


Found 1525 (28.90%) rows with negative values in target_opening_stock or maximum_stock.


Unnamed: 0,plant_sk,material_sk,plant_code,material_code,target_opening_stock,maximum_stock
9,1332,52369,BE11,91708,0.0,10.105715
13,2566,1296,DE02,3452,0.0,7814.1822
14,62,27955,BE22,48938,0.0,185.351397
15,2566,64393,DE02,92984,0.0,83.2139
23,85,2272,BE02,10093,0.0,233.233796
29,142,1547,BE04,3411,0.0,997.995993
30,2857,7751,NL02,18694,0.0,1741.505527
35,573,48958,DE05,91339,1084.3894,0.0
38,4766,12210,DE06,21410,0.0,12.792444
39,3262,49567,NL14,94864,0.0,2.891247


#### Remove rows where maximum_stock is <= 0

In [24]:
# Remove rows where maximum_stock is 0
initial_count = len(df_inventory_policy)
df_inventory_policy = df_inventory_policy[df_inventory_policy["maximum_stock"] > 0]

# Count and calculate percentage of removed rows
removed_rows = initial_count - len(df_inventory_policy)
removed_percentage = (removed_rows / initial_count) * 100

# Display results
print(f"Removed {removed_rows} rows ({removed_percentage:.2f}%) where maximum_stock was 0.")
print(f"Rows remaining after removal: {len(df_inventory_policy)}")


Removed 265 rows (5.02%) where maximum_stock was 0.
Rows remaining after removal: 5012


#### Stock constraint target_opening_stock < maximum_stock

In [25]:
# Detect where maximum_stock is less than target_opening_stock
invalid_stock = df_inventory_policy[df_inventory_policy["maximum_stock"] < df_inventory_policy["target_opening_stock"]]

# Count and calculate percentage
invalid_count = len(invalid_stock)
invalid_percentage = (invalid_count / len(df_inventory_policy)) * 100

# Display results
if invalid_count > 0:
    print(f"Found {invalid_count} rows ({invalid_percentage:.2f}%) where maximum_stock is less than target_opening_stock.")
    display(invalid_stock)
else:
    print("All rows satisfy the constraint: maximum_stock >= target_opening_stock.")


Found 8 rows (0.16%) where maximum_stock is less than target_opening_stock.


Unnamed: 0,plant_sk,material_sk,plant_code,material_code,target_opening_stock,maximum_stock
256,753,1889576,GB67,107725,9.871,7.127129
482,3262,59933,NL14,94885,1.505186,1.046799
2140,16105,36225,FR11,65716,112.116659,42.110773
3740,13670,36225,FR10,65716,61.443176,14.936489
4267,1941,30431,NL03,58458,17.73564,14.857732
4324,2373,1878588,IT11,102960,20.5004,18.357442
4424,4072,40562,BE37,59873,106.37892,87.659894
4781,753,1874873,GB67,104192,87.0389,73.893857


#### Filtering invalid rows based on stock constraint

In [26]:
# Remove rows violating the constraint
initial_count = len(df_inventory_policy)
df_inventory_policy = df_inventory_policy[df_inventory_policy["maximum_stock"] >= df_inventory_policy["target_opening_stock"]]

# Count and calculate percentage of removed rows
removed_rows = initial_count - len(df_inventory_policy)
removed_percentage = (removed_rows / initial_count) * 100

# Display results
print(f"Removed {removed_rows} rows ({removed_percentage:.2f}%) where maximum_stock was less than target_opening_stock.")
print(f"Final Rows Remaining: {len(df_inventory_policy)}")


Removed 8 rows (0.16%) where maximum_stock was less than target_opening_stock.
Final Rows Remaining: 5004


#### Filtering rows based on specific plant codes 

In [27]:
# Define GB plant codes to retain
gb_plants = {"GB01", "GB02", "GB06", "GB28", "GB43", "GB50", "GB54", "GB67", "GB80", "GB81", "GB71"}

# Count before filtering
initial_count = len(df_inventory_policy)

# Apply filter
df_inventory_policy = df_inventory_policy[
    (df_inventory_policy["plant_code"].isin(gb_plants)) | 
    (~df_inventory_policy["plant_code"].str.startswith("GB"))
]

# Count and calculate percentage of removed rows
removed_rows = initial_count - len(df_inventory_policy)
removed_percentage = (removed_rows / initial_count) * 100

# Display results
print(f"Removed {removed_rows} rows ({removed_percentage:.2f}%) that did not match the GB plant filtering rule.")
print(f"Remaining Rows after filtering: {len(df_inventory_policy)}")


Removed 2 rows (0.04%) that did not match the GB plant filtering rule.
Remaining Rows after filtering: 5002


#### Final saving of preprocessed data to inventory_policy.csv

In [28]:
# Define the path to save
inventory_policy_path = os.path.join(preprocessed_path, "inventory_policy.csv")

# Save to CSV
df_inventory_policy.to_csv(inventory_policy_path, index=False)

print(f"Inventory Policy preprocessed and saved to {inventory_policy_path}")


Inventory Policy preprocessed and saved to ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/inventory_policy.csv


### Input LCP Data Preprocessing

In [29]:
#loading the file 
file_name = "input_lcp_data.csv"
df_lcp_data = dataframes[file_name]

# Important columns
required_columns_lcp = ["material_sk", "lcp_rank", "material_code", "origin_location_code", "destination_location_code","release_week_id","ins_gmt_ts","origin_plant_sk","destination_plant_sk"]
df_lcp_data = df_lcp_data[required_columns_lcp]
dataframes[file_name] = df_lcp_data

# display results 
display(df_lcp_data.head(10))
print(list(df_lcp_data.columns))

Unnamed: 0,material_sk,lcp_rank,material_code,origin_location_code,destination_location_code,release_week_id,ins_gmt_ts,origin_plant_sk,destination_plant_sk
0,3769,2,11524,DE05,DE02,202451,20241227124946,573,2566
1,3480,1,6365,DE05,DE12,202451,20241227124946,573,520
2,1899,1,6590,DE05,DE06,202451,20241227124946,573,4766
3,2241,1,5733,DE05,DE02,202451,20241227124946,573,2566
4,3448,1,6490,DE46,DE06,202451,20241227124946,732,4766
5,1286,2,3337,BE50,BE22,202451,20241227124946,50,62
6,1884,1,5767,DE13,DE02,202451,20241227124946,1402,2566
7,813,3,3372,BE50,IT11,202451,20241227124946,50,2373
8,1233,1,3650,NL03,NL15,202451,20241227124946,1941,1002
9,2110,2,5716,DE05,DE12,202451,20241227124946,573,520


['material_sk', 'lcp_rank', 'material_code', 'origin_location_code', 'destination_location_code', 'release_week_id', 'ins_gmt_ts', 'origin_plant_sk', 'destination_plant_sk']


#### Missing Values

In [30]:
# Checking for missing values in important columns
#required_columns_lcp = ["material_sk", "lcp_rank", "material_code", "origin_location_code", "destination_location_code"]

missing_values = df_lcp_data[required_columns_lcp].isnull().sum()

print("### Missing Values Check ###")
print(missing_values)

# Display rows with missing values if any
missing_rows = df_lcp_data[df_lcp_data[required_columns_lcp].isnull().any(axis=1)]
if not missing_rows.empty:
    print("\nRows with missing values:")
    display(missing_rows)
else:
    print("\nNo missing values found in important columns.")


### Missing Values Check ###
material_sk                  0
lcp_rank                     0
material_code                0
origin_location_code         0
destination_location_code    0
release_week_id              0
ins_gmt_ts                   0
origin_plant_sk              0
destination_plant_sk         0
dtype: int64

No missing values found in important columns.


#### Data Type Validation

In [31]:
# Checking if each column has the expected data type
expected_dtypes = {
    "material_sk": "int64",
    "lcp_rank": "int64",
    "material_code": "int64",
    "origin_location_code": "object",
    "destination_location_code": "object",
    "origin_plant_sk" : "int64",
    "destination_plant_sk" : "int64"
}

# Identifying mismatched data types
mismatched_types = {
    col: df_lcp_data[col].dtype
    for col in expected_dtypes
    if df_lcp_data[col].dtype != expected_dtypes[col]
}

if mismatched_types:
    print("Columns with unexpected data types:")
    for col, dtype in mismatched_types.items():
        print(f"- {col}: Expected {expected_dtypes[col]}, Found {dtype}")
else:
    print("All important columns have the correct data types.")



Columns with unexpected data types:
- material_sk: Expected int64, Found object
- lcp_rank: Expected int64, Found object
- material_code: Expected int64, Found object
- origin_plant_sk: Expected int64, Found object
- destination_plant_sk: Expected int64, Found object


#### Handling data mismatches

In [32]:
# Convert incorrect data types
df_lcp_data["material_sk"] = pd.to_numeric(df_lcp_data["material_sk"], errors="coerce")
df_lcp_data["lcp_rank"] = pd.to_numeric(df_lcp_data["lcp_rank"], errors="coerce")
df_lcp_data["material_code"] = pd.to_numeric(df_lcp_data["material_code"], errors="coerce")
df_lcp_data["origin_plant_sk"] = pd.to_numeric(df_lcp_data["origin_plant_sk"], errors="coerce")
df_lcp_data["destination_plant_sk"] = pd.to_numeric(df_lcp_data["destination_plant_sk"], errors="coerce")

# Validate conversion
print("Data Types Fixed.")


Data Types Fixed.


#### Non - Negative

In [33]:
# Checking for negative or zero values in lcp_rank
invalid_lcp_rank = df_lcp_data[df_lcp_data["lcp_rank"] <= 0]

if not invalid_lcp_rank.empty:
    print("Found rows with zero or negative values in lcp_rank:")
    display(invalid_lcp_rank)
else:
    print("All values in lcp_rank are positive.")


All values in lcp_rank are positive.


#### Duplicates

In [34]:
# Checking for duplicate rows in lcp_data based on key columns
initial_count_lcp = len(df_lcp_data)

# Identify duplicates based on key columns
duplicates_lcp = df_lcp_data.duplicated(subset=["material_sk", "material_code", "origin_location_code", "destination_location_code"], keep=False)

# Count and calculate percentage
duplicate_count = duplicates_lcp.sum()
duplicate_percentage = (duplicate_count / initial_count_lcp) * 100  

if duplicate_count > 0:
    print(f"Found {duplicate_count} ({duplicate_percentage:.2f}%) duplicate rows in lcp_data.")
    display(df_lcp_data[duplicates_lcp].head(10))  # Displaying a sample
else:
    print("No duplicates found in lcp_data.")


Found 79717 (98.54%) duplicate rows in lcp_data.


Unnamed: 0,material_sk,lcp_rank,material_code,origin_location_code,destination_location_code,release_week_id,ins_gmt_ts,origin_plant_sk,destination_plant_sk
0,3769,2,11524,DE05,DE02,202451,20241227124946,573,2566
1,3480,1,6365,DE05,DE12,202451,20241227124946,573,520
2,1899,1,6590,DE05,DE06,202451,20241227124946,573,4766
3,2241,1,5733,DE05,DE02,202451,20241227124946,573,2566
4,3448,1,6490,DE46,DE06,202451,20241227124946,732,4766
5,1286,2,3337,BE50,BE22,202451,20241227124946,50,62
6,1884,1,5767,DE13,DE02,202451,20241227124946,1402,2566
7,813,3,3372,BE50,IT11,202451,20241227124946,50,2373
8,1233,1,3650,NL03,NL15,202451,20241227124946,1941,1002
9,2110,2,5716,DE05,DE12,202451,20241227124946,573,520


#### LCP Rank Validation

In [35]:
# Checking if the same material, origin, destination has multiple lcp_ranks in the same release_week_id
initial_count_lcp = len(df_lcp_data)

# Identify duplicates based on material, origin, destination, and release_week_id
multiple_lcp_per_week = df_lcp_data.duplicated(
    subset=["material_sk", "origin_location_code", "destination_location_code", "release_week_id","origin_plant_sk","destination_plant_sk"], keep=False
)

# Count and calculate percentage
multiple_lcp_count = multiple_lcp_per_week.sum()
multiple_lcp_percentage = (multiple_lcp_count / initial_count_lcp) * 100  

if multiple_lcp_count > 0:
    print(f"Found {multiple_lcp_count} ({multiple_lcp_percentage:.2f}%) rows where the same material, origin, and destination have multiple lcp_ranks in the same release_week_id.")
    
    # Display sample data
    display(df_lcp_data[multiple_lcp_per_week].sort_values(
        ["material_sk", "origin_location_code", "destination_location_code","origin_plant_sk","destination_plant_sk", "release_week_id"]
    ).head(10))  # Displaying a sample of 10 rows
else:
    print("No multiple lcp_rank values found for the same material and locations in the same release_week_id.")


Found 40043 (49.50%) rows where the same material, origin, and destination have multiple lcp_ranks in the same release_week_id.


Unnamed: 0,material_sk,lcp_rank,material_code,origin_location_code,destination_location_code,release_week_id,ins_gmt_ts,origin_plant_sk,destination_plant_sk
3889,685,1,3393,BE02,BE03,202508,20250221042329,85,73
4789,685,1,3393,BE02,BE03,202508,20250311074312,85,73
6589,685,1,3393,BE02,BE03,202512,20250319080954,85,73
7489,685,1,3393,BE02,BE03,202512,20250320133739,85,73
8389,685,1,3393,BE02,BE03,202512,20250321062306,85,73
4069,685,1,3393,BE02,BE04,202508,20250221042329,85,142
4969,685,1,3393,BE02,BE04,202508,20250311074312,85,142
6769,685,1,3393,BE02,BE04,202512,20250319080954,85,142
7669,685,1,3393,BE02,BE04,202512,20250320133739,85,142
8569,685,1,3393,BE02,BE04,202512,20250321062306,85,142


#### Handling LCP rank by considering latest entries

In [36]:
# Store initial row count before removing duplicates
initial_rows_lcp = len(df_lcp_data)

# Convert ins_gmt_ts to numeric format for sorting (since it's stored as a string)
df_lcp_data["ins_gmt_ts"] = pd.to_numeric(df_lcp_data["ins_gmt_ts"], errors="coerce")

# Sorting by material_sk, origin_location_code, destination_location_code, release_week_id and latest timestamp
df_lcp_data = df_lcp_data.sort_values(["material_sk", "origin_location_code", "destination_location_code","origin_plant_sk","destination_plant_sk", "release_week_id", "ins_gmt_ts"], ascending=[True, True, True, True,True,True, False])

# Dropping duplicates while keeping the latest entry based on ins_gmt_ts
df_lcp_data = df_lcp_data.drop_duplicates(subset=["material_sk", "origin_location_code", "destination_location_code", "origin_plant_sk", "destination_plant_sk", "release_week_id"], keep="first")

# Store final row count after removing duplicates
final_rows_lcp = len(df_lcp_data)

# Calculate removed rows and percentage
rows_removed = initial_rows_lcp - final_rows_lcp
percentage_removed = (rows_removed / initial_rows_lcp) * 100

# Display results
print(f"Duplicate entries removed: {rows_removed} rows ({percentage_removed:.2f}%).")
print(f"Final Rows Remaining: {final_rows_lcp}")


Duplicate entries removed: 23805 rows (29.43%).
Final Rows Remaining: 57093


#### Final preprocessed data saved to lcp_data

In [37]:
import os

# Define the file path for saving
lcp_file_path = os.path.join(preprocessed_path, "lcp_data.csv")

# Save the processed data
df_lcp_data.to_csv(lcp_file_path, index=False)

print(f"Processed LCP data saved to: {lcp_file_path}")


Processed LCP data saved to: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/lcp_data.csv


### Input UOM Full Preprocessing

In [38]:
# Load input_UOM_full.csv and Select Required Columns

# Define the dataset name
file_name = "input_UOM_full.csv"
df_uom_full = dataframes[file_name]

# Define required columns
required_columns_uom = ["material_code", "material_sk", "target_unit_of_measure_code", "conversion_numerator", "conversion_denumerator"]

# Retain only required columns
df_uom_full = df_uom_full[required_columns_uom]

# Store updated DataFrame
dataframes[file_name] = df_uom_full

# Display confirmation
print(f"Loaded and retained required columns for {file_name}:")
print(list(df_uom_full.columns))

# Display first few rows
display(df_uom_full.head())


Loaded and retained required columns for input_UOM_full.csv:
['material_code', 'material_sk', 'target_unit_of_measure_code', 'conversion_numerator', 'conversion_denumerator']


Unnamed: 0,material_code,material_sk,target_unit_of_measure_code,conversion_numerator,conversion_denumerator
0,185,92,PAL,70.0,1.0
1,80,10,PAL,8.0,1.0
2,26,6,PAL,70.0,1.0
3,73,128,HL,5.0,1.0
4,18,13,HL,25.0,3.0


#### Detecting unique measure code

In [39]:
# Detect if any values other than 'HL' and 'PAL' exist

# Get unique values
unique_uom = df_uom_full["target_unit_of_measure_code"].unique()

print(f"Unique Values in target_unit_of_measure_code: {unique_uom}")

# Identify invalid values
valid_uoms = ["HL", "PAL"]
invalid_uoms = df_uom_full[~df_uom_full["target_unit_of_measure_code"].isin(valid_uoms)]

# Display results
if not invalid_uoms.empty:
    print(f"Found {len(invalid_uoms)} rows with invalid target_unit_of_measure_code:")
    display(invalid_uoms["target_unit_of_measure_code"].value_counts())
else:
    print("All values in target_unit_of_measure_code are either 'HL' or 'PAL'.")


Unique Values in target_unit_of_measure_code: ['PAL' 'HL']
All values in target_unit_of_measure_code are either 'HL' or 'PAL'.


#### Remove rows where unit measure code is not 'HL' and 'PAL'

In [40]:
# Remove rows where target_unit_of_measure_code is NOT 'HL' or 'PAL'

# Count before filtering
initial_rows = len(df_uom_full)

# Apply filter
df_uom_full = df_uom_full[df_uom_full["target_unit_of_measure_code"].isin(["HL", "PAL"])]

# Count after filtering
filtered_rows = len(df_uom_full)

# Calculate percentage of rows retained
retained_percentage = (filtered_rows / initial_rows) * 100

print(f"Filter Applied: Kept {filtered_rows} rows ({retained_percentage:.2f}%) out of {initial_rows}.")


Filter Applied: Kept 126813 rows (100.00%) out of 126813.


#### Missing Values

In [41]:
# Detect Missing Values in input_UOM_full.csv

# Count missing values
missing_count = df_uom_full.isnull().sum()
missing_percentage = (missing_count / len(df_uom_full)) * 100  

# Filter only columns with missing values
missing_data = pd.DataFrame({'Missing_Count': missing_count, 'Missing_Percentage': missing_percentage})
missing_data = missing_data[missing_data['Missing_Count'] > 0]

if not missing_data.empty:
    print("\nMissing Values Found in input_UOM_full.csv:\n")
    missing_data["Missing_Percentage"] = missing_data["Missing_Percentage"].apply(lambda x: f"{x:.2f}%")
    display(missing_data)
else:
    print("\nNo Missing Values Found in input_uom_full.csv.")



No Missing Values Found in input_uom_full.csv.


#### Detecting Duplicates

In [42]:
# Detect Duplicate Rows in input_UOM_full.csv

# Check for duplicates based on key columns
duplicate_mask = df_uom_full.duplicated(subset=["material_sk", "material_code", "target_unit_of_measure_code"], keep=False)

# Count duplicate rows
duplicate_count = duplicate_mask.sum()
duplicate_percentage = (duplicate_count / len(df_uom_full)) * 100

print(f"\nDuplicate Rows in input_uom_full.csv: {duplicate_count} ({duplicate_percentage:.2f}%)")

# Display duplicate records if found
if duplicate_count > 0:
    display(df_uom_full[duplicate_mask].sort_values(by=["material_sk", "material_code", "target_unit_of_measure_code"]))
else:
    print("\nNo Duplicate Rows Found in input_uom_full.csv")



Duplicate Rows in input_uom_full.csv: 0 (0.00%)

No Duplicate Rows Found in input_uom_full.csv


#### Data Type Validation

In [43]:
# Data Type Validation for input_UOM_full.csv

# Define expected data types
expected_dtypes = {
    "material_code": "Int64",
    "material_sk": "Int64",
    "target_unit_of_measure_code": "object",
    "conversion_numerator": "float64",
    "conversion_denumerator": "float64"
}

# Identify mismatches
dtype_mismatches = {
    col: df_uom_full[col].dtype for col in expected_dtypes if df_uom_full[col].dtype != expected_dtypes[col]
}

# Display results
if dtype_mismatches:
    print("Data Type Mismatches Found:")
    for col, dtype in dtype_mismatches.items():
        print(f"   - Column: {col}, Found: {dtype}, Expected: {expected_dtypes[col]}")
else:
    print("All Columns Have Correct Data Types.")


Data Type Mismatches Found:
   - Column: material_code, Found: object, Expected: Int64
   - Column: material_sk, Found: object, Expected: Int64
   - Column: conversion_numerator, Found: object, Expected: float64
   - Column: conversion_denumerator, Found: object, Expected: float64


#### Handling data type mismatches

In [44]:
# Fixing Data Type Mismatches for input_UOM_full.csv

# Convert columns to correct types
df_uom_full["material_code"] = pd.to_numeric(df_uom_full["material_code"], errors="coerce")
df_uom_full["material_sk"] = pd.to_numeric(df_uom_full["material_sk"], errors="coerce")
df_uom_full["conversion_numerator"] = pd.to_numeric(df_uom_full["conversion_numerator"], errors="coerce")
df_uom_full["conversion_denumerator"] = pd.to_numeric(df_uom_full["conversion_denumerator"], errors="coerce")

# Store updated DataFrame
dataframes["input_uom_full.csv"] = df_uom_full

print("Data Types Fixed.")


Data Types Fixed.


#### Negative values

In [45]:
# Detect Negative Values in input_UOM_full.csv

# Check for negative values in relevant columns
negative_values = df_uom_full[
    (df_uom_full["conversion_numerator"] <= 0) | 
    (df_uom_full["conversion_denumerator"] <= 0)
]


if not negative_values.empty:
    negative_count = len(negative_values)
    negative_percentage = (negative_count / len(df_uom_full)) * 100
    print(f"Found {negative_count} ({negative_percentage:.2f}%) rows with negative values in conversion_numerator or conversion_denumerator.")
    
   
    display(negative_values.head(10))
else:
    print("No Negative Values Found in conversion_numerator or conversion_denumerator.")


No Negative Values Found in conversion_numerator or conversion_denumerator.


#### Detecting invalid values in conversion denumerator

In [46]:
# Detect Zero Values in conversion_denumerator

zero_denumerator = df_uom_full[df_uom_full["conversion_denumerator"] == 0]

# Display results
if not zero_denumerator.empty:
    zero_count = len(zero_denumerator)
    zero_percentage = (zero_count / len(df_uom_full)) * 100
    print(f"Found {zero_count} ({zero_percentage:.2f}%) rows where conversion_denumerator is zero.")
    
    # Show sample
    display(zero_denumerator.head(10))
else:
    print("No Zero Values Found in conversion_denumerator.")


No Zero Values Found in conversion_denumerator.


#### Final saving to uom_full.csv

In [47]:

uom_full_path = os.path.join(preprocessed_path, "uom_full.csv")

df_uom_full.to_csv(uom_full_path, index=False)

print(f"Processed UOM Full data saved to {uom_full_path}")


Processed UOM Full data saved to ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/uom_full.csv


### Input UOM Weight Preprocessing

In [48]:
# Define the dataset name
file_name = "input_UOM_weight.csv"
df_uom_weight = dataframes[file_name]

# Select only required columns
required_columns_weight = ["material_code", "pal_weight_kg"]
df_uom_weight = df_uom_weight[required_columns_weight]

# Store updated DataFrame
dataframes[file_name] = df_uom_weight

# Display confirmation
print(f"Loaded and retained required columns for {file_name}:")
print(list(df_uom_weight.columns))

# Display the first few rows
df_uom_weight.head()


Loaded and retained required columns for input_UOM_weight.csv:
['material_code', 'pal_weight_kg']


Unnamed: 0,material_code,pal_weight_kg
0,100298,1005.76
1,100584,936.0
2,100787,928.0
3,100036,770.2
4,100008,167.14


#### Missing Values

In [49]:
# Check for missing values
missing_count = df_uom_weight.isnull().sum()
missing_percentage = (missing_count / len(df_uom_weight)) * 100  

# Filter columns with missing values
missing_data = pd.DataFrame({'Missing_Count': missing_count, 'Missing_Percentage': missing_percentage})
missing_data = missing_data[missing_data['Missing_Count'] > 0]

if not missing_data.empty:
    print(f"\nMissing Values Found in {file_name}:\n")
    
    missing_data["Missing_Percentage"] = missing_data["Missing_Percentage"].apply(lambda x: f"{x:.2f}%")
    
    display(missing_data)
else:
    print(f"\nNo Missing Values Found in {file_name}.")



No Missing Values Found in input_UOM_weight.csv.


#### Duplicates

In [50]:
# Check for duplicate rows
duplicate_count = df_uom_weight.duplicated().sum()
duplicate_percentage = (duplicate_count / len(df_uom_weight)) * 100  

print(f"Duplicate Rows in {file_name}: {duplicate_count} ({duplicate_percentage:.2f}%)")

if duplicate_count > 0:
    display(df_uom_weight[df_uom_weight.duplicated()])
else:
    print(f"\nNo Duplicate Rows Found in {file_name}")


Duplicate Rows in input_UOM_weight.csv: 0 (0.00%)

No Duplicate Rows Found in input_UOM_weight.csv


#### Data Types Validation

In [51]:
# Define expected data types
expected_dtypes = {
    "material_code": "Int64",
    "pal_weight_kg": "float64"
}

# Check actual dtypes
dtype_mismatches = {}

for col, expected_dtype in expected_dtypes.items():
    actual_dtype = df_uom_weight[col].dtype
    if actual_dtype != expected_dtype:
        dtype_mismatches[col] = (actual_dtype, expected_dtype)

# Display results
if dtype_mismatches:
    print("Data Type Mismatches Found:")
    for col, (actual, expected) in dtype_mismatches.items():
        print(f"   - Column: {col}, Found: {actual}, Expected: {expected}")
else:
    print("All Columns Have Correct Data Types.")


Data Type Mismatches Found:
   - Column: material_code, Found: object, Expected: Int64
   - Column: pal_weight_kg, Found: object, Expected: float64


#### Handling Data Type Mismatches

In [52]:
# Convert data types
df_uom_weight["material_code"] = pd.to_numeric(df_uom_weight["material_code"], errors="coerce")
df_uom_weight["pal_weight_kg"] = pd.to_numeric(df_uom_weight["pal_weight_kg"], errors="coerce")

# Confirm Fix
print("Data Types Fixed.")


Data Types Fixed.


In [53]:
# Check for negative or zero values in pal_weight_kg
invalid_weights = df_uom_weight[df_uom_weight["pal_weight_kg"] <= 0]

if not invalid_weights.empty:
    invalid_count = len(invalid_weights)
    invalid_percentage = (invalid_count / len(df_uom_weight)) * 100
    print(f"Found {invalid_count} ({invalid_percentage:.2f}%) rows with non-positive values in pal_weight_kg.")
    
    # Show sample
    display(invalid_weights.head(10))
else:
    print("No Negative or Zero Values Found in pal_weight_kg.")


No Negative or Zero Values Found in pal_weight_kg.


In [54]:
# Define the output file path
uom_weight_file_path = os.path.join(preprocessed_path, "uom_weight.csv")

# Save to CSV
df_uom_weight.to_csv(uom_weight_file_path, index=False)

print(f"Processed UOM Weight data saved to {uom_weight_file_path}")


Processed UOM Weight data saved to ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/uom_weight.csv


### Merging UOM full and weights 

In [55]:
# Load preprocessed UOM Full and UOM Weight data
uom_full_path = os.path.join(preprocessed_path, "uom_full.csv")
uom_weight_path = os.path.join(preprocessed_path, "uom_weight.csv")

df_uom_full = pd.read_csv(uom_full_path)
df_uom_weight = pd.read_csv(uom_weight_path)

# Perform outer join on 'material_code'
df_uom_merged = df_uom_full.merge(df_uom_weight, on="material_code", how="outer")

# Display result
print(f"Outer Join Completed: {len(df_uom_merged)} rows.")
df_uom_merged.head()


Outer Join Completed: 126838 rows.


Unnamed: 0,material_code,material_sk,target_unit_of_measure_code,conversion_numerator,conversion_denumerator,pal_weight_kg
0,1.0,1.0,HL,10.0,1.0,
1,1.0,1.0,PAL,55.0,1.0,
2,2.0,49.0,PAL,55.0,1.0,
3,2.0,49.0,HL,10.0,1.0,
4,3.0,9.0,HL,25.0,3.0,


#### Merge and drop null values

In [56]:
# Count total rows before dropping
total_rows_before = len(df_uom_merged)

# Count missing values before dropping
missing_counts = df_uom_merged.isnull().sum()
missing_total = missing_counts.sum()
missing_percentage = (missing_total / (total_rows_before * len(df_uom_merged.columns))) * 100

print(f"Missing values before dropping: {missing_total} ({missing_percentage:.2f}%)")

# Drop rows where any column is null
df_uom_merged.dropna(how='any', inplace=True)

# Count total rows after dropping
total_rows_after = len(df_uom_merged)
rows_dropped = total_rows_before - total_rows_after
rows_dropped_percentage = (rows_dropped / total_rows_before) * 100

# Count missing values after dropping
missing_total_after = df_uom_merged.isnull().sum().sum()

print(f"Missing values after dropping: {missing_total_after}")
print(f"Rows remaining after drop: {total_rows_after} ({100 - rows_dropped_percentage:.2f}% of original data retained)")
print(f"Rows dropped: {rows_dropped} ({rows_dropped_percentage:.2f}%)")


Missing values before dropping: 101653 (13.36%)
Missing values after dropping: 0
Rows remaining after drop: 25261 (19.92% of original data retained)
Rows dropped: 101577 (80.08%)


#### Save to the uom_df.csv

In [57]:
# Define the output file path
uom_df_file_path = os.path.join(preprocessed_path, "uom_df.csv")

# Save to CSV
df_uom_merged.to_csv(uom_df_file_path, index=False)

print(f"Final Processed UOM Data saved to {uom_df_file_path}")


Final Processed UOM Data saved to ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/uom_df.csv


In [58]:
# Step: Compute PC_HL Conversion Ratio

# Avoid division by zero
df_uom_merged = df_uom_merged[df_uom_merged["conversion_denumerator"] != 0]

# Calculate conversion ratio
df_uom_merged["PC_HL_Conversion"] = df_uom_merged["conversion_denumerator"] / df_uom_merged["conversion_numerator"]

# Display summary
print("PC_HL Conversion Ratio Computed.")
display(df_uom_merged.head(20))


PC_HL Conversion Ratio Computed.


Unnamed: 0,material_code,material_sk,target_unit_of_measure_code,conversion_numerator,conversion_denumerator,pal_weight_kg,PC_HL_Conversion
143,73.0,128.0,HL,5.0,1.0,545.0,0.2
144,73.0,128.0,PAL,20.0,1.0,545.0,0.05
339,171.0,368.0,HL,10.0,3.0,354.6,0.3
340,171.0,368.0,PAL,8.0,1.0,354.6,0.125
343,173.0,282.0,HL,10.0,3.0,354.6,0.3
344,173.0,282.0,PAL,8.0,1.0,354.6,0.125
1662,3290.0,692.0,HL,50.0,3.0,1050.0,0.06
1663,3290.0,692.0,PAL,70.0,1.0,1050.0,0.014286
1664,3291.0,899.0,PAL,70.0,1.0,1048.0,0.014286
1665,3291.0,899.0,HL,50.0,3.0,1048.0,0.06


In [59]:
# Step: Save Processed UOM Data

# Define save path
uom_file_path = os.path.join(preprocessed_path, "uom_df.csv")

# Save to CSV
df_uom_merged.to_csv(uom_file_path, index=False)

print(f"Processed UOM data saved to {uom_file_path}")


Processed UOM data saved to ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/uom_df.csv


### Input Planned Production Preprocessing

In [60]:
# Define the dataset name
file_name = "input_Planned_production.csv"
df_planned_production = dataframes[file_name]

# Define required columns
required_columns_planned_production = [
    "duration", "end_outflow_ts", "material_sk", "material_code",
    "plant_sk", "plant_code", "product_unit", "quantity",
    "start_inflow_ts", "status"
]

# Retain only necessary columns
df_planned_production = df_planned_production[required_columns_planned_production]

# Store updated DataFrame
dataframes[file_name] = df_planned_production

# Display confirmation
print(f"Loaded and retained required columns for {file_name}:")
print(list(df_planned_production.columns))


Loaded and retained required columns for input_Planned_production.csv:
['duration', 'end_outflow_ts', 'material_sk', 'material_code', 'plant_sk', 'plant_code', 'product_unit', 'quantity', 'start_inflow_ts', 'status']


#### Data Type Validation

In [61]:
# Detect Data Type Mismatches

# Define expected data types
expected_dtypes = {
    "plant_sk": "Int64",
    "material_sk": "Int64",
    "material_code": "Int64",
    "plant_code": "object",
    "start_inflow_ts": "datetime64[ns]",
    "end_outflow_ts": "datetime64[ns]",
    "quantity": "float64",
    "product_unit": "object",
    "duration": "float64",
    "status": "object"
}

# Check actual dtypes
dtype_mismatches = {}

for col, expected_dtype in expected_dtypes.items():
    actual_dtype = df_planned_production[col].dtype
    if actual_dtype != expected_dtype:
        dtype_mismatches[col] = (actual_dtype, expected_dtype)

# Display results
if dtype_mismatches:
    print("Data Type Mismatches Found:")
    for col, (actual, expected) in dtype_mismatches.items():
        print(f"   - Column: {col}, Found: {actual}, Expected: {expected}")
else:
    print("All Columns Have Correct Data Types.")


Data Type Mismatches Found:
   - Column: plant_sk, Found: object, Expected: Int64
   - Column: material_sk, Found: object, Expected: Int64
   - Column: material_code, Found: object, Expected: Int64
   - Column: start_inflow_ts, Found: object, Expected: datetime64[ns]
   - Column: end_outflow_ts, Found: object, Expected: datetime64[ns]
   - Column: quantity, Found: object, Expected: float64
   - Column: duration, Found: object, Expected: float64


#### Handling data type mismatches 

In [62]:
# Fix Data Type Mismatches

# Convert to correct types
df_planned_production["plant_sk"] = pd.to_numeric(df_planned_production["plant_sk"], errors="coerce")
df_planned_production["material_sk"] = pd.to_numeric(df_planned_production["material_sk"], errors="coerce")
df_planned_production["material_code"] = pd.to_numeric(df_planned_production["material_code"], errors="coerce")
df_planned_production["quantity"] = pd.to_numeric(df_planned_production["quantity"], errors="coerce")
df_planned_production["duration"] = pd.to_numeric(df_planned_production["duration"], errors="coerce")

# Convert timestamps to datetime format
df_planned_production["start_inflow_ts"] = pd.to_datetime(df_planned_production["start_inflow_ts"], errors="coerce")
df_planned_production["end_outflow_ts"] = pd.to_datetime(df_planned_production["end_outflow_ts"], errors="coerce")

print("Data Types Fixed.")


Data Types Fixed.


#### Missing Values

In [63]:
# Detect Missing Values
print(f"Checking Missing Values in {file_name}\n")

# Count missing values
missing_count = df_planned_production.isnull().sum()
missing_percentage = (missing_count / len(df_planned_production)) * 100

# Filter only columns with missing values
missing_data = pd.DataFrame({'Missing_Count': missing_count, 'Missing_Percentage': missing_percentage})
missing_data = missing_data[missing_data["Missing_Count"] > 0]

# Display results
if not missing_data.empty:
    missing_data["Missing_Percentage"] = missing_data["Missing_Percentage"].apply(lambda x: f"{x:.2f}%")
    print(f"Missing Values Found in {file_name}:")
    display(missing_data)
else:
    print(f"No Missing Values Found in {file_name}.")


Checking Missing Values in input_Planned_production.csv

Missing Values Found in input_Planned_production.csv:


Unnamed: 0,Missing_Count,Missing_Percentage
material_code,65,10.20%


#### Duplicates

In [64]:
# Detect Duplicate Entries Based on (plant_sk, material_sk, start_inflow_ts, end_outflow_ts)

# Convert timestamps to datetime for proper validation
df_planned_production["start_inflow_ts"] = pd.to_datetime(df_planned_production["start_inflow_ts"], errors="coerce")
df_planned_production["end_outflow_ts"] = pd.to_datetime(df_planned_production["end_outflow_ts"], errors="coerce")

# Detect duplicates based on material, plant, and timestamps
duplicate_entries = df_planned_production.duplicated(
    subset=["plant_sk", "material_sk", "start_inflow_ts", "end_outflow_ts"], keep=False
)

# Count and percentage of duplicates
duplicate_count = duplicate_entries.sum()
duplicate_percentage = (duplicate_count / len(df_planned_production)) * 100

print(f"Duplicate (plant_sk, material_sk, start_inflow_ts, end_outflow_ts) pairs found: {duplicate_count} ({duplicate_percentage:.2f}%)")

# Display sample duplicate entries
if duplicate_count > 0:
    display(df_planned_production[duplicate_entries].sort_values(["plant_sk", "material_sk", "start_inflow_ts"]).head(10))
else:
    print("No duplicates found.")


Duplicate (plant_sk, material_sk, start_inflow_ts, end_outflow_ts) pairs found: 0 (0.00%)
No duplicates found.


In [65]:
# Group by duplicate keys and check if other columns have different values
duplicate_groups = df_planned_production[duplicate_entries].groupby(
    ["plant_sk", "material_sk", "start_inflow_ts", "end_outflow_ts"]
).agg({
    "quantity": "nunique",
    "status": "nunique",
    "product_unit": "nunique",
    "duration": "nunique"
})

# Display cases where values differ in any column
mismatched_values = duplicate_groups[(duplicate_groups > 1).any(axis=1)]
if not mismatched_values.empty:
    print("Some duplicates have different values in 'quantity', 'status', 'product_unit', or 'duration'.")
    display(mismatched_values)
else:
    print("All duplicate rows are exact copies (safe to remove).")


All duplicate rows are exact copies (safe to remove).




#### Filtering based on latest entries 

In [66]:
# Ensure timestamps are in datetime format
df_planned_production["start_inflow_ts"] = pd.to_datetime(df_planned_production["start_inflow_ts"])
df_planned_production["end_outflow_ts"] = pd.to_datetime(df_planned_production["end_outflow_ts"])

# Sort by start_inflow_ts and drop duplicates, keeping the latest entry
df_planned_production = df_planned_production.sort_values("start_inflow_ts").drop_duplicates(
    subset=["plant_sk", "material_sk", "start_inflow_ts", "end_outflow_ts"],
    keep="last"
)

print("Kept the latest planned production entry based on start_inflow_ts.")


Kept the latest planned production entry based on start_inflow_ts.




#### Verification

In [67]:
remaining_duplicates = df_planned_production.duplicated(
    subset=["plant_sk", "material_sk", "start_inflow_ts", "end_outflow_ts"], keep=False
)
print(f"Remaining duplicates: {remaining_duplicates.sum()}")


Remaining duplicates: 0


In [68]:
# Check unique values in product_unit column
unique_units = df_planned_production["product_unit"].unique()
print("Unique product units:", unique_units)


Unique product units: ['HLT' 'PCE' 'hl' 'min' 'CIP' 'minutes']


In [69]:
# Count rows that will be removed
non_pce_rows = df_planned_production[df_planned_production["product_unit"] != "PCE"]
non_pce_count = len(non_pce_rows)
non_pce_percentage = (non_pce_count / len(df_planned_production)) * 100

print(f"Rows with non-PCE product units: {non_pce_count} ({non_pce_percentage:.2f}%)")
display(non_pce_rows["product_unit"].value_counts())


Rows with non-PCE product units: 462 (72.53%)


product_unit
HLT        429
CIP         11
hl          10
minutes      7
min          5
Name: count, dtype: int64

#### Filtering based on product unit 'PCE'

In [70]:
# Apply Filtering to Keep Only PCE Product Units
initial_count_planned_production = len(df_planned_production)
df_planned_production = df_planned_production[df_planned_production["product_unit"] == "PCE"]

# Calculate remaining rows and percentage
remaining_rows = len(df_planned_production)
remaining_percentage = (remaining_rows / initial_count_planned_production) * 100

# Display confirmation
print(f"Filter Applied: Kept {remaining_rows} rows ({remaining_percentage:.2f}%) out of {initial_count_planned_production}.")


Filter Applied: Kept 175 rows (27.47%) out of 637.


#### Multi-Day production filtering

In [71]:
# Detect Multi-Day Production Events  

import pandas as pd

# Identify rows where the start and end timestamps fall on different calendar days
df_planned_production["start_inflow_ts"] = pd.to_datetime(df_planned_production["start_inflow_ts"], errors="coerce")
df_planned_production["end_outflow_ts"] = pd.to_datetime(df_planned_production["end_outflow_ts"], errors="coerce")

# Identify multi-day production (different dates for start and end)
multi_day_production = df_planned_production[
    df_planned_production["start_inflow_ts"].dt.date != df_planned_production["end_outflow_ts"].dt.date
]

# Count and percentage of multi-day records
multi_day_count = len(multi_day_production)
multi_day_percentage = (multi_day_count / len(df_planned_production)) * 100

print(f"Multi-Day Production Events Found: {multi_day_count} ({multi_day_percentage:.2f}%)")

# Display a few samples if found
if multi_day_count > 0:
    display(multi_day_production.head(10))
else:
    print("No Multi-Day Production Events Found.")


Multi-Day Production Events Found: 101 (57.71%)


Unnamed: 0,duration,end_outflow_ts,material_sk,material_code,plant_sk,plant_code,product_unit,quantity,start_inflow_ts,status
432,3056.0,2025-04-11 08:56:00,35047,53478.0,73,BE03,PCE,69280.0,2025-04-09 06:00:00,running
134,1112.0,2025-04-10 00:32:00,3505,5688.0,2566,DE02,PCE,21000.0,2025-04-09 06:00:00,processorder
204,1293.0,2025-04-10 03:33:00,5091,6485.0,4766,DE06,PCE,16000.0,2025-04-09 06:00:00,processorder
250,1924.0,2025-04-10 14:04:00,30442,61743.0,73,BE03,PCE,46504.0,2025-04-09 06:00:00,running
470,1235.0,2025-04-10 02:35:00,42945,73296.0,4766,DE06,PCE,25000.0,2025-04-09 06:00:00,processorder
337,1387.0,2025-04-10 05:07:00,1877515,107699.0,142,BE04,PCE,38146.0,2025-04-09 06:00:00,running
20,1405.0,2025-04-10 05:25:00,1882678,103544.0,2566,DE02,PCE,28500.0,2025-04-09 06:00:00,processorder
354,1184.0,2025-04-10 01:44:00,4977,21389.0,4766,DE06,PCE,6300.0,2025-04-09 06:00:00,processorder
499,1905.0,2025-04-10 13:45:00,1889501,100626.0,142,BE04,PCE,1143.400024,2025-04-09 06:00:00,processorder
421,1193.0,2025-04-10 01:53:00,1880479,107911.0,142,BE04,PCE,64455.0,2025-04-09 06:00:00,running


In [72]:
df_planned_production["duration"] = pd.to_numeric(df_planned_production["duration"], errors="coerce")
df_planned_production["quantity"] = pd.to_numeric(df_planned_production["quantity"], errors="coerce")


#### Splittinng Multi-day production into multiple rows

In [73]:
import pandas as pd
from datetime import datetime, timedelta

def split_rows_adjusted(df):
    """
    Function to split multi-day production into multiple rows
    and allocate quantity proportionally using the existing 'duration' column.
    """
    new_rows = []

    for _, row in df.iterrows():
        start_ts = row["start_inflow_ts"]
        end_ts = row["end_outflow_ts"]
        total_duration = row["duration"]  # Use given duration column
        total_quantity = row["quantity"]

        # Debugging Print Statements
        print(f"\nDEBUG: Processing Row -> Start: {start_ts}, End: {end_ts}")
        print(f"DEBUG: Total Duration (minutes): {total_duration}, Total Quantity: {total_quantity}")

        # Ensure total_duration is positive
        if total_duration <= 0:
            print(f"Warning: Invalid duration detected for row: {row}")
            continue

        current_ts = start_ts
        allocated_quantity = 0  # Track allocated quantity for correction

        while current_ts < end_ts:
            midnight_ts = datetime.combine(current_ts.date() + timedelta(days=1), datetime.min.time())

            # Determine the end time for the current row
            split_end_ts = min(midnight_ts, end_ts)

            # Compute the duration for the current row from the 'duration' column
            duration_today = (split_end_ts - current_ts).total_seconds() / 60  # Convert to minutes

            # Safe Division (Avoid ZeroDivisionError)
            quantity_today = (duration_today / max(total_duration, 1e-6)) * total_quantity
            allocated_quantity += quantity_today  # Track total allocated quantity

            print(f"DEBUG: duration_today = {duration_today}, quantity_today = {quantity_today}")

            # Append new row
            new_rows.append({
                "plant_sk": row["plant_sk"],
                "material_sk": row["material_sk"],
                "material_code": row["material_code"],
                "plant_code": row["plant_code"],
                "start_inflow_ts": current_ts,
                "end_outflow_ts": split_end_ts,
                "duration": duration_today,
                "quantity": quantity_today,
                "product_unit": row["product_unit"],
                "status": row["status"]
            })

            # Move to the next segment
            current_ts = split_end_ts

        # **Apply Correction to Last Row**
        quantity_mismatch = total_quantity - allocated_quantity
        if abs(quantity_mismatch) > 1e-6:  # If there's a mismatch
            new_rows[-1]["quantity"] += quantity_mismatch  # Adjust last row to absorb mismatch
            print(f"Correction Applied: Adjusted last row by {quantity_mismatch}")

    return pd.DataFrame(new_rows)

# Apply the function
df_multi_day_split = split_rows_adjusted(multi_day_production)



DEBUG: Processing Row -> Start: 2025-04-09 06:00:00, End: 2025-04-11 08:56:00


DEBUG: Total Duration (minutes): 3056.0, Total Quantity: 69280.0
DEBUG: duration_today = 1080.0, quantity_today = 24483.769633507854
DEBUG: duration_today = 1440.0, quantity_today = 32645.02617801047
DEBUG: duration_today = 536.0, quantity_today = 12151.204188481675

DEBUG: Processing Row -> Start: 2025-04-09 06:00:00, End: 2025-04-10 00:32:00
DEBUG: Total Duration (minutes): 1112.0, Total Quantity: 21000.0
DEBUG: duration_today = 1080.0, quantity_today = 20395.68345323741
DEBUG: duration_today = 32.0, quantity_today = 604.31654676259

DEBUG: Processing Row -> Start: 2025-04-09 06:00:00, End: 2025-04-10 03:33:00
DEBUG: Total Duration (minutes): 1293.0, Total Quantity: 16000.0
DEBUG: duration_today = 1080.0, quantity_today = 13364.269141531322
DEBUG: duration_today = 213.0, quantity_today = 2635.7308584686775

DEBUG: Processing Row -> Start: 2025-04-09 06:00:00, End: 2025-04-10 14:04:00
DEBUG: Total Duration (minutes): 1924.0, Total Quantity: 46504.0
DEBUG: duration_today = 1080.0, qua


DEBUG: Processing Row -> Start: 2025-04-09 15:40:00, End: 2025-04-10 16:45:00
DEBUG: Total Duration (minutes): 1505.0, Total Quantity: 33011.0
DEBUG: duration_today = 500.0, quantity_today = 10967.109634551496
DEBUG: duration_today = 1005.0, quantity_today = 22043.890365448508

DEBUG: Processing Row -> Start: 2025-04-09 16:32:00, End: 2025-04-10 09:11:00
DEBUG: Total Duration (minutes): 999.0, Total Quantity: 20161.0
DEBUG: duration_today = 448.0, quantity_today = 9041.169169169169
DEBUG: duration_today = 551.0, quantity_today = 11119.830830830831

DEBUG: Processing Row -> Start: 2025-04-09 16:45:00, End: 2025-04-10 02:44:00
DEBUG: Total Duration (minutes): 599.0, Total Quantity: 13944.0
DEBUG: duration_today = 435.0, quantity_today = 10126.27712854758
DEBUG: duration_today = 164.0, quantity_today = 3817.722871452421

DEBUG: Processing Row -> Start: 2025-04-09 17:00:00, End: 2025-04-11 07:32:00
DEBUG: Total Duration (minutes): 2312.0, Total Quantity: 109092.890625
DEBUG: duration_toda

#### Merging one-day and multi-day rows

In [74]:
# Remove original multi-day rows and add the new split rows
df_planned_production = df_planned_production.drop(multi_day_production.index).reset_index(drop=True)
df_planned_production = pd.concat([df_planned_production, df_multi_day_split], ignore_index=True)

print(f"Multi-day production split applied. Final rows: {len(df_planned_production)}")


Multi-day production split applied. Final rows: 297


#### Final saving to planned production

In [75]:
planned_production_path = os.path.join(preprocessed_path, "planned_production.csv")
df_planned_production.to_csv(planned_production_path, index=False)
print(f"Processed Planned Production data saved to {planned_production_path}")


Processed Planned Production data saved to ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/planned_production.csv


#### Verification of multi-day splits

In [76]:
# Initial and final row count comparison
print(f"Initial Rows Before Split: {len(multi_day_production)}")
print(f"Final Rows After Split: {len(df_planned_production)}")


Initial Rows Before Split: 101
Final Rows After Split: 297


#### Verifying start inflow < end outflow

In [77]:
# Check if start_inflow_ts is always before end_outflow_ts
invalid_timestamps = df_planned_production[df_planned_production["start_inflow_ts"] >= df_planned_production["end_outflow_ts"]]

if not invalid_timestamps.empty:
    print("Found incorrect timestamps after splitting:")
    display(invalid_timestamps)
else:
    print("All timestamps are correctly ordered.")


All timestamps are correctly ordered.


#### Quantity mismatches before and after splits

In [78]:
import numpy as np

# Ensure rounding happens at every calculation step
df_planned_production["quantity"] = df_planned_production["quantity"].round(4)

# Compute the total quantity before and after processing
original_qty = multi_day_production.groupby(["material_sk", "plant_sk"])["quantity"].sum()
split_qty = df_planned_production.groupby(["material_sk", "plant_sk"])["quantity"].sum()

# Compare differences and allow small floating-point errors
quantity_mismatch = (original_qty - split_qty).abs()

# Set a small threshold for floating-point precision issues
mismatch_threshold = 1e-4  # Adjust this if needed

# Check if mismatches are significant
significant_mismatches = quantity_mismatch[quantity_mismatch > mismatch_threshold]

if significant_mismatches.sum() > 0:
    print("Quantity mismatch detected after rounding:")
    display(significant_mismatches)
else:
    print("Total quantity is correctly distributed after rounding.")


Quantity mismatch detected after rounding:




material_sk  plant_sk
1667         4766        13000.000000
2130         2566        14000.000000
2580         2566         5000.000000
13443        4766         1700.000000
16787        2566        14000.000000
37306        142             0.000100
41166        73              0.000125
41580        142         20010.000000
42945        4766        10000.000000
47482        1810        30717.000100
49997        482          8129.000000
56863        2566        10000.000000
1879911      142          1043.000000
1892023      142          3651.000000
1895427      2566        10000.000000
1902576      2566        18000.000000
1978413      1810        16802.000000
1978473      1810        14999.999900
Name: quantity, dtype: float64

#### Detecting Overlaps in times 

In [79]:
overlapping_production = df_planned_production.sort_values(["material_sk", "start_inflow_ts"]).duplicated(
    subset=["material_sk", "plant_sk", "start_inflow_ts", "end_outflow_ts"], keep=False
)

if overlapping_production.any():
    print("Overlapping production periods found:")
    display(df_planned_production[overlapping_production])
else:
    print("No overlapping production periods detected.")


No overlapping production periods detected.


#### Transformation based on product unit

In [80]:
# Detect Product Units for Transformation  

# Check unique product_unit values before transformation  
unique_units = df_planned_production["product_unit"].unique()  
print("Unique product_unit values before transformation:", unique_units)  

# Count rows where transformation is needed  
pce_rows = df_planned_production[df_planned_production["product_unit"] == "PCE"]  
pce_count = len(pce_rows)  
pce_percentage = (pce_count / len(df_planned_production)) * 100  

print(f"Rows requiring transformation (PCE to HL): {pce_count} ({pce_percentage:.2f}%)")  


Unique product_unit values before transformation: ['PCE']
Rows requiring transformation (PCE to HL): 297 (100.00%)


#### Converting PCE to HL

In [81]:
# Filter UOM table to retain only HL conversion factors
df_uom_filtered = df_uom_merged[df_uom_merged["target_unit_of_measure_code"] == "HL"]

# Merge with the filtered UOM table
df_planned_production = df_planned_production.merge(
    df_uom_filtered[["material_sk", "PC_HL_Conversion"]],
    on="material_sk",
    how="left"
)

# Convert PCE to HL using the conversion ratio
df_planned_production["Production_HL"] = df_planned_production.apply(
    lambda row: row["quantity"] * row["PC_HL_Conversion"]
    if row["product_unit"] == "PCE" else row["quantity"],
    axis=1
)

# Drop the original "quantity" column and rename the transformed one
#df_planned_production.drop(columns=["quantity"], inplace=True)
df_planned_production.rename(columns={"Production_HL": "quantity_HL"}, inplace=True)

print("Product unit transformation applied successfully.")


Product unit transformation applied successfully.


#### Verifying quantity after conversion by detecting nulls 

In [82]:
# Step: Detect Null Values in Production_HL
missing_production_hl = df_planned_production["quantity_HL"].isnull().sum()
missing_percentage = (missing_production_hl / len(df_planned_production)) * 100

print(f"Missing Values in quantity: {missing_production_hl} ({missing_percentage:.2f}%)")

# Display affected rows if any exist
if missing_production_hl > 0:
    display(df_planned_production[df_planned_production["quantity_HL"].isnull()])
else:
    print("No Missing Values in Quantity.")


Missing Values in quantity: 0 (0.00%)
No Missing Values in Quantity.


In [83]:
# Drop rows where quantity is null
df_planned_production = df_planned_production.dropna(subset=["quantity_HL"])

# Verify that missing values are removed
missing_production_hl = df_planned_production["quantity_HL"].isnull().sum()
print(f"Missing Values in quantity after dropping: {missing_production_hl}")


Missing Values in quantity after dropping: 0


#### Defining current runtime

In [84]:
from datetime import datetime, timedelta

# Define the reference run time (assuming current time for now)
run_time_naive = datetime.now()

# Display the reference time
print(f"Run Time (Naive): {run_time_naive}")


Run Time (Naive): 2025-04-10 13:06:16.072256


#### Filtering Ongoing or Future Production

In [85]:
# Convert timestamps to datetime format for comparison
df_planned_production["start_inflow_ts"] = pd.to_datetime(df_planned_production["start_inflow_ts"], errors="coerce")
df_planned_production["end_outflow_ts"] = pd.to_datetime(df_planned_production["end_outflow_ts"], errors="coerce")

# Define the filtering conditions
ongoing_production = df_planned_production[
    (df_planned_production["start_inflow_ts"] >= run_time_naive - timedelta(hours=3)) &
    (df_planned_production["end_outflow_ts"] <= (run_time_naive + timedelta(days=1)))
]

# Display results
print(f"Ongoing/Future Production Filter Applied: {len(ongoing_production)} rows retained.")

# Store filtered dataset for ongoing production
ongoing_production_file = os.path.join(preprocessed_path, "planned_production.csv")
ongoing_production.to_csv(ongoing_production_file, index=False)
print(f"Ongoing Production Data Saved to: {ongoing_production_file}")


Ongoing/Future Production Filter Applied: 84 rows retained.
Ongoing Production Data Saved to: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/planned_production.csv


#### Filtering Completed Production

In [86]:
# Define the filtering conditions
completed_production = df_planned_production[
    (df_planned_production["end_outflow_ts"] <= run_time_naive - timedelta(hours=3)) &
    (df_planned_production["start_inflow_ts"] >= run_time_naive.replace(hour=0, minute=0, second=0, microsecond=0))
]

# Display results
print(f"Completed Production Filter Applied: {len(completed_production)} rows retained.")

# Store filtered dataset for completed production
completed_production_file = os.path.join(preprocessed_path, "completed_production.csv")
completed_production.to_csv(completed_production_file, index=False)
print(f"Completed Production Data Saved to: {completed_production_file}")


Completed Production Filter Applied: 39 rows retained.
Completed Production Data Saved to: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/completed_production.csv


### Input Actual Production Preprocessing

In [87]:
# Define dataset name
file_name = "input_Actual_production.csv"
df_actual_production = dataframes[file_name]

# Retain only relevant columns
required_columns_actual_production = [
    "plant_sk", "plant_code", "material_sk", "material_code", "actual_quantity"
]
df_actual_production = df_actual_production[required_columns_actual_production]

# Store updated DataFrame
dataframes[file_name] = df_actual_production

# Display confirmation
print(f"Loaded and retained required columns for {file_name}:")
print(list(df_actual_production.columns))


Loaded and retained required columns for input_Actual_production.csv:
['plant_sk', 'plant_code', 'material_sk', 'material_code', 'actual_quantity']


In [88]:
df_actual_production.describe()

Unnamed: 0,plant_sk,plant_code,material_sk,material_code,actual_quantity
count,66,66,66,66,66.0
unique,12,12,61,61,65.0
top,2566,DE02,1814,5687,5520.0
freq,13,13,2,2,2.0


#### Missing Values

In [89]:
# Calculate missing values
missing_values = df_actual_production.isnull().sum()
total_rows = len(df_actual_production)

# Calculate percentage of missing values
missing_percentage = (missing_values / total_rows) * 100

# Combine both counts and percentages
missing_summary = pd.DataFrame({
    "Missing Values": missing_values,
    "Percentage (%)": missing_percentage
})

# Display results
if missing_values.any():
    print("Missing Values Found:")
    display(missing_summary[missing_summary["Missing Values"] > 0])
else:
    print(f"No Missing Values Found in input_Actual_production.csv. (Total Rows: {total_rows})")


No Missing Values Found in input_Actual_production.csv. (Total Rows: 66)


#### Duplicates

In [90]:
#  Detecting Duplicates in input_Actual_production.csv

# Find duplicates based on plant_sk and material_sk
duplicate_rows = df_actual_production.duplicated(subset=["plant_sk", "material_sk"], keep=False)

# Count duplicates
num_duplicates = duplicate_rows.sum()
duplicate_percentage = (num_duplicates / len(df_actual_production)) * 100

# Display results
print(f"Duplicate (plant_sk, material_sk) pairs found in {file_name}: {num_duplicates} ({duplicate_percentage:.2f}%)")

# Show duplicate entries if found
if num_duplicates > 0:
    display(df_actual_production[duplicate_rows].sort_values(["plant_sk", "material_sk"]))
else:
    print("No duplicates found based on plant_sk and material_sk.")


Duplicate (plant_sk, material_sk) pairs found in input_Actual_production.csv: 8 (12.12%)


Unnamed: 0,plant_sk,plant_code,material_sk,material_code,actual_quantity
46,2566,DE02,1869181,107861,6402.0
51,2566,DE02,1869181,107861,2436.0
58,2566,DE02,1895427,104114,140.0
61,2566,DE02,1895427,104114,7910.0
63,2566,DE02,1902576,105356,189.0
65,2566,DE02,1902576,105356,8568.0
14,2566,DE02,2130,5691,5280.0
24,2566,DE02,2130,5691,1200.0


#### Handling duplicate values by summing up the actual quantity

In [91]:
# Handling Duplicates in input_Actual_production.csv

# Group by plant_sk and material_sk and sum actual_quantity
df_actual_production = df_actual_production.groupby(["plant_sk", "material_sk"], as_index=False).agg({
    "plant_code": "first", 
    "material_code": "first",  
    "actual_quantity": "sum"  
})

# Store the cleaned data
dataframes["input_Actual_production.csv"] = df_actual_production

# Display result
print(f"Duplicates handled. Remaining rows in input_Actual_production.csv: {len(df_actual_production)}")
display(df_actual_production.head())


Duplicates handled. Remaining rows in input_Actual_production.csv: 62




Unnamed: 0,plant_sk,material_sk,plant_code,material_code,actual_quantity
0,142,1868373,BE04,108083,10320.0
1,142,1877515,BE04,107699,12600.0
2,142,1879911,BE04,107281,16005.0
3,142,1880479,BE04,107911,3016.0
4,142,1885428,BE04,106369,28812.0


#### Data type validation

In [92]:
# Data Type Validation for input_Actual_production.csv

# Define expected data types
expected_dtypes = {
    "plant_sk": "Int64",
    "plant_code": "object",
    "material_sk": "Int64",
    "material_code": "Int64",
    "actual_quantity": "float64"
}

# Check actual dtypes
dtype_mismatches = {}

for col, expected_dtype in expected_dtypes.items():
    actual_dtype = df_actual_production[col].dtype
    if actual_dtype != expected_dtype:
        dtype_mismatches[col] = (actual_dtype, expected_dtype)

# Display results
if dtype_mismatches:
    print("Data Type Mismatches Found:")
    for col, (actual, expected) in dtype_mismatches.items():
        print(f"   - Column: {col}, Found: {actual}, Expected: {expected}")
else:
    print("All Columns Have Correct Data Types.")


Data Type Mismatches Found:
   - Column: plant_sk, Found: object, Expected: Int64
   - Column: material_sk, Found: object, Expected: Int64
   - Column: material_code, Found: object, Expected: Int64
   - Column: actual_quantity, Found: object, Expected: float64


#### Fixed data type validation errors

In [93]:
# Handling Data Type Mismatches for input_Actual_production.csv

# Convert data types
df_actual_production["plant_sk"] = pd.to_numeric(df_actual_production["plant_sk"], errors="coerce")
df_actual_production["material_sk"] = pd.to_numeric(df_actual_production["material_sk"], errors="coerce")
df_actual_production["material_code"] = pd.to_numeric(df_actual_production["material_code"], errors="coerce")
df_actual_production["actual_quantity"] = pd.to_numeric(df_actual_production["actual_quantity"], errors="coerce")

# Store updated DataFrame
dataframes["input_Actual_production.csv"] = df_actual_production

print("Data Types Fixed.")


Data Types Fixed.


#### Non-Negative validation

In [94]:
#  Detect Negative Values in actual_quantity

# Check for negative values in actual_quantity
negative_values = df_actual_production[df_actual_production["actual_quantity"] < 0]

# Display results
if not negative_values.empty:
    print(f"Negative Values Found in actual_quantity: {len(negative_values)} ({(len(negative_values) / len(df_actual_production)) * 100:.2f}%)")
    display(negative_values)
else:
    print("No Negative Values Found in actual_quantity.")


No Negative Values Found in actual_quantity.


#### Verifying same material production in same plant for multiple times

In [95]:
# Checking if the same material is produced multiple times in the same plant
duplicate_production = df_actual_production.groupby(['plant_sk', 'material_sk']).size().reset_index(name='count')
duplicate_production = duplicate_production[duplicate_production['count'] > 1]

if not duplicate_production.empty:
    print("Materials produced multiple times in the same plant:")
    display(duplicate_production)
else:
    print("No duplicate material production found in the same plant.")


No duplicate material production found in the same plant.


#### Converting PC to HL using UOM PC_HL_conversion

In [96]:
# Merge actual production with UOM data to get PC_HL_Conversion
df_actual_production = df_actual_production.merge(
    df_uom_merged[["material_sk", "PC_HL_Conversion"]],
    on="material_sk",
    how="left"
)

# Check for missing conversion ratios
missing_conversion = df_actual_production["PC_HL_Conversion"].isna().sum()
missing_percentage = (missing_conversion / len(df_actual_production)) * 100

print(f"Missing PC_HL_Conversion Ratios: {missing_conversion} rows ({missing_percentage:.2f}%)")


Missing PC_HL_Conversion Ratios: 3 rows (2.48%)


In [97]:
# Convert actual production to HL
df_actual_production["actual_quantity_HL"] = df_actual_production["actual_quantity"] * df_actual_production["PC_HL_Conversion"]

# Drop the original actual_quantity column (optional)
df_actual_production.drop(columns=["actual_quantity"], inplace=True)

# Rename the new column
df_actual_production.rename(columns={"actual_quantity_HL": "Production_HL"}, inplace=True)

print("Production quantity successfully converted to HL.")


Production quantity successfully converted to HL.


In [98]:
# Display rows with missing PC_HL_Conversion
missing_conversion_rows = df_actual_production[df_actual_production["PC_HL_Conversion"].isna()]
display(missing_conversion_rows)


Unnamed: 0,plant_sk,material_sk,plant_code,material_code,PC_HL_Conversion,Production_HL
24,16369,16135682,CI01,9950043,,
25,16417,16135729,CI02,9950900,,
26,16417,5579437,CI02,9950903,,


In [99]:
# Remove rows with missing conversion ratios
df_actual_production = df_actual_production.dropna(subset=["PC_HL_Conversion"]).reset_index(drop=True)

print(f"Removed {missing_conversion} rows with missing conversion ratios. Remaining rows: {len(df_actual_production)}")


Removed 3 rows with missing conversion ratios. Remaining rows: 118


In [100]:
# Load completed_production.csv
completed_production_path = os.path.join(preprocessed_path, "completed_production.csv")
df_completed_production = pd.read_csv(completed_production_path)

# Concatenate actual and completed production
df_combined_production = pd.concat([df_actual_production, df_completed_production], ignore_index=True)

print(f"DataFrames concatenated. Total rows before deduplication: {len(df_combined_production)}")


DataFrames concatenated. Total rows before deduplication: 157


In [101]:
# Detect duplicate production entries
duplicate_rows = df_combined_production.duplicated(
    subset=["plant_sk", "plant_code", "material_sk", "material_code"], keep=False
)

# Count duplicates
duplicate_count = duplicate_rows.sum()
duplicate_percentage = (duplicate_count / len(df_combined_production)) * 100

print(f"Found {duplicate_count} duplicate rows ({duplicate_percentage:.2f}%).")


Found 147 duplicate rows (93.63%).


In [102]:
# Drop duplicates, keeping the first occurrence
df_combined_production = df_combined_production.drop_duplicates(
    subset=["plant_sk", "plant_code", "material_sk", "material_code"], keep="first"
).reset_index(drop=True)

print(f"Duplicates removed. Remaining rows: {len(df_combined_production)}")


Duplicates removed. Remaining rows: 69


#### Filtering to required columns 

In [103]:
# Define the required columns
required_columns_production = ["plant_sk", "plant_code", "material_sk", "material_code", "Production_HL"]

# Subset the DataFrame
df_actual_production = df_combined_production[required_columns_production]

# Display confirmation
print(f"Retained required columns for actual_production.csv:")
print(list(df_actual_production.columns))


Retained required columns for actual_production.csv:
['plant_sk', 'plant_code', 'material_sk', 'material_code', 'Production_HL']


#### Dropping Null values 

In [104]:
# Detect missing values before dropping
missing_before = df_actual_production.isnull().sum().sum()
total_rows_before = len(df_actual_production)
missing_percentage = (missing_before / (total_rows_before * len(df_actual_production.columns))) * 100

print(f"Missing values before dropping: {missing_before} ({missing_percentage:.2f}%)")

# Drop rows where any of the selected columns have null values
df_actual_production = df_actual_production.dropna(how="any").reset_index(drop=True)

# Detect missing values after dropping
missing_after = df_actual_production.isnull().sum().sum()
total_rows_after = len(df_actual_production)
remaining_percentage = (total_rows_after / total_rows_before) * 100

print(f"Missing values after dropping: {missing_after} (0.00%)")
print(f"Rows remaining after drop: {total_rows_after} ({remaining_percentage:.2f}% of original rows retained)")


Missing values before dropping: 14 (4.06%)
Missing values after dropping: 0 (0.00%)
Rows remaining after drop: 55 (79.71% of original rows retained)


#### Final saving preprocessed data to actual production

In [105]:
# Define save path
actual_production_path = os.path.join(preprocessed_path, "actual_production.csv")

# Save to CSV
df_actual_production.to_csv(actual_production_path, index=False)

print(f"Processed Actual Production Data Saved to: {actual_production_path}")


Processed Actual Production Data Saved to: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/actual_production.csv




### Input Planned Loads Preprocessing

In [106]:
# Define dataset name
file_name = "input_Planned_loads.csv"
df_planned_loads = dataframes[file_name]
required_columns_planned_loads = [
    "delivery_sk", "actual_quantity", "unit_of_measure_code", "plant_code",
    "transportation_planning_priority", "material_sk", "material_code", "load_id",
    "origin_slot_arrival", "origin_slot_departure", "destination_slot_arrival",
    "destination_slot_departure", "RFRC_NUM12", "origin", "destination", "movement_type"
]
df_planned_loads = df_planned_loads[required_columns_planned_loads]

# Store updated DataFrame
dataframes[file_name] = df_planned_loads
print(f"Loaded and retained required columns for {file_name}:")
print(list(df_planned_loads.columns))

# Show first few rows
df_planned_loads.head()


Loaded and retained required columns for input_Planned_loads.csv:
['delivery_sk', 'actual_quantity', 'unit_of_measure_code', 'plant_code', 'transportation_planning_priority', 'material_sk', 'material_code', 'load_id', 'origin_slot_arrival', 'origin_slot_departure', 'destination_slot_arrival', 'destination_slot_departure', 'RFRC_NUM12', 'origin', 'destination', 'movement_type']


Unnamed: 0,delivery_sk,actual_quantity,unit_of_measure_code,plant_code,transportation_planning_priority,material_sk,material_code,load_id,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,RFRC_NUM12,origin,destination,movement_type
0,81728721,4680.0,PC,GB01,1,1977870,99150,34718714,2025-04-10 08:00:00,2025-04-10 08:30:00,,,21820530,GB01,29585614,SO
1,81649724,26.0,PC,GB67,1,1869080,108741,34714811,2025-04-10 16:00:00,2025-04-10 17:00:00,,,21806479,GB67,29586301,SO
2,81697215,450.0,PC,GB02,1,1978473,99148,34716589,2025-04-10 15:00:00,2025-04-10 15:30:00,,,21816790,GB02,29586299,SO
3,81687441,195.0,PC,GB01,0,1877163,105485,34716430,2025-04-10 17:00:00,2025-04-10 17:30:00,,,21809360,GB01,29586017,SO
4,81608824,8.0,PC,GB02,12,10093,15519,34711607,2025-04-11 20:00:00,2025-04-11 20:30:00,2025-04-12 03:00:00,2025-04-12 03:30:00,4508593686,GB02,GB01,STO


#### Missing values 

In [107]:
# Detect missing values in each column
missing_values = df_planned_loads.isnull().sum()
missing_percentage = (missing_values / len(df_planned_loads)) * 100

# Display missing values where count > 0
missing_df = missing_values[missing_values > 0].to_frame(name="Missing Count")
missing_df["Percentage"] = missing_percentage[missing_values > 0]

if not missing_df.empty:
    print("Missing Values Detected:")
    display(missing_df)
else:
    print("No Missing Values Found in input_Planned_loads.csv.")


Missing Values Detected:


Unnamed: 0,Missing Count,Percentage
destination_slot_arrival,2679,92.698962
destination_slot_departure,2679,92.698962
RFRC_NUM12,185,6.401384


#### Data type validation

In [108]:
# Define expected data types
expected_dtypes = {
    "delivery_sk": "int64",
    "actual_quantity": "float64",
    "unit_of_measure_code": "object",
    "plant_code": "object",
    "transportation_planning_priority": "int64",
    "material_sk": "int64",
    "material_code": "int64",
    "load_id": "int64",
    "origin_slot_arrival": "datetime64[ns]",
    "origin_slot_departure": "datetime64[ns]",
    "destination_slot_arrival": "datetime64[ns]",
    "destination_slot_departure": "datetime64[ns]",
    "RFRC_NUM12": "float64",
    "origin": "object",
    "destination": "object",
    "movement_type": "object"
}

# Identify mismatches
mismatched_types = {
    col: df_planned_loads[col].dtype
    for col in expected_dtypes
    if df_planned_loads[col].dtype != expected_dtypes[col]
}

# Display results
if mismatched_types:
    print("Data Type Mismatches Found:")
    for col, dtype in mismatched_types.items():
        print(f"   - Column: {col}, Found: {dtype}, Expected: {expected_dtypes[col]}")
else:
    print("All Columns Have Correct Data Types.")


Data Type Mismatches Found:
   - Column: delivery_sk, Found: object, Expected: int64
   - Column: actual_quantity, Found: object, Expected: float64
   - Column: transportation_planning_priority, Found: object, Expected: int64
   - Column: material_sk, Found: object, Expected: int64
   - Column: material_code, Found: object, Expected: int64
   - Column: load_id, Found: object, Expected: int64
   - Column: origin_slot_arrival, Found: object, Expected: datetime64[ns]
   - Column: origin_slot_departure, Found: object, Expected: datetime64[ns]
   - Column: destination_slot_arrival, Found: object, Expected: datetime64[ns]
   - Column: destination_slot_departure, Found: object, Expected: datetime64[ns]
   - Column: RFRC_NUM12, Found: object, Expected: float64


#### Fixing data type issues 

In [109]:
# Fix Data Type Mismatches
df_planned_loads["delivery_sk"] = pd.to_numeric(df_planned_loads["delivery_sk"], errors="coerce")
df_planned_loads["actual_quantity"] = pd.to_numeric(df_planned_loads["actual_quantity"], errors="coerce")
df_planned_loads["transportation_planning_priority"] = pd.to_numeric(df_planned_loads["transportation_planning_priority"], errors="coerce")
df_planned_loads["material_sk"] = pd.to_numeric(df_planned_loads["material_sk"], errors="coerce")
df_planned_loads["material_code"] = pd.to_numeric(df_planned_loads["material_code"], errors="coerce")
df_planned_loads["load_id"] = pd.to_numeric(df_planned_loads["load_id"], errors="coerce")
df_planned_loads["RFRC_NUM12"] = pd.to_numeric(df_planned_loads["RFRC_NUM12"], errors="coerce")

# Convert datetime columns
datetime_cols = ["origin_slot_arrival", "origin_slot_departure", "destination_slot_arrival", "destination_slot_departure"]
for col in datetime_cols:
    df_planned_loads[col] = pd.to_datetime(df_planned_loads[col], errors="coerce")

print("Data Types Fixed.")


Data Types Fixed.


#### Duplicates

In [110]:
# Detect duplicate rows based on key columns
duplicate_rows = df_planned_loads.duplicated(
    subset=["RFRC_NUM12", "load_id", "movement_type", "transportation_planning_priority",
    "origin", "destination", "origin_slot_arrival", "origin_slot_departure",
    "destination_slot_arrival", "destination_slot_departure", "material_sk",
    "material_code", "unit_of_measure_code"],
    keep=False
)

duplicate_count = duplicate_rows.sum()
duplicate_percentage = (duplicate_count / len(df_planned_loads)) * 100

print(f"Duplicate Rows Found: {duplicate_count} ({duplicate_percentage:.2f}%)")

# Display a few duplicate rows for analysis
if duplicate_count > 0:
    display(df_planned_loads[duplicate_rows].head(10))
else:
    print("No duplicate rows found.")


Duplicate Rows Found: 303 (10.48%)


Unnamed: 0,delivery_sk,actual_quantity,unit_of_measure_code,plant_code,transportation_planning_priority,material_sk,material_code,load_id,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,RFRC_NUM12,origin,destination,movement_type
45,81701229,104.0,PC,GB02,1,28437,62651,34716845,2025-04-10 05:00:00,2025-04-10 05:30:00,NaT,NaT,21818005.0,GB02,29753937,SO
50,81692444,810.0,PC,GB01,1,51692,71219,34716459,2025-04-10 05:00:00,2025-04-10 05:30:00,NaT,NaT,21817158.0,GB01,29586298,SO
70,81710348,96.0,PC,GB01,1,68572,92223,34718004,2025-04-10 03:00:00,2025-04-10 03:30:00,NaT,NaT,21819290.0,GB01,40105060,SO
93,81668168,300.0,PC,GB02,1,1875675,102772,34715044,2025-04-10 06:00:00,2025-04-10 06:30:00,NaT,NaT,21813671.0,GB02,29585616,SO
98,81710499,120.0,PC,GB01,1,12111,15614,34718009,2025-04-10 05:00:00,2025-04-10 05:30:00,NaT,NaT,21819492.0,GB01,40105060,SO
105,81701229,1.0,PC,GB02,1,28437,62651,34716845,2025-04-10 05:00:00,2025-04-10 05:30:00,NaT,NaT,21818005.0,GB02,29753937,SO
110,81692444,450.0,PC,GB01,1,47482,71242,34716459,2025-04-10 05:00:00,2025-04-10 05:30:00,NaT,NaT,21817158.0,GB01,29586298,SO
117,81697118,210.0,PC,GB02,1,28437,62651,34716591,2025-04-10 03:00:00,2025-04-10 03:30:00,NaT,NaT,21817451.0,GB02,29585416,SO
137,81700882,1680.0,PC,GB28,1,73796,93980,34716737,2025-04-10 05:00:00,2025-04-10 06:00:00,NaT,NaT,21818415.0,GB28,29585508,SO
153,81668168,150.0,PC,GB02,1,1875675,102772,34715044,2025-04-10 06:00:00,2025-04-10 06:30:00,NaT,NaT,21813671.0,GB02,29585616,SO


#### Aggregation using actual quantity

In [111]:
# Aggregate sum of actual quantity based on specified columns
aggregation_columns = [
    "RFRC_NUM12", "load_id", "movement_type", "transportation_planning_priority",
    "origin", "destination", "origin_slot_arrival", "origin_slot_departure",
    "destination_slot_arrival", "destination_slot_departure", "material_sk",
    "material_code","plant_code","unit_of_measure_code"
]

df_planned_loads = df_planned_loads.groupby(aggregation_columns, as_index=False).agg({
    "actual_quantity": "sum"
})

# Display confirmation
print(f"Aggregation completed. Remaining rows: {len(df_planned_loads)}")
display(df_planned_loads.head(10))


Aggregation completed. Remaining rows: 195


Unnamed: 0,RFRC_NUM12,load_id,movement_type,transportation_planning_priority,origin,destination,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,material_sk,material_code,plant_code,unit_of_measure_code,actual_quantity
0,4508588000.0,34707610,STO,12,GB02,GB01,2025-04-10 22:30:00,2025-04-10 23:00:00,2025-04-11 03:00:00,2025-04-11 03:30:00,1009912,51399758,GB02,KG,736.0
1,4508593000.0,34709785,STO,13,GB28,CH01,2025-04-11 08:00:00,2025-04-11 09:00:00,2025-04-15 09:00:00,2025-04-15 10:00:00,1873371,107643,GB28,PC,2340.0
2,4508593000.0,34709786,STO,13,GB28,CH01,2025-04-10 11:00:00,2025-04-10 12:00:00,2025-04-14 07:00:00,2025-04-14 08:00:00,1873371,107643,GB28,PC,2340.0
3,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,36746,49799,GB01,PC,72.0
4,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,1907157,107750,GB01,PC,1320.0
5,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,1983159,99214,GB01,PC,1368.0
6,4508594000.0,34710214,STO,11,GB02,GB67,2025-04-10 12:00:00,2025-04-10 12:30:00,2025-04-10 14:00:00,2025-04-10 15:00:00,49997,62275,GB02,PC,432.0
7,4508594000.0,34710214,STO,11,GB02,GB67,2025-04-10 12:00:00,2025-04-10 12:30:00,2025-04-10 14:00:00,2025-04-10 15:00:00,1876455,102785,GB02,PC,1170.0
8,4508594000.0,34710214,STO,11,GB02,GB67,2025-04-10 12:00:00,2025-04-10 12:30:00,2025-04-10 14:00:00,2025-04-10 15:00:00,1895214,107437,GB02,PC,1000.0
9,4508594000.0,34710104,STO,12,GB01,GB02,2025-04-10 23:00:00,2025-04-10 23:30:00,2025-04-11 04:30:00,2025-04-11 05:00:00,36746,49799,GB01,PC,1512.0


#### Filtering based on 2 days from run date

In [112]:
from datetime import datetime, timedelta

# Define the run date (assuming today's date)
run_date = datetime.today()
# replacing that to midnight so that we wont miss till current time 
start_date = run_date.replace(hour=0, minute=0, second=0, microsecond=0)
end_date = start_date + timedelta(days=2)

# Apply filtering
df_planned_loads = df_planned_loads[
    (df_planned_loads["origin_slot_arrival"] >= start_date) &
    (df_planned_loads["origin_slot_arrival"] < end_date)
]

# Display results
print(f"Filter Applied: Kept {len(df_planned_loads)} rows out of {len(df_planned_loads)}.")
display(df_planned_loads.head(10))


Filter Applied: Kept 195 rows out of 195.


Unnamed: 0,RFRC_NUM12,load_id,movement_type,transportation_planning_priority,origin,destination,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,material_sk,material_code,plant_code,unit_of_measure_code,actual_quantity
0,4508588000.0,34707610,STO,12,GB02,GB01,2025-04-10 22:30:00,2025-04-10 23:00:00,2025-04-11 03:00:00,2025-04-11 03:30:00,1009912,51399758,GB02,KG,736.0
1,4508593000.0,34709785,STO,13,GB28,CH01,2025-04-11 08:00:00,2025-04-11 09:00:00,2025-04-15 09:00:00,2025-04-15 10:00:00,1873371,107643,GB28,PC,2340.0
2,4508593000.0,34709786,STO,13,GB28,CH01,2025-04-10 11:00:00,2025-04-10 12:00:00,2025-04-14 07:00:00,2025-04-14 08:00:00,1873371,107643,GB28,PC,2340.0
3,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,36746,49799,GB01,PC,72.0
4,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,1907157,107750,GB01,PC,1320.0
5,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,1983159,99214,GB01,PC,1368.0
6,4508594000.0,34710214,STO,11,GB02,GB67,2025-04-10 12:00:00,2025-04-10 12:30:00,2025-04-10 14:00:00,2025-04-10 15:00:00,49997,62275,GB02,PC,432.0
7,4508594000.0,34710214,STO,11,GB02,GB67,2025-04-10 12:00:00,2025-04-10 12:30:00,2025-04-10 14:00:00,2025-04-10 15:00:00,1876455,102785,GB02,PC,1170.0
8,4508594000.0,34710214,STO,11,GB02,GB67,2025-04-10 12:00:00,2025-04-10 12:30:00,2025-04-10 14:00:00,2025-04-10 15:00:00,1895214,107437,GB02,PC,1000.0
9,4508594000.0,34710104,STO,12,GB01,GB02,2025-04-10 23:00:00,2025-04-10 23:30:00,2025-04-11 04:30:00,2025-04-11 05:00:00,36746,49799,GB01,PC,1512.0


#### Unique unit measure code 

In [113]:
# Check unique unit_of_measure_code values
unique_uoms = df_planned_loads["unit_of_measure_code"].unique()

print("Unique unit_of_measure_code values before transformation:")
print(unique_uoms)


Unique unit_of_measure_code values before transformation:
['KG' 'PC']


#### Conversion of PC to HL

In [114]:
# Filter UOM table to retain only HL conversion factors
df_uom_filtered = df_uom_merged[df_uom_merged["target_unit_of_measure_code"] == "HL"]

# Merge with planned loads data
df_planned_loads = df_planned_loads.merge(
    df_uom_filtered[["material_sk", "PC_HL_Conversion"]],
    on="material_sk",
    how="left"
)

# Convert PC to HL only for unit_of_measure_code = "PC"
df_planned_loads["quantity_HL"] = df_planned_loads.apply(
    lambda row: row["actual_quantity"] * row["PC_HL_Conversion"]
    if row["unit_of_measure_code"] == "PC" else row["actual_quantity"],
    axis=1
)
# Drop the original "quantity" column and rename the transformed one
df_planned_loads.drop(columns=["actual_quantity"], inplace=True)
df_planned_loads.rename(columns={"quantity_HL": "total_quantity_HL"}, inplace=True)

print("Planned loads quantity successfully converted to HL.")


Planned loads quantity successfully converted to HL.


#### Filter based on valid actual quantity

In [115]:
initial_count = len(df_planned_loads)

# Apply the filter
df_planned_loads = df_planned_loads[df_planned_loads["total_quantity_HL"] > 1].reset_index(drop=True)
final_count = len(df_planned_loads)
retained_percentage = (final_count / initial_count) * 100
print(f"Filter Applied: Kept {final_count} rows ({retained_percentage:.2f}%) out of {initial_count}.")


Filter Applied: Kept 163 rows (83.59%) out of 195.


#### Drop rows where total quantity is null

In [116]:
missing_before = df_planned_loads["total_quantity_HL"].isnull().sum()
initial_count = len(df_planned_loads)

# Drop rows where `actual_quantity_HL` is null
df_planned_loads = df_planned_loads.dropna(subset=["total_quantity_HL"]).reset_index(drop=True)

# Detect missing values after dropping
missing_after = df_planned_loads["total_quantity_HL"].isnull().sum()
final_count = len(df_planned_loads)
retained_percentage = (final_count / initial_count) * 100
print(f"Missing values in `total_quantity_HL` before drop: {missing_before}")
print(f"Missing values after drop: {missing_after}")
print(f"Rows remaining after drop: {final_count} ({retained_percentage:.2f}% of original).")


Missing values in `total_quantity_HL` before drop: 0
Missing values after drop: 0
Rows remaining after drop: 163 (100.00% of original).


In [117]:
unique_uoms = df_planned_loads["movement_type"].unique()

print("Unique unit_of_measure_code values before transformation:")
print(unique_uoms)

Unique unit_of_measure_code values before transformation:
['STO']


#### Final saving preprocessing data to Planned loads 

In [118]:
planned_loads_path = os.path.join(preprocessed_path, "planned_loads.csv")

# Save the DataFrame
df_planned_loads.to_csv(planned_loads_path, index=False)

print(f"Processed Planned Loads Data Saved to: {planned_loads_path}")


Processed Planned Loads Data Saved to: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/planned_loads.csv


#### Filtering for SO and Export movement types 

In [119]:
# Define dataset name
file_name = "input_Planned_loads.csv"

# Load the original dataset again
df_so_export_loads = dataframes[file_name]

# Define required columns
required_columns_so_export = [
    "delivery_sk", "actual_quantity", "unit_of_measure_code", "plant_code",
    "transportation_planning_priority", "material_sk", "material_code", "load_id",
    "origin_slot_arrival", "origin_slot_departure", "destination_slot_arrival",
    "destination_slot_departure", "RFRC_NUM12", "origin", "destination", "movement_type"
]

# Retain only required columns
df_so_export_loads = df_so_export_loads[required_columns_so_export]

# Apply Filter: Keep only "SO" & "Export"
df_so_export_loads = df_so_export_loads[df_so_export_loads["movement_type"].isin(["SO", "Export"])]

# Store updated DataFrame
dataframes[file_name] = df_so_export_loads
print(f"Filtered movement_type (SO & Export) in {file_name}:")
print(df_so_export_loads["movement_type"].unique())  # Verify unique values after filtering

# Show first few rows
df_so_export_loads.head()


Filtered movement_type (SO & Export) in input_Planned_loads.csv:
['SO' 'Export']


Unnamed: 0,delivery_sk,actual_quantity,unit_of_measure_code,plant_code,transportation_planning_priority,material_sk,material_code,load_id,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,RFRC_NUM12,origin,destination,movement_type
0,81728721,4680.0,PC,GB01,1,1977870,99150,34718714,2025-04-10 08:00:00,2025-04-10 08:30:00,NaT,NaT,21820530.0,GB01,29585614,SO
1,81649724,26.0,PC,GB67,1,1869080,108741,34714811,2025-04-10 16:00:00,2025-04-10 17:00:00,NaT,NaT,21806479.0,GB67,29586301,SO
2,81697215,450.0,PC,GB02,1,1978473,99148,34716589,2025-04-10 15:00:00,2025-04-10 15:30:00,NaT,NaT,21816790.0,GB02,29586299,SO
3,81687441,195.0,PC,GB01,0,1877163,105485,34716430,2025-04-10 17:00:00,2025-04-10 17:30:00,NaT,NaT,21809360.0,GB01,29586017,SO
5,81649740,26.0,PC,GB67,1,1869080,108741,34714812,2025-04-10 09:00:00,2025-04-10 10:00:00,NaT,NaT,21806566.0,GB67,29585964,SO


#### Aggregation

In [120]:
# Perform aggregation based on delivery date (origin_slot_arrival)
df_open_so_out = df_so_export_loads.groupby(
    ["origin_slot_arrival", "plant_code", "material_sk", "material_code","unit_of_measure_code"],
    as_index=False
).agg(
    {"actual_quantity": "sum"}
)

# Rename aggregated column
df_open_so_out.rename(columns={"actual_quantity": "open_so_out(HL)"}, inplace=True)

# Display results
print(f"Aggregation completed. Remaining rows: {len(df_open_so_out)}")
df_open_so_out.head()


Aggregation completed. Remaining rows: 2127


Unnamed: 0,origin_slot_arrival,plant_code,material_sk,material_code,unit_of_measure_code,open_so_out(HL)
0,2025-04-10,GB01,4333,11001,PC,4.0
1,2025-04-10,GB01,9054,15665,PC,26.0
2,2025-04-10,GB01,10093,15519,PC,128.0
3,2025-04-10,GB01,10383,15463,PC,53.0
4,2025-04-10,GB01,12111,15614,PC,64.0


#### Transformation of PC to HL

In [121]:
# Filter UOM table to retain only HL conversion factors
df_uom_filtered = df_uom_merged[df_uom_merged["target_unit_of_measure_code"] == "HL"]

# Merge with planned loads data
df_open_so_out = df_open_so_out.merge(
    df_uom_filtered[["material_sk","PC_HL_Conversion"]],
    on="material_sk",
    how="left"
)

# Convert PC to HL only for unit_of_measure_code = "PC"
df_open_so_out["total_quantity_HL"] = df_open_so_out.apply(
    lambda row: row["open_so_out(HL)"] * row["PC_HL_Conversion"]
    if row["unit_of_measure_code"] == "PC" else row["open_so_out(HL)"],
    axis=1
)
# Drop the original "quantity" column and rename the transformed one
df_open_so_out.drop(columns=["open_so_out(HL)"], inplace=True)
df_open_so_out.rename(columns={"total_quantity_HL": "open_so_out(HL)"}, inplace=True)

print("Planned loads quantity successfully converted to HL.")


Planned loads quantity successfully converted to HL.


#### Filter based on valid quantity

In [122]:
initial_count = len(df_open_so_out)

# Apply the filter
df_open_so_out = df_open_so_out[df_open_so_out["open_so_out(HL)"] > 1].reset_index(drop=True)
final_count = len(df_open_so_out)
retained_percentage = (final_count / initial_count) * 100
print(f"Filter Applied: Kept {final_count} rows ({retained_percentage:.2f}%) out of {initial_count}.")


Filter Applied: Kept 1736 rows (81.62%) out of 2127.


#### Filtering missing values 

In [123]:
missing_before = df_open_so_out["open_so_out(HL)"].isnull().sum()
initial_count = len(df_open_so_out)

# Drop rows where `actual_quantity_HL` is null
df_open_so_out = df_open_so_out.dropna(subset=["open_so_out(HL)"]).reset_index(drop=True)

# Detect missing values after dropping
missing_after = df_open_so_out["open_so_out(HL)"].isnull().sum()
final_count = len(df_open_so_out)
retained_percentage = (final_count / initial_count) * 100
print(f"Missing values in `open_so_out(HL)` before drop: {missing_before}")
print(f"Missing values after drop: {missing_after}")
print(f"Rows remaining after drop: {final_count} ({retained_percentage:.2f}% of original).")


Missing values in `open_so_out(HL)` before drop: 0
Missing values after drop: 0
Rows remaining after drop: 1736 (100.00% of original).


#### Final saving of open so 

In [124]:
df_open_so_out_path = os.path.join(preprocessed_path, "open_so.csv")

# Save the DataFrame
df_open_so_out.to_csv(df_open_so_out_path, index=False)

print(f"Processed SO & Export Data Saved to: {df_open_so_out_path}")


Processed SO & Export Data Saved to: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/open_so.csv


#### Filter STO loads only

In [125]:
# Apply filter: Keep only rows where movement_type = 'STO'
df_sto_loads = df_planned_loads[df_planned_loads["movement_type"] == "STO"].reset_index(drop=True)

# Display result
print(f"Filter applied: Kept {len(df_sto_loads)} rows with movement_type = 'STO' out of {len(df_planned_loads)}.")
df_sto_loads.head()


Filter applied: Kept 163 rows with movement_type = 'STO' out of 163.


Unnamed: 0,RFRC_NUM12,load_id,movement_type,transportation_planning_priority,origin,destination,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,material_sk,material_code,plant_code,unit_of_measure_code,PC_HL_Conversion,total_quantity_HL
0,4508588000.0,34707610,STO,12,GB02,GB01,2025-04-10 22:30:00,2025-04-10 23:00:00,2025-04-11 03:00:00,2025-04-11 03:30:00,1009912,51399758,GB02,KG,,736.0
1,4508593000.0,34709785,STO,13,GB28,CH01,2025-04-11 08:00:00,2025-04-11 09:00:00,2025-04-15 09:00:00,2025-04-15 10:00:00,1873371,107643,GB28,PC,0.0594,138.996
2,4508593000.0,34709786,STO,13,GB28,CH01,2025-04-10 11:00:00,2025-04-10 12:00:00,2025-04-14 07:00:00,2025-04-14 08:00:00,1873371,107643,GB28,PC,0.0594,138.996
3,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,36746,49799,GB01,PC,0.0792,5.7024
4,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,1907157,107750,GB01,PC,0.0528,69.696


#### Merging STO with uom for getting pal conversions  

In [126]:
# Extract PC_PAL_Conversion & pal_weight_kg where target_unit_of_measure_code is "PAL"
df_uom_pal = df_uom_merged[df_uom_merged["target_unit_of_measure_code"] == "PAL"][
    ["material_sk", "PC_HL_Conversion", "pal_weight_kg"]
].rename(columns={"PC_HL_Conversion": "PC_PAL_Conversion"})  # Rename to PC_PAL_Conversion

# Merge STO loads with df_uom_pal to include PC_PAL_Conversion & pal_weight_kg
df_sto_merged = df_sto_loads.merge(
    df_uom_pal,
    on="material_sk",
    how="left"
)

# Display results
print(f"Merge completed. Final row count: {len(df_sto_merged)}")
df_sto_merged.head()


Merge completed. Final row count: 163




Unnamed: 0,RFRC_NUM12,load_id,movement_type,transportation_planning_priority,origin,destination,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,material_sk,material_code,plant_code,unit_of_measure_code,PC_HL_Conversion,total_quantity_HL,PC_PAL_Conversion,pal_weight_kg
0,4508588000.0,34707610,STO,12,GB02,GB01,2025-04-10 22:30:00,2025-04-10 23:00:00,2025-04-11 03:00:00,2025-04-11 03:30:00,1009912,51399758,GB02,KG,,736.0,,
1,4508593000.0,34709785,STO,13,GB28,CH01,2025-04-11 08:00:00,2025-04-11 09:00:00,2025-04-15 09:00:00,2025-04-15 10:00:00,1873371,107643,GB28,PC,0.0594,138.996,0.011111,950.5
2,4508593000.0,34709786,STO,13,GB28,CH01,2025-04-10 11:00:00,2025-04-10 12:00:00,2025-04-14 07:00:00,2025-04-14 08:00:00,1873371,107643,GB28,PC,0.0594,138.996,0.011111,950.5
3,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,36746,49799,GB01,PC,0.0792,5.7024,0.013889,1026.64
4,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,1907157,107750,GB01,PC,0.0528,69.696,0.004545,1292.56


#### Calculating total quantity PAL

In [127]:
# Compute Total Quantity PAL
df_sto_merged["total_quantity_PAL"] = (df_sto_merged["total_quantity_HL"] / df_sto_merged["PC_HL_Conversion"]) * df_sto_merged["PC_PAL_Conversion"]

# Display results
print("Transformation Completed: Total Quantity PAL Calculated")
df_sto_merged.head()


Transformation Completed: Total Quantity PAL Calculated


Unnamed: 0,RFRC_NUM12,load_id,movement_type,transportation_planning_priority,origin,destination,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,material_sk,material_code,plant_code,unit_of_measure_code,PC_HL_Conversion,total_quantity_HL,PC_PAL_Conversion,pal_weight_kg,total_quantity_PAL
0,4508588000.0,34707610,STO,12,GB02,GB01,2025-04-10 22:30:00,2025-04-10 23:00:00,2025-04-11 03:00:00,2025-04-11 03:30:00,1009912,51399758,GB02,KG,,736.0,,,
1,4508593000.0,34709785,STO,13,GB28,CH01,2025-04-11 08:00:00,2025-04-11 09:00:00,2025-04-15 09:00:00,2025-04-15 10:00:00,1873371,107643,GB28,PC,0.0594,138.996,0.011111,950.5,26.0
2,4508593000.0,34709786,STO,13,GB28,CH01,2025-04-10 11:00:00,2025-04-10 12:00:00,2025-04-14 07:00:00,2025-04-14 08:00:00,1873371,107643,GB28,PC,0.0594,138.996,0.011111,950.5,26.0
3,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,36746,49799,GB01,PC,0.0792,5.7024,0.013889,1026.64,1.0
4,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,1907157,107750,GB01,PC,0.0528,69.696,0.004545,1292.56,6.0


#### Calculating Total Quantity KG

In [128]:
# Compute Total Quantity KG
df_sto_merged["total_quantity_KG"] = df_sto_merged["total_quantity_PAL"] * df_sto_merged["pal_weight_kg"]

# Display results
print("Transformation Completed: Total Quantity KG Calculated")
df_sto_merged.head()


Transformation Completed: Total Quantity KG Calculated


Unnamed: 0,RFRC_NUM12,load_id,movement_type,transportation_planning_priority,origin,destination,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,material_sk,material_code,plant_code,unit_of_measure_code,PC_HL_Conversion,total_quantity_HL,PC_PAL_Conversion,pal_weight_kg,total_quantity_PAL,total_quantity_KG
0,4508588000.0,34707610,STO,12,GB02,GB01,2025-04-10 22:30:00,2025-04-10 23:00:00,2025-04-11 03:00:00,2025-04-11 03:30:00,1009912,51399758,GB02,KG,,736.0,,,,
1,4508593000.0,34709785,STO,13,GB28,CH01,2025-04-11 08:00:00,2025-04-11 09:00:00,2025-04-15 09:00:00,2025-04-15 10:00:00,1873371,107643,GB28,PC,0.0594,138.996,0.011111,950.5,26.0,24713.0
2,4508593000.0,34709786,STO,13,GB28,CH01,2025-04-10 11:00:00,2025-04-10 12:00:00,2025-04-14 07:00:00,2025-04-14 08:00:00,1873371,107643,GB28,PC,0.0594,138.996,0.011111,950.5,26.0,24713.0
3,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,36746,49799,GB01,PC,0.0792,5.7024,0.013889,1026.64,1.0,1026.64
4,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,1907157,107750,GB01,PC,0.0528,69.696,0.004545,1292.56,6.0,7755.36


#### Drop Null values based on total quantity pal

In [129]:
# Detect missing values before dropping
missing_before = df_sto_merged["total_quantity_PAL"].isnull().sum()
total_rows_before = len(df_sto_merged)

# Drop rows where `total_quantity_PAL` is null
df_sto_merged = df_sto_merged.dropna(subset=["total_quantity_PAL"]).reset_index(drop=True)

# Detect missing values after dropping
missing_after = df_sto_merged["total_quantity_PAL"].isnull().sum()
total_rows_after = len(df_sto_merged)

# Compute statistics
rows_dropped = missing_before
drop_percentage = (rows_dropped / total_rows_before) * 100

# Display results
print(f"Missing values in `total_quantity_PAL` before drop: {missing_before}")
print(f"Missing values after drop: {missing_after}")
print(f"Rows remaining after drop: {total_rows_after} ({100 - drop_percentage:.2f}% retained)")
print(f"Rows dropped: {rows_dropped} ({drop_percentage:.2f}%)")


Missing values in `total_quantity_PAL` before drop: 1
Missing values after drop: 0
Rows remaining after drop: 162 (99.39% retained)
Rows dropped: 1 (0.61%)


#### Final saving to Open STO

In [130]:
# Define save path
open_sto_path = os.path.join(preprocessed_path, "open_sto.csv")

# Save the DataFrame
df_sto_merged.to_csv(open_sto_path, index=False)

print(f"Processed STO Data Saved to: {open_sto_path}")


Processed STO Data Saved to: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/open_sto.csv


#### Aggregation

In [131]:
# Perform aggregation on the available columns
df_aggregated = df_sto_merged.groupby(
    [
        "RFRC_NUM12", "load_id", "movement_type", "transportation_planning_priority",
        "origin", "destination", "origin_slot_arrival", "origin_slot_departure",
        "destination_slot_arrival", "destination_slot_departure", "material_sk", "material_code", "plant_code"
    ],
    as_index=False
).agg(
    {
        "total_quantity_HL": "sum",
        "total_quantity_PAL": "sum",
        "total_quantity_KG": "sum"
    }
)

# Display results
print(f"Aggregation completed. Remaining rows: {len(df_aggregated)}")
df_aggregated.head()


Aggregation completed. Remaining rows: 162




Unnamed: 0,RFRC_NUM12,load_id,movement_type,transportation_planning_priority,origin,destination,origin_slot_arrival,origin_slot_departure,destination_slot_arrival,destination_slot_departure,material_sk,material_code,plant_code,total_quantity_HL,total_quantity_PAL,total_quantity_KG
0,4508593000.0,34709785,STO,13,GB28,CH01,2025-04-11 08:00:00,2025-04-11 09:00:00,2025-04-15 09:00:00,2025-04-15 10:00:00,1873371,107643,GB28,138.996,26.0,24713.0
1,4508593000.0,34709786,STO,13,GB28,CH01,2025-04-10 11:00:00,2025-04-10 12:00:00,2025-04-14 07:00:00,2025-04-14 08:00:00,1873371,107643,GB28,138.996,26.0,24713.0
2,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,36746,49799,GB01,5.7024,1.0,1026.64
3,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,1907157,107750,GB01,69.696,6.0,7755.36
4,4508594000.0,34710108,STO,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00,2025-04-11 22:00:00,2025-04-11 22:30:00,1983159,99214,GB01,108.3456,19.0,19273.6


#### Final saving aggregated df to load detials

In [132]:
# Define save path
load_details_path = os.path.join(preprocessed_path, "load_details.csv")

# Save to CSV
df_aggregated.to_csv(load_details_path, index=False)

print(f"Processed Load Details Data Saved to: {load_details_path}")


Processed Load Details Data Saved to: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/load_details.csv


#### Save a column subset to outbound loads

In [133]:
# Define required columns for outbound loads
required_columns = [
    "RFRC_NUM12", "load_id", "movement_type", "plant_code",
    "transportation_planning_priority", "origin", "destination",
    "origin_slot_arrival", "origin_slot_departure"
]

# Subset the DataFrame
df_outbound_loads = df_aggregated[required_columns].copy()

# Define save path
outbound_loads_path = os.path.join(preprocessed_path, "outbound_loads.csv")

# Save to CSV
df_outbound_loads.to_csv(outbound_loads_path, index=False)

# Display results
print(f"Column subset applied and saved to: {outbound_loads_path}")
df_outbound_loads.head()


Column subset applied and saved to: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/outbound_loads.csv


Unnamed: 0,RFRC_NUM12,load_id,movement_type,plant_code,transportation_planning_priority,origin,destination,origin_slot_arrival,origin_slot_departure
0,4508593000.0,34709785,STO,GB28,13,GB28,CH01,2025-04-11 08:00:00,2025-04-11 09:00:00
1,4508593000.0,34709786,STO,GB28,13,GB28,CH01,2025-04-10 11:00:00,2025-04-10 12:00:00
2,4508594000.0,34710108,STO,GB01,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00
3,4508594000.0,34710108,STO,GB01,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00
4,4508594000.0,34710108,STO,GB01,12,GB01,GB02,2025-04-11 17:00:00,2025-04-11 17:30:00


#### Save a column subset to inbound loads

In [134]:
# Define required columns for outbound loads
required_columns = [
    "RFRC_NUM12", "load_id", "movement_type", "plant_code",
    "transportation_planning_priority", "origin", "destination",
    "destination_slot_arrival", "destination_slot_departure"
]

# Subset the DataFrame
df_inbound_loads = df_aggregated[required_columns].copy()

# Define save path
inbound_loads_path = os.path.join(preprocessed_path, "inbound_loads.csv")

# Save to CSV
df_inbound_loads.to_csv(inbound_loads_path, index=False)

# Display results
print(f"Column subset applied and saved to: {inbound_loads_path}")
df_inbound_loads.head()


Column subset applied and saved to: ./PRE_PROCESSED_DATA/04. April 2025/10_04_2025 preprocessed data/2025_04_10_13_03_07 preprocessed data/inbound_loads.csv


Unnamed: 0,RFRC_NUM12,load_id,movement_type,plant_code,transportation_planning_priority,origin,destination,destination_slot_arrival,destination_slot_departure
0,4508593000.0,34709785,STO,GB28,13,GB28,CH01,2025-04-15 09:00:00,2025-04-15 10:00:00
1,4508593000.0,34709786,STO,GB28,13,GB28,CH01,2025-04-14 07:00:00,2025-04-14 08:00:00
2,4508594000.0,34710108,STO,GB01,12,GB01,GB02,2025-04-11 22:00:00,2025-04-11 22:30:00
3,4508594000.0,34710108,STO,GB01,12,GB01,GB02,2025-04-11 22:00:00,2025-04-11 22:30:00
4,4508594000.0,34710108,STO,GB01,12,GB01,GB02,2025-04-11 22:00:00,2025-04-11 22:30:00
