In [None]:
import pandas as pd
import os

# Directory scanned for input workbooks, and the file name reserved for the
# merged result (that name is excluded when the directory is scanned).
data_folder = 'data/'
output_file = 'mergedportail.xlsx'

# Columns retained from every workbook; all other columns are not loaded.
columns_to_keep = [
    "order_oc_id",
    "customer_id",
    "project_id",
    "country_of_delivery",
    "supply_center_id",
    "order_type",
    "direct_delivery",
    "warehouse_id",
    "country_of_origin",
    "order_priority_type",
    "order_id",
    "order_line_id",
    "product_id",
    "version_id",
    "product_name",
    "product_type",
    "product_group",
    "product_family",
    "product_cat",
    "product_batch_no",
    "shelf_life",
    "expiry_date",
    "packing_id",
    "packing_line_id",
    "shipment_id",
    "transaction_type",
    "dispatch_address",
    "product_assortment",
    "unit_price",
    "unit_price_assortment",
    "quantity_ordered",
    "quantity_packed",
    "quantity_received",
    "quantity_invoiced",
    "invoice_id",
    "invoice_date",
    "volume",
    "weight",
    "unit_volume",
    "unit_weight",
    "confirmed_amount",
    "invoiced_amount",
    "actual_delivery_date",
    "accounting_code",
    "order_description",
    "order_completion",
    "Total_LeadTime",
    "order_weight_kg",
    "order_volume_dm3",
    "order_volume_m3",
    "price_orderline",
    "unique_order_code",
    "unique_shipment_code",
    "unique_backorder_code",
]

# One DataFrame per source workbook is collected here before merging.
df_list = []

# os.listdir() returns entries in arbitrary order; sorting the listing keeps
# the row order of the merged result reproducible from run to run.
for file in sorted(os.listdir(data_folder)):
    # Only .xlsx workbooks are read, and the merged output is skipped in case
    # an earlier run saved it into this same folder.
    if not file.endswith('.xlsx') or file == output_file:
        continue
    # Excel leaves "~$<name>.xlsx" lock files next to workbooks that are
    # currently open; they are not readable workbooks and would make
    # read_excel fail, so they are ignored.
    if file.startswith('~$'):
        continue

    file_path = os.path.join(data_folder, file)
    print(f"Reading: {file_path}")
    # usecols limits loading to the wanted columns (pandas raises ValueError
    # if a workbook lacks one of them). The engine is pinned explicitly so the
    # parser choice does not depend on pandas' auto-detection.
    df = pd.read_excel(file_path, usecols=columns_to_keep, engine='openpyxl')
    df_list.append(df)

# Stack the per-file frames into a single table with a fresh running index.
if not df_list:
    # Nothing was read: fall back to an empty frame so merged_df always exists.
    merged_df = pd.DataFrame()
    print("No data found to merge.")
else:
    merged_df = pd.concat(df_list, ignore_index=True)
    print(f"Merged DataFrame shape: {merged_df.shape}")

# Optional: uncomment to write the merged table into the data folder.
# merged_df.to_excel(os.path.join(data_folder, output_file), index=False)

# Preview of the first rows (shown as the cell result when run in a notebook).
merged_df.head()



Reading: data/Cleaned_Portail_Part_1.xlsx
Reading: data/Cleaned_Portail_Part_2.xlsx
Reading: data/Cleaned_Portail_Part_3.xlsx
Reading: data/Cleaned_Portail_Part_4.xlsx
Reading: data/Cleaned_Portail_Part_5.xlsx
Reading: data/Cleaned_Portail_Part_6.xlsx
Reading: data/Cleaned_Portail_Part_7.xlsx
Merged DataFrame shape: (262636, 54)
