In [1]:
# Prerequisites (run in a terminal): pip install "ibis-framework[duckdb]" pyjanitor
import pandas as pd
from janitor import clean_names

In [2]:
# Project directories.
# Path() keeps "~" as a literal path component; expanduser() resolves it to
# the real home directory so these paths also work with pathlib/os calls
# (exists(), mkdir(), open(), ...), not only with pandas I/O functions,
# which expand "~" themselves.
from pathlib import Path

path = Path("~/datasets/home-dataset/jupyterlab/ZMPV").expanduser()
data_path = path / "data"  # input text exports
db_path = path / "db"  # CSV output written by this notebook

In [3]:
# Text export to load in this run.
# NOTE(review): the preview rows further down show period 2025.001 while this
# file name says 2024_01 -- confirm that this is the intended file.
input_file = data_path.joinpath("ZMPV_2024_01.txt")

In [4]:
# CSV file that the cleaned rows are appended to at the end of the notebook
db_file = db_path.joinpath("ZMPV_2024.csv")

In [5]:
def read_txt_file(path):
    """Load a tab-delimited, UTF-16LE encoded text export into a DataFrame.

    The first six lines of the file are skipped, spaces following a
    delimiter are ignored, and "," is treated as the thousands separator.
    The columns named in ``text_columns`` are read as strings; every other
    column keeps the dtype pandas infers.

    The parser engine is left at its default: the original author noted that
    ``engine='python'`` raised an error with this input.

    Parameters
    ----------
    path : str or path-like
        Location of the text file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # Columns that must be read as text rather than as numbers.
    text_columns = (
        "M/Y (from-",
        "SAP Plant",
        "Outlet",
        "Vendor",
        "Trading Pr",
        "Accounts f",
        "Document d",
    )
    read_options = dict(
        sep="\t",
        skiprows=6,
        encoding="UTF-16LE",
        skipinitialspace=True,
        thousands=",",
        dtype=dict.fromkeys(text_columns, str),
    )
    return pd.read_csv(path, **read_options)

In [6]:
def remove_col_row(df):
    """Return a copy of ``df`` without the blank leading columns and filler rows.

    Drops the columns "Unnamed: 0" and "Unnamed: 1", then drops every row
    whose "Profit Cen" value is missing (per the original author's note,
    those are the sub-total rows). The input frame is not modified and the
    surviving rows keep their original index labels.

    Raises
    ------
    KeyError
        If either "Unnamed" column or "Profit Cen" is absent from ``df``.
    """
    leading_blank_cols = ["Unnamed: 0", "Unnamed: 1"]
    without_blank_cols = df.drop(columns=leading_blank_cols)
    return without_blank_cols.dropna(subset="Profit Cen")

In [7]:
def rename_cols(df):
    """Return ``df`` with the position-suffixed column names replaced.

    Columns that are not listed below are left untouched, and listed names
    that do not occur in ``df`` are silently ignored by ``DataFrame.rename``.

    NOTE(review): pandas names duplicated headers "X.1", "X.2", ... when
    reading a file, so these "...N" keys may never match what read_csv
    produces -- confirm against the real header of the input file.
    """
    old_to_new = (
        ("Net PPV...23", "Net PPV"),
        ("Net PM_PPV...24", "Net PPV ratio"),
        ("STD Other...39", "STD Other"),
        ("STD Other...40", "STD Other 2"),
    )
    return df.rename(columns=dict(old_to_new))

In [8]:
# Load the export and preview its first three rows
df = read_txt_file(path=input_file)
df.head(3)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,M/Y (from-,Outs./IC,SAP Plant,Profit Cen,Outlet,Vendor,Vendor Name,Vendor Cou,...,PPV accoun,IC.Elimin.,Subcontrac,Actual Cus,Accounts f,Materia Ty,Psegment,Document d,MRP Type,MRP Descrp
0,,,2025.001,IC,180,50803-009,7311,0VCC208000,Vitesco Technologies GmbH,DE,...,K9021,0,0,0,,ROH,208-7311,,X1,"APO:MRP(P1, ROH)"
1,,,2025.001,IC,180,50803-009,7311,0VCC208000,Vitesco Technologies GmbH,DE,...,K90211,-3826252,0,0,,ROH,208-7311,,X1,"APO:MRP(P1, ROH)"
2,,,2025.001,IC,180,50803-009,7311,0VCC208000,Vitesco Technologies GmbH,DE,...,K90211,-7652486,0,0,,ROH,208-7311,,X1,"APO:MRP(P1, ROH)"


In [9]:
# Strip the two blank leading columns and the rows without a "Profit Cen" value.
# Re-running this cell on the already trimmed frame fails with KeyError,
# because drop() no longer finds the "Unnamed" columns.
df = df.pipe(remove_col_row)
df.head(3)

Unnamed: 0,M/Y (from-,Outs./IC,SAP Plant,Profit Cen,Outlet,Vendor,Vendor Name,Vendor Cou,Trading Pr,TradingPrtnDescr,...,PPV accoun,IC.Elimin.,Subcontrac,Actual Cus,Accounts f,Materia Ty,Psegment,Document d,MRP Type,MRP Descrp
0,2025.001,IC,180,50803-009,7311,0VCC208000,Vitesco Technologies GmbH,DE,2080,VITESCO TECHNOLOGIES GMBH,...,K9021,0,0,0,,ROH,208-7311,,X1,"APO:MRP(P1, ROH)"
1,2025.001,IC,180,50803-009,7311,0VCC208000,Vitesco Technologies GmbH,DE,2080,VITESCO TECHNOLOGIES GMBH,...,K90211,-3826252,0,0,,ROH,208-7311,,X1,"APO:MRP(P1, ROH)"
2,2025.001,IC,180,50803-009,7311,0VCC208000,Vitesco Technologies GmbH,DE,2080,VITESCO TECHNOLOGIES GMBH,...,K90211,-7652486,0,0,,ROH,208-7311,,X1,"APO:MRP(P1, ROH)"


In [10]:
# Apply the explicit renames, then let pyjanitor's clean_names normalise the
# headers (the preview below shows lower-case, underscore-separated names).
df = df.pipe(rename_cols).pipe(clean_names)
df.head(3)

Unnamed: 0,m_y_from_,outs_ic,sap_plant,profit_cen,outlet,vendor,vendor_name,vendor_cou,trading_pr,tradingprtndescr,...,ppv_accoun,ic_elimin_,subcontrac,actual_cus,accounts_f,materia_ty,psegment,document_d,mrp_type,mrp_descrp
0,2025.001,IC,180,50803-009,7311,0VCC208000,Vitesco Technologies GmbH,DE,2080,VITESCO TECHNOLOGIES GMBH,...,K9021,0,0,0,,ROH,208-7311,,X1,"APO:MRP(P1, ROH)"
1,2025.001,IC,180,50803-009,7311,0VCC208000,Vitesco Technologies GmbH,DE,2080,VITESCO TECHNOLOGIES GMBH,...,K90211,-3826252,0,0,,ROH,208-7311,,X1,"APO:MRP(P1, ROH)"
2,2025.001,IC,180,50803-009,7311,0VCC208000,Vitesco Technologies GmbH,DE,2080,VITESCO TECHNOLOGIES GMBH,...,K90211,-7652486,0,0,,ROH,208-7311,,X1,"APO:MRP(P1, ROH)"


In [11]:
# Append the cleaned rows to the CSV file.
# The column header is written only when the file is being created (or is
# still empty), so a fresh file is self-describing; appending to an existing
# file adds data rows only, exactly as before.
# NOTE(review): re-running this cell appends the same rows a second time.
db_target = db_file.expanduser()
write_header = not db_target.exists() or db_target.stat().st_size == 0
df.to_csv(db_target, mode="a", header=write_header, index=False)