In [1]:
# In Terminal, "pip install ibis-framework[duckdb] pyjanitor"
import pandas as pd
from janitor import clean_names

In [2]:
# Project directories
from pathlib import Path

# pathlib never expands "~" by itself, so resolve the home directory once here;
# every path derived below is then absolute and usable with any file API
# (exists(), mkdir(), open(), ...), not only with libraries that expand "~".
path = Path("~/datasets/home-dataset/jupyterlab/ZVAR").expanduser()
data_path = path / "data"  # sub-directory holding the raw input files
db_path = path / "db"      # sub-directory holding the accumulated output file

In [3]:
# Raw text export to load in this run (lives in the data directory)
input_file = data_path.joinpath("ZVAR_2023_11.txt")

In [4]:
# CSV file that the cleaned rows get appended to (lives in the db directory)
db_file = db_path.joinpath("ZVAR_2023.csv")

In [5]:
def read_txt_file(path):
    """Load a tab-delimited, UTF-16LE encoded text file into a DataFrame.

    The first 8 lines of the file are skipped, so line 9 supplies the column
    names. Blanks following a delimiter are ignored and "," is treated as a
    thousands separator when parsing numbers.

    Parameters
    ----------
    path : str or path-like
        Location of the text file.

    Returns
    -------
    pandas.DataFrame
        The parsed table. "Order", "Cost Ctr", "Blk-Ind" and "No-Post" are
        read as strings; every other column is left to type inference.
    """
    # Columns forced to text instead of being inferred as numbers
    # (presumably identifier-like codes -- confirm against the export).
    text_columns = ("Order", "Cost Ctr", "Blk-Ind", "No-Post")

    parse_options = {
        "sep": "\t",
        "skiprows": 8,
        "encoding": "UTF-16LE",
        "skipinitialspace": True,
        "thousands": ",",
        "engine": "python",
        "dtype": {column: str for column in text_columns},
    }
    return pd.read_csv(path, **parse_options)

In [6]:
def remove_col_row(df):
    """Strip helper columns and non-data rows from a freshly read frame.

    Three steps, in order:

    1. drop the "Unnamed: 0", "Unnamed: 1" and "Unnamed: 4" columns;
    2. drop rows that have no "Order" value (sub-total rows, according to
       the original author's comment);
    3. drop rows in which every cell equals its own column name, i.e.
       header lines repeated inside the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the three "Unnamed" columns and "Order".

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame; surviving rows keep their index labels.
    """
    unwanted_columns = ["Unnamed: 0", "Unnamed: 1", "Unnamed: 4"]
    trimmed = df.drop(columns=unwanted_columns)
    with_order = trimmed.dropna(subset=["Order"])

    # A row is kept as soon as one of its cells differs from the column label.
    differs_from_header = with_order.ne(with_order.columns)
    keep_row = differs_from_header.any(axis="columns")
    return with_order[keep_row]

In [7]:
# Parse the raw text export, then preview its first three rows
df = read_txt_file(path=input_file)
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Order,Material,Unnamed: 4,Cost Elem.,CE act,COmaterial,BTran,Cost Ctr,...,COAr,Resp. CCtr,Crcy,Object number,Prod.Proc,Le,Vsn,CO subkey,Partner Object,Source Object
0,,,756325,1802CF8T,,K020,,A2C00013954,,,...,5668,,KRW,OR000000756325,100366251,0,0,,,
1,,,756325,1802CF8T,,K020,,A2C81641700,,,...,5668,,KRW,OR000000756325,100366251,0,0,,,
2,,,756325,1802CF8T,,K020,,A2C93346100,,,...,5668,,KRW,OR000000756325,100366251,0,0,,,


In [8]:
# Drop the helper columns and non-data rows, then preview the result.
# NOTE(review): this rebinds `df`, so running the cell a second time without
# re-running the read cell fails (the "Unnamed" columns are already gone).
df = df.pipe(remove_col_row)
df.head(n=3)

Unnamed: 0,Order,Material,Cost Elem.,CE act,COmaterial,BTran,Cost Ctr,ActTyp,D/C,UM,...,COAr,Resp. CCtr,Crcy,Object number,Prod.Proc,Le,Vsn,CO subkey,Partner Object,Source Object
0,756325,1802CF8T,K020,,A2C00013954,,,,D,ST,...,5668,,KRW,OR000000756325,100366251,0,0,,,
1,756325,1802CF8T,K020,,A2C81641700,,,,D,ST,...,5668,,KRW,OR000000756325,100366251,0,0,,,
2,756325,1802CF8T,K020,,A2C93346100,,,,D,ST,...,5668,,KRW,OR000000756325,100366251,0,0,,,


In [9]:
# Normalise the column labels with pyjanitor's clean_names
# (e.g. "Cost Elem." becomes "cost_elem_"), then preview the result
df = df.pipe(clean_names)
df.head(n=3)

Unnamed: 0,order,material,cost_elem_,ce_act,comaterial,btran,cost_ctr,acttyp,d_c,um,...,coar,resp_cctr,crcy,object_number,prod_proc,le,vsn,co_subkey,partner_object,source_object
0,756325,1802CF8T,K020,,A2C00013954,,,,D,ST,...,5668,,KRW,OR000000756325,100366251,0,0,,,
1,756325,1802CF8T,K020,,A2C81641700,,,,D,ST,...,5668,,KRW,OR000000756325,100366251,0,0,,,
2,756325,1802CF8T,K020,,A2C93346100,,,,D,ST,...,5668,,KRW,OR000000756325,100366251,0,0,,,


In [10]:
# Append the cleaned rows to the CSV file: no header line, no index column.
# NOTE(review): append mode means every re-run of this cell adds the same rows
# again, and a newly created file will contain no header row -- confirm that
# whatever reads this file expects both.
append_options = {"mode": "a", "header": False, "index": False}
df.to_csv(db_file, **append_options)