In [1]:
# In Terminal, "pip install ibis-framework[duckdb] pyjanitor"
import pandas as pd
import ibis
from ibis import selectors as s
from ibis import _
# Switch ibis to interactive mode for this notebook session.
ibis.options.interactive = True

In [2]:
# Paths
# Project root. expanduser() replaces the leading "~" with the real home
# directory: pathlib never does that on its own, so without it the paths only
# resolve for consumers that happen to expand "~" themselves.
from pathlib import Path
path = Path("~/datasets/home-dataset/jupyterlab/ZX05").expanduser()
data_path = path / "data"  # monthly input extracts (*.dat)
db_path = path / "db"  # yearly CSV files the results are appended to

In [3]:
# Year / Month of the reporting period being loaded.
# Both are kept as strings here; ibis_wrangling() casts them to integers when it
# writes them into the fy / period columns.
year = "2024"
month = "02"  # Monthly to be updated

In [4]:
# Input files
# The names are built from `year` / `month`, so the period is set in exactly one
# place and the file that is read always matches the fy / period values that
# ibis_wrangling() writes onto its rows.
input_file_cf = data_path / f"CF_{year}_{month}.dat"
input_file_pl = data_path / f"PL_{year}_{month}.dat"

In [5]:
# DB files
# One CSV per year and data set; the name follows `year` so the target file
# cannot drift away from the period that is being loaded.
db_file_cf = db_path / f"CF_{year}.csv"
db_file_pl = db_path / f"PL_{year}.csv"

## Add data to Database

In [6]:
def read_txt_file(path):
    """Load the first four columns of a tab-separated export and shorten their names.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-delimited text file; its first line is the header.

    Returns
    -------
    pandas.DataFrame
        The four leading columns, with the headers "Cost center", "Act",
        "Plan" and "Tgt" renamed to "text_col", "actual", "plan" and
        "target". A header outside that mapping keeps its original name.
    """
    new_names = {
        "Cost center": "text_col",
        "Act": "actual",
        "Plan": "plan",
        "Tgt": "target",
    }
    raw = pd.read_csv(path, sep="\t", usecols=[0, 1, 2, 3])
    return raw.rename(columns=new_names)

In [7]:
# Extract Cost center and GL accounts using regex
def extract_text(df):
    """Return a copy of ``df`` with ``costctr`` and ``gl_accounts`` columns added.

    Both columns are parsed from the start of ``text_col``:

    * ``costctr``: 4-5 digits, or an ``IC-`` / ``CY-`` prefix followed by 4-5
      characters, or ``DUMMY_`` followed by 3 characters. The match is
      stripped of surrounding whitespace. Rows without a match are
      back-filled, i.e. they take the next match found further down the
      frame; rows after the last match stay missing.
    * ``gl_accounts``: ``K`` or ``S`` followed by digits; missing where the
      text does not start that way.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``text_col``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding the input columns plus ``costctr`` and ``gl_accounts``.
    """
    costctr_pattern = r"(^[0-9]{4,5}|^IC-.{4,5}|^CY-.{4,5}|^DUMMY_.{3})"
    gl_pattern = r"(^K[0-9]+|^S[0-9]+)"

    text = df["text_col"]
    # expand=False gives a Series per pattern, ready to be used as a column.
    costctr = text.str.extract(costctr_pattern, expand=False).str.strip()
    gl_accounts = text.str.extract(gl_pattern, expand=False)

    # Back-fill: a row without its own cost center inherits the next one below.
    return df.assign(costctr=costctr.bfill(), gl_accounts=gl_accounts)

In [8]:
def ibis_wrangling(df):
    """Add the reporting period to an ibis table and drop rows that carry no data.

    Parameters
    ----------
    df : ibis table expression
        Must provide the columns ``costctr``, ``gl_accounts``, ``actual``,
        ``plan`` and ``target``.

    Returns
    -------
    ibis table expression
        Columns ``fy, period, costctr, gl_accounts, actual, plan, target``,
        restricted to rows that have a GL account and where actual, plan and
        target are not all zero.

    Notes
    -----
    The period comes from the module-level ``year`` and ``month`` variables,
    which must be defined before this function is called.
    """
    return (
        df
        # add the period columns (still strings at this point)
        .mutate(fy=year, period=month)
        # change data type from string to integer
        .mutate(fy=_.fy.cast("int"), period=_.period.cast("int"))
        # reorder columns
        .select("fy", "period", "costctr", "gl_accounts", "actual", "plan", "target")
        # Keep only rows with a GL account. This has to be an explicit null
        # test: comparing with NULL (`!= ibis.NA`) yields NULL instead of True
        # under SQL semantics, which can silently filter out every row.
        .filter(_.gl_accounts.notnull())
        # filter out rows with Actual, Plan and Target all equal to 0
        .filter(~((_.actual == 0) & (_.plan == 0) & (_.target == 0)))
    )

In [9]:
# Central functions (CF): load the monthly CF extract and preview the first rows
df = read_txt_file(input_file_cf)
df.head(3)

Unnamed: 0,text_col,actual,plan,target
0,K082 Catering material,20060,0,0
1,K30001 Salary,22030000,21880000,21880000
2,K30002 Overtime salaries,238100,748516,748516


In [10]:
# Add the costctr / gl_accounts columns to the CF rows, then preview the result
df = df.pipe(extract_text)
df.head(3)

Unnamed: 0,text_col,actual,plan,target,costctr,gl_accounts
0,K082 Catering material,20060,0,0,4110,K082
1,K30001 Salary,22030000,21880000,21880000,4110,K30001
2,K30002 Overtime salaries,238100,748516,748516,4110,K30002


In [11]:
# Hand the CF frame to ibis and run the shared wrangling steps on it
cf = ibis_wrangling(ibis.memtable(df))
cf.head(3)

In [12]:
# Product lines (PL): load the monthly PL extract and preview the first rows.
# Note that this reuses the name `df`, replacing the CF frame from the cells above.
df = read_txt_file(input_file_pl)
df.head(3)

Unnamed: 0,text_col,actual,plan,target
0,K66270 Allocation Material Management,27915074,27584485,27584485
1,K66271 Allocation Production,190384362,188129602,188129602
2,K66273 Allocation S&D: Selling,117722855,116328686,116328686


In [13]:
# Add the costctr / gl_accounts columns to the PL rows, then preview the result
df = df.pipe(extract_text)
df.head(3)

Unnamed: 0,text_col,actual,plan,target,costctr,gl_accounts
0,K66270 Allocation Material Management,27915074,27584485,27584485,IC-99LD,K66270
1,K66271 Allocation Production,190384362,188129602,188129602,IC-99LD,K66271
2,K66273 Allocation S&D: Selling,117722855,116328686,116328686,IC-99LD,K66273


In [14]:
# Hand the PL frame to ibis and run the shared wrangling steps on it
pl = ibis_wrangling(ibis.memtable(df))
pl.head(3)

In [15]:
# Output data
def append_period(table, db_file):
    """Append ``table`` to ``db_file`` unless this year/period is already stored.

    The rows are appended without a header, as before. Because the write is an
    append, running this cell twice would store the same month twice; the check
    below makes a re-run a no-op instead.

    Parameters
    ----------
    table : ibis table expression
        Result of ibis_wrangling(); its first two columns are fy and period.
    db_file : str or path-like
        CSV file to append to. It is created if it does not exist yet.
    """
    try:
        # Only the two leading columns (fy, period) are needed for the check.
        # They are read as text so that an optional header line does not break parsing.
        stored = pd.read_csv(db_file, header=None, usecols=[0, 1], dtype=str)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        stored = None  # nothing stored yet, so the append below starts the file

    if stored is not None:
        # Non-numeric cells (e.g. a header line) become NaN and never match.
        fy_stored = pd.to_numeric(stored[0], errors="coerce")
        period_stored = pd.to_numeric(stored[1], errors="coerce")
        if ((fy_stored == int(year)) & (period_stored == int(month))).any():
            print(
                f"{db_file}: year {year} / period {month} is already stored - "
                "nothing appended. Remove those rows first to reload the month."
            )
            return

    table.to_pandas().to_csv(db_file, mode="a", header=False, index=False)


append_period(cf, db_file_cf)
append_period(pl, db_file_pl)