In [1]:
# Run in a terminal first: pip install "ibis-framework[duckdb]" pyjanitor
import pandas as pd
import ibis
from ibis import selectors as s
from ibis import _
# Interactive mode: per the ibis docs, displaying an expression (e.g. a bare
# table name as the last line of a cell) executes it and shows the result
# instead of the unevaluated expression.
ibis.options.interactive = True

In [2]:
# Project locations: inputs are read from <path>/data/SAP YGL4 and results
# are written to <path>/output.
from pathlib import Path

path = Path("/home/uiv17345/datasets/home-dataset/jupyterlab/PnL")
data_path = path.joinpath("data", "SAP YGL4")
output_path = path.joinpath("output")

In [3]:
# Display the resolved input directory as a quick sanity check of the path setup.
data_path

PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL4')

### Input data: List of multiple text files

In [4]:
# Input data: every regular file with a ".dat" suffix directly inside data_path.
# Path.iterdir() yields entries in arbitrary (filesystem-dependent) order, so
# the list is sorted to keep the file order -- and therefore the row order of
# the combined table built from it -- identical between runs and machines.
dat_files = sorted(
    entry
    for entry in data_path.iterdir()
    if entry.is_file() and entry.suffix == ".dat"
)
dat_files

[PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL4/EBIT.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL4/FX trading.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL4/Sales.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL4/Variable Costs over STD.dat')]

In [5]:
def read_multiple_files(list_of_files):
    """Read several tab-separated files and stack them into one DataFrame.

    Parameters
    ----------
    list_of_files : iterable of path-like
        Files to read. Any iterable is accepted (list, tuple, generator) and
        entries may be ``pathlib.Path`` objects or plain strings.

    Returns
    -------
    pandas.DataFrame
        The rows of all files in input order. A ``source`` column holding the
        file name without its extension is placed first; the remaining columns
        keep the order produced by the concatenation. The index is a fresh
        0..n-1 range.

    Raises
    ------
    ValueError
        If ``list_of_files`` contains no entries.
    """
    # Materialise once: supports generators and lets string paths expose .stem.
    files = [Path(file) for file in list_of_files]
    if not files:
        raise ValueError("read_multiple_files: no input files were given")

    # Tag every frame with the name of the file it came from.
    frames = [
        pd.read_csv(file, sep="\t").assign(source=file.stem)
        for file in files
    ]

    # Each file's rows are numbered from 0; without ignore_index the combined
    # frame would carry duplicate index labels.
    combined = pd.concat(frames, ignore_index=True)

    # Move "source" to the front, leaving the other columns in place.
    other_columns = [col for col in combined.columns if col != "source"]
    return combined[["source"] + other_columns]

In [6]:
def change_column_names(df):
    """Return ``df`` with the raw headers replaced by short names.

    ``"OneGL B/S + P/L"`` becomes ``"OneGL"`` and the two-digit columns
    ``"01"`` .. ``"12"`` become the month abbreviations ``"Jan"`` .. ``"Dec"``.
    Columns that are not listed (including ``"source"``) keep their names.
    """
    month_names = (
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    )

    new_names = {"OneGL B/S + P/L": "OneGL"}
    # Zero-padded position ("01", "02", ...) -> month abbreviation.
    for number, month in enumerate(month_names, start=1):
        new_names[f"{number:02d}"] = month

    return df.rename(columns=new_names)

In [7]:
# Load every .dat file into one frame, then shorten the raw column headers.
df = read_multiple_files(dat_files).pipe(change_column_names)
df.head(3)

Unnamed: 0,source,OneGL,0-2,1-2,CF,Jan,Feb,Mar,Apr,May,...,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45
0,EBIT,50802-018 P SC Mechatronic Sensors Sejong,208503808,208503808,0.0,-25508605.0,234012400.0,0.0,0.0,0.0,...,,,,,,,,,,
1,EBIT,50803-003 E CT Engine Controls Icheon,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,EBIT,50803-004 E Central Icheon NPF,1654517315,1654517315,0.0,-137988495.0,1792506000.0,0.0,0.0,0.0,...,,,,,,,,,,


### Data wrangling

In [8]:
# Wrap the pandas frame as an ibis table and keep only the identifier
# columns plus the twelve monthly value columns.
t = ibis.memtable(df, name="t").select(
    "source", "OneGL",
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)

# Strip a ".dat" suffix from the source label, if one is present.
t = t.mutate(source=_.source.replace(".dat", ""))

# PrCr: first run of 8-9 digits/hyphens found in OneGL (capture group 1).
t = t.mutate(PrCr=_.OneGL.re_extract(r"([0-9\-]{8,9})", 1))

# Divide every numeric column by -1,000,000: flips the sign and rescales
# the values to millions.
t = t.mutate(s.across(s.numeric(), _ / -(10**6)))

# Keep only rows where a PrCr code was extracted (used instead of .dropna("PrCr")).
t = t.filter(_.PrCr != "")

# ibis rename maps new name -> old name: "source" becomes "Items".
t = t.rename({"Items": "source"})
t

In [9]:
# Export the wrangled table as CSV.
# to_csv does not create missing directories, so make sure the target exists
# (no-op when it is already there).
output_path.mkdir(parents=True, exist_ok=True)

# NOTE(review): this rebinds `df` to the wrangled result, so the cell that
# builds `t` from `df` presumably cannot be re-run afterwards without first
# re-running the loading cell -- confirm before relying on partial re-runs.
df = t.to_pandas()
df.to_csv(output_path / "SAP YGL4 P&L.csv", index=False)