In [1]:
 # In Terminal, "pip install ibis-framework[duckdb] pyjanitor"
import pandas as pd
import ibis
from ibis import selectors as s
from ibis import _
ibis.options.interactive = True

In [2]:
# Project folders
from pathlib import Path

# Root folder of the P&L project; every other location is derived from it.
path = Path("/home/uiv17345/datasets/home-dataset/jupyterlab/PnL")
data_path = path.joinpath("data", "SAP YGL0")  # folder scanned for the .dat input files
meta_path = path.joinpath("meta")  # folder holding Lookup_table.csv
output_path = path.joinpath("output")  # folder the result CSV is written to

### Input data: List of multiple text files

In [3]:
# Input data: every regular file with a ".dat" suffix directly inside data_path.
# Path.iterdir() yields entries in arbitrary order, so the listing is sorted to
# keep the row order of the combined table identical from run to run.
dat_files = sorted(
    file for file in data_path.iterdir() if file.is_file() and file.suffix == ".dat"
)
dat_files

[PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL0/50802-018.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL0/50803-009.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL0/50803-010.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL0/50803-026.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL0/50803-034.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL0/50803-045.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL0/50803-047.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL0/50803-049.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL0/50803-050.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/jupyterlab/PnL/data/SAP YGL0/50803-051.dat'),
 PosixPath('/home/uiv17345/datasets/home-dataset/j

In [4]:
def read_multiple_files(list_of_files):
    """Read several tab-separated files and stack them into one DataFrame.

    Each file is parsed with ``pd.read_csv(..., sep="\\t")`` and tagged with a
    ``source`` column holding the file name without its extension. The frames
    are concatenated in the order given, and ``source`` is moved to the front.

    Parameters
    ----------
    list_of_files : iterable of path-like
        Files to read. Entries may be ``pathlib.Path`` objects or plain
        strings; any iterable (list, tuple, generator) is accepted.

    Returns
    -------
    pandas.DataFrame
        All rows from all files, with ``source`` as the first column and a
        fresh 0..n-1 index (rows from different files no longer share index
        labels).

    Raises
    ------
    ValueError
        If ``list_of_files`` is empty.
    """
    # Materialise once so generators work and every entry exposes ``.stem``.
    file_paths = [Path(file) for file in list_of_files]
    if not file_paths:
        raise ValueError("read_multiple_files() needs at least one file to read")

    # Read each file and tag its rows with the originating file name.
    frames = [
        pd.read_csv(file_path, sep="\t").assign(source=file_path.stem)
        for file_path in file_paths
    ]

    # ignore_index=True renumbers the rows; without it each file's rows would
    # restart at 0 and the combined index would contain duplicates.
    combined = pd.concat(frames, ignore_index=True)

    # Put "source" first and keep the remaining columns in their existing order.
    remaining = [col for col in combined.columns if col != "source"]
    return combined[["source", *remaining]]

In [5]:
def change_column_names(df):
    """Return a copy of ``df`` with shorter column labels.

    ``"OneGL B/S + P/L"`` becomes ``"OneGL"`` and the two-digit month columns
    ``"01"`` .. ``"12"`` become ``"Jan"`` .. ``"Dec"``. Columns that are not in
    the mapping (including ``"source"``) keep their names.
    """
    month_codes = ("01", "02", "03", "04", "05", "06",
                   "07", "08", "09", "10", "11", "12")
    month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    new_names = {"OneGL B/S + P/L": "OneGL"}
    new_names.update(zip(month_codes, month_names))
    return df.rename(columns=new_names)

In [6]:
# Load every .dat file into one frame, then apply the short column names.
df = read_multiple_files(dat_files).pipe(change_column_names)
df.head(3)

Unnamed: 0,source,OneGL,0-2,1-2,CF,Jan,Feb,Mar,Apr,May,...,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45
0,50802-018,A041 Other intangible assets - APC,0,-3344000000,3344000000,-3344000000.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,50802-018,"+,*,-"" SL (M11, partner x) gross",0,-3344000000,3344000000,-3344000000.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,50802-018,A046 Adj. - Other intangible assets,0,2006400000,-2006400000,2006400000.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


### Data wrangling

In [7]:
# Wrap the pandas frame as an ibis table and keep only the label and month columns.
t = (
    ibis.memtable(df, name="t")
    .select(
        "source", "OneGL",
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    )
    .mutate(
        # Strip a literal ".dat" from the source label wherever it appears.
        source=_.source.replace(".dat", ""),
        # Key = first capture group of the pattern applied to the OneGL text:
        # a run of digits, or K/P followed by digits at the start of the string.
        Key=_.OneGL.re_extract(r"([0-9]+|^K[0-9]+|^P[0-9]+)", 1),
    )
    # Divide every numeric column by -1000 (scales the values and flips their sign).
    .mutate(s.across(s.numeric(), _ / -1000))
    # ibis rename mappings read new name -> old name: "source" becomes "PrCr".
    .rename({'PrCr': "source"})
)
t

In [8]:
# Lookup table: read every column as text so the keys are not parsed as numbers.
lookup_df = pd.read_csv(meta_path.joinpath("Lookup_table.csv"), dtype=str)
l = (
    ibis.memtable(lookup_df, name="l")
    # Reduce Key to the first capture group of the pattern (digits, or a
    # leading K/P followed by digits).
    .mutate(Key=_.Key.re_extract(r"([0-9]+|^K[0-9]+|^P[0-9]+)", 1))
)
l.head(3)

In [9]:
# Inner-join the data table to the lookup table on Key, then lay the columns
# out as: PrCr and the A-D lookup columns, all numeric columns, Key and OneGL.
joined = (
    t.join(l, "Key", how="inner")
    .select(["PrCr", "A", "B", "C", "D"], s.numeric(), ["Key", "OneGL"])
)
joined

In [10]:
# Materialise the joined ibis expression as a pandas DataFrame and export it.
# NOTE(review): this rebinds `df`, which earlier held the raw input frame;
# re-running the wrangling cell after this one would start from the joined result.
df = joined.to_pandas()

# to_csv does not create missing folders, so make sure the target exists first.
output_path.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path / "SAP YGL0 P&L.csv", index=False)