In [3]:
from ace.utils import prep_general_material_data, read_file

In [2]:
df = read_file("../data/system_1/PRE_MARA.csv", "CSV", {"header": "true"})

In [5]:
# Pyspark libraries
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import DataFrame

from ace.utils import enforce_schema, read_file
from ace.schemas import MARA_SCHEMA


def prep_general_material_data(
    df: DataFrame,
    col_mara_global_material_number:str,
    check_old_material_number_is_valid: bool = True,
    check_material_is_not_deleted: bool = True,
):
    """
    Filters materials based on validity of the old material number (BISMT) and deletion flag (LVORM)
    and renames the global material number column and selects required columns.

    Parameters:
    -----------
    df : DataFrame
        Input PySpark DataFrame containing material data.
    col_mara_global_material_number : str
        Column name for the global material number for the system.
    check_old_material_number_is_valid : bool, optional (default=True)
        If True, filters out rows where the old material number is invalid.
        Valid old material numbers are not in ["ARCHIVE", "DUPLICATE", "RENUMBERED"] or null.
    check_material_is_not_deleted : bool, optional (default=True)
        If True, excludes rows where the deletion flag is not null or not empty.
    rename_global_material_number : str, optional (default=None)
        If specified, renames the global material number column to this consistent name.

    Returns:
    --------
    DataFrame
        A PySpark DataFrame after applying the filters and renaming.
    """
    # Apply old material number validity filter
    if check_old_material_number_is_valid:
        df = df.filter(
            (F.col("BISMT").isNull()) | (~F.col("BISMT").isin("ARCHIVE", "DUPLICATE", "RENUMBERED"))
        )

    # Apply material not deleted filter
    if check_material_is_not_deleted:
        df = df.filter(
            (F.col("LVORM").isNull()) | (F.col("LVORM") == "")
        )

    # Rename global material number column
    df = df.withColumnRenamed(col_mara_global_material_number, "global_material_number")


    return enforce_schema(df, MARA_SCHEMA)

In [7]:
prep_general_material_data(df, "ZZMDGM").show(truncate=False)

+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+
|MANDT                                                           |MATNR                                                           |MEINS                                                           |global_material_number                                          |
+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+
|ad57366865126e55649ecb23ae1d48887544976efea46a48eb5d85a6eeb4d306|73247d2a426212859ed5573281c4fb0f1ac040983509226591035355f4d0fa68|72dfcfb0c470ac255cde83fb8fe38de8a128188e03ea5ba5b2a93adbea1062fa|73247d2a426212859e

In [8]:
import os

In [4]:
for file_name in os.listdir("../data/system_1/"):
    file_path = os.path.join("../data/system_1/", file_name)

    # Check if it is a file (not a subfolder)
    if os.path.isfile(file_path):
        # Extract the file name without extension
        base_name = os.path.splitext(file_name)[0]
        print(base_name)

        # Read the file based on its extension and create DataFrame
        if file_name.endswith('.csv'):
            df = read_file(file_path, "csv", {"header": "true", "inferSchema": "true"})

        # Dynamically assign the DataFrame to a variable with the same name as the file (without extension)
        globals()[base_name] = df
        print(f"Data loaded into variable: {base_name}")

PRE_AFKO
Data loaded into variable: PRE_AFKO
PRE_AFPO
Data loaded into variable: PRE_AFPO
PRE_AUFK
Data loaded into variable: PRE_AUFK
PRE_MARA
Data loaded into variable: PRE_MARA
PRE_MARC
Data loaded into variable: PRE_MARC
PRE_MBEW
Data loaded into variable: PRE_MBEW
PRE_T001
Data loaded into variable: PRE_T001
PRE_T001K
Data loaded into variable: PRE_T001K
PRE_T001W
Data loaded into variable: PRE_T001W


In [7]:
PRE_AFPO.select("LTRMI")

DataFrame[LTRMI: timestamp]

In [16]:
from ace.utils import enforce_schema

In [2]:
import os

In [11]:
PRE_AFPO.select("LTRMI").show()

+-------------------+
|              LTRMI|
+-------------------+
|2013-10-29 00:00:00|
|2016-04-24 00:00:00|
|2016-01-26 00:00:00|
|2013-09-19 00:00:00|
|2012-05-12 00:00:00|
|2016-03-12 00:00:00|
|2017-03-22 00:00:00|
|2013-06-21 00:00:00|
|2015-01-20 00:00:00|
|2018-04-19 00:00:00|
|2018-10-29 00:00:00|
|2016-07-29 00:00:00|
|2013-12-28 00:00:00|
|2018-09-30 00:00:00|
|2018-08-13 00:00:00|
|2016-07-17 00:00:00|
|2013-12-28 00:00:00|
|2017-11-24 00:00:00|
|2017-02-02 00:00:00|
|2017-04-23 00:00:00|
+-------------------+
only showing top 20 rows



In [13]:
PRE_MARA.select("LTRMI").show()

AnalysisException: Column 'LTRMI' does not exist. Did you mean one of the following? [FORMT, LVORM, NORMT, ATTYP, BFLME, BISMT, BREIT, BSTME, CMREL, ERGEI, ETIAG, ETIAR, ETIFO, FERTH, GEWEI, IHIVI, INHME, IPRKZ, KZREV, KZUMW, LABOR, LAEDA, LAENG, LIQDT, MATFI, MBRSH, MFRNR, MFRPN, MPROF, MTART, NTGEW, RBNRM, SERLV, SPROF, STFAK, STOFF, TEMPB, TRAGR, VABME, VPREH, ZZREL, AEKLK, AENAM, AESZN, BBTYP, BEGRU, BEHVO, BLANZ, BLATT, BMATN, BRGEW, BWSCL, BWVOR, CADKZ, CMETH, COMPL, CUOBF, DATAB, DISST, EAN11, EANNR, EKWSL, ENTAR, ERGEW, ERNAM, ERSDA, ERVOE, ERVOL, EXTWG, FUELG, GENNR, GEWTO, GROES, HOEHE, ILOOS, INHAL, INHBR, KOSCH, KUNNR, KZEFF, KZGVH, KZKFG, KZKUP, KZNFM, KZWSM, MAGRV, MANDT, MATKL, MATNR, MEABM, MEINS, MHDHB, MHDLP, MHDRZ, MLGUT, MSTAE, MSTAV, MSTDE, MSTDV, NRFHG, NUMTP, PLGTP, PMATA, PRDHA, PROFL, PRZUS, PSTAT, QMPUR, RAUBE, RDMHD, RMATP, SAISJ, SAISO, SAITY, SATNR, SPART, TAKLV, VHART, VOLEH, VOLTO, VOLUM, VPSTA, WESCH, WRKST, XCHPF, XGCHP, ZEIAR, ZEIFO, ZEINR, ZEIVR, ZZMDL, ZZNPH, ZZMDGM, BRAND_ID, SLED_BBD, ZZPRODMAN, MTPOS_MARA, ZZLAUNCHD, ZZMAT_GPH, ZZMAT_SBU, ZZMAT_SEG, ZZOLDPRODH, ZZORIGSRC, ZZRELCODE, ZZREVCATE, ZZTAXGROUP, sur_pkey, MATNR_CLEAN, ZZBRAND_PF, ZZBUSNSEG, ZZICM_COGS, ZZMGPH_PL, ZZMSBU_PL, ZZPRODSTEW, GTIN_VARIANT, ZZMATTAXCLASS, SOURCE_SYSTEM_ERP];
'Project ['LTRMI]
+- Relation [MANDT#798,MATNR#799,ERSDA#800,ERNAM#801,LAEDA#802,AENAM#803,VPSTA#804,PSTAT#805,LVORM#806,MTART#807,MBRSH#808,MATKL#809,BISMT#810,MEINS#811,BSTME#812,ZEINR#813,ZEIAR#814,ZEIVR#815,ZEIFO#816,AESZN#817,BLATT#818,BLANZ#819,FERTH#820,FORMT#821,... 133 more fields] csv
