In [1]:
import datetime
import glob
import os

import pandas as pd
from numpy import nanmean as np_nanmean
from numpy import nanmedian as np_nanmedian
from sklearn import model_selection


# Merge Sheets


In [2]:
def date_parser(arr):
    try:
        return [datetime.datetime.strptime(x, "%m-%d-%Y %H:%M:%S") for x in arr]
    except Exception as ex1:
        try:
            return [datetime.datetime.strptime(x, "%m/%d/%Y %H:%M:%S") for x in arr]
        except Exception as ex2:
            print(ex1, ex2)
    return [None for x in arr]


available_batches = list()
for filename in glob.glob("./datasets/ODP*.xlsx"):
    df = pd.read_excel(filename, sheet_name="PO")
    available_batches.append(df["PO"].iloc[0])
    print(available_batches[-1])
    os.makedirs(f"./results/{available_batches[-1]}", exist_ok=True)
    df_BHV = pd.read_excel(
        filename, sheet_name="BHV", parse_dates=[0], date_parser=date_parser
    )
    df_BHV.columns = ["Timestamp"] + [f"{x}_BHV" for x in df_BHV.columns[1:]]
    df_CFF = pd.read_excel(
        filename, sheet_name="CFF", parse_dates=[0], date_parser=date_parser
    )
    df_CFF.columns = ["Timestamp"] + [f"{x}_CFF" for x in df_CFF.columns[1:]]
    df_NF = pd.read_excel(
        filename, sheet_name="NF", parse_dates=[0], date_parser=date_parser
    )
    df_NF.columns = ["Timestamp"] + [f"{x}_NF" for x in df_NF.columns[1:]]
    df_EXT = pd.read_excel(
        filename, sheet_name="EXT", parse_dates=[0], date_parser=date_parser
    )
    df_EXT.columns = ["Timestamp"] + [f"{x}_EXT" for x in df_EXT.columns[1:]]

    df_tmp1 = pd.merge(left=df_BHV, right=df_CFF, on="Timestamp")
    df_tmp2 = pd.merge(left=df_NF, right=df_EXT, on="Timestamp")

    df = pd.merge(left=df_tmp1, right=df_tmp2, on="Timestamp")
    df["Batch"] = available_batches[-1]
    # sort columns
    df = df[["Batch"] + [x for x in df.columns[:-1]]]
    df.to_csv(
        f"./results/{available_batches[-1]}/{available_batches[-1]}.csv", index=False
    )

with open("./results/batches.txt", "w") as f:
    for x in available_batches:
        f.write(f"{x}\n")
print(len(available_batches))


1000001700
1000001701
1000001702
strptime() argument 1 must be str, not datetime.datetime strptime() argument 1 must be str, not datetime.datetime
strptime() argument 1 must be str, not datetime.datetime strptime() argument 1 must be str, not datetime.datetime
strptime() argument 1 must be str, not datetime.datetime strptime() argument 1 must be str, not datetime.datetime
1000001769
1000001704
1000001705
1000001767
1000001769
1000001769
1000001770
1000001771
1000001772
1000001773
1000001774
1000001775
1000001776
1000001777
17


# Merge Batches in the Original Format


In [3]:
available_batches = list()
with open("./results/batches.txt", "r") as f:
    available_batches.extend(int(x) for x in f.readlines())

df = pd.DataFrame()
for batch in available_batches:
    df = df.append(pd.read_csv(f"./results/{batch}/{batch}.csv"))
df.to_csv("./results/all_batches.csv", index=False)


# GroupBy Statistical Measures


In [4]:
HYPER_bin_time = 2  # hours
HYPER_categ_agg = "mode"  # possible values: mode, last
HYPER_numer_agg = "mean"  # possible values: mean, median, last
print(
    f"Grouping Strategies:\nBin Time: {HYPER_bin_time}\nCategorical: {HYPER_categ_agg}\nNumerical: {HYPER_numer_agg}"
)


def custom_mode(x):
    m = pd.Series.mode(x)
    return m.iloc[0] if not m.empty else None


def get_agg_method(x):
    if x == "last" or x == "mean":
        return x
    elif x == "mode":
        return custom_mode
    elif x == "median":
        return np_nanmedian


previous_available_batches = list()
with open("./results/batches.txt", "r") as f:
    previous_available_batches.extend(int(x) for x in f.readlines())

available_batches = list()
for filename in glob.glob("./datasets/ODP*.xlsx"):
    df = pd.read_excel(filename, sheet_name="PO")
    batch = df["PO"].iloc[0]
    print(batch)
    if batch in previous_available_batches:
        df = pd.read_csv(f"./results/{batch}/{batch}.csv", parse_dates=[1])
        try:
            start_time = df["Timestamp"].iloc[0]
            # compute elapsed minutes since start
            elapsed_minutes = list()
            for tup in df.itertuples():
                elapsed_minutes.append(
                    round((tup.Timestamp - start_time).total_seconds() / 60, 2)
                )
            df["Elapsed Minutes"] = elapsed_minutes

            # remove datetime as no longer needed
            df.drop(["Timestamp"], axis=1, inplace=True)

            # bin elapsed minutes
            df.loc[:, "Elapsed Minutes"] = (
                df["Elapsed Minutes"]
                // (HYPER_bin_time * 60)  # floor division by X hours (obtain hours)
                * (HYPER_bin_time * 60)  # multiply again to obtain minutes
            )
            # extract categorical/numerical variables
            categorical_vars = list()
            numerical_vars = [
                x
                for x in df.columns
                if x != "Elapsed Minutes" and x not in categorical_vars
            ]
            agg_dict = {x: get_agg_method(HYPER_categ_agg) for x in categorical_vars}
            agg_dict.update(
                {x: get_agg_method(HYPER_numer_agg) for x in numerical_vars}
            )
            # apply aggregation methods
            df_tmp = df.groupby(by=["Elapsed Minutes"]).agg(agg_dict)
            df_tmp.reset_index(inplace=True)
            df_tmp["Batch"] = batch
            # sort columns
            df_tmp = df_tmp[["Batch", "Elapsed Minutes"] + [x for x in df.columns[:-2]]]
            df_tmp.to_csv(
                f"./results/{batch}/{batch}_binned.csv",
                index=False,
            )
        except Exception as ex:
            print(ex)
        else:
            available_batches.append(batch)

with open("./results/batches_binned.txt", "w") as f:
    for x in available_batches:
        f.write(f"{x}\n")
print(len(available_batches))


Grouping Strategies:
Bin Time: 2
Categorical: mode
Numerical: mean
1000001700
1000001701
1000001702
single positional indexer is out-of-bounds
1000001769
1000001704
1000001705
Could not convert 21.42741775512695321.38013076782226621.33284378051757821.28555488586425821.2382678985595721.19098091125488321.14369201660156221.09640502929687521.04911804199218821.00182914733886720.9545421600341820.90725517272949220.85996627807617220.81267929077148420.76539230346679720.71810340881347720.6708164215087920.623529434204120.5984764099121120.5768699645996120.55526161193847720.53365325927734420.5120449066162120.49043655395507820.46882820129394520.44722175598144520.42561340332031220.4040050506591820.38239669799804720.36078834533691420.3391799926757820.3175735473632820.2959651947021520.27435684204101620.25274848937988320.2311401367187520.20953178405761720.18792533874511720.16631698608398420.1447086334228520.1231002807617220.10149192810058620.07988357543945320.05827713012695320.0366687774658220.015060424

# Merge Everything


In [5]:
available_batches = list()
with open("./results/batches_binned.txt", "r") as f:
    available_batches.extend(int(x) for x in f.readlines())

df = pd.DataFrame()
for batch in available_batches:
    df = df.append(pd.read_csv(f"./results/{batch}/{batch}_binned.csv"))
df.to_csv("./results/all_batches_binned.csv", index=False)


# Split Datasets into Train-Test and Impute Missing Values


In [6]:
HYPER_categ_fillna = "mode"  # possible values: mode
HYPER_numer_fillna = "median"  # possible values: mean, median
print(
    f"Imputation Strategies:\nCategorical: {HYPER_categ_fillna}\nNumerical: {HYPER_numer_fillna}"
)


def custom_mode(x):
    m = pd.Series.mode(x)
    return m.iloc[0] if not m.empty else None


def get_fillna_method(x):
    if x == "mean":
        return np_nanmean
    elif x == "median":
        return np_nanmedian
    elif x == "mode":
        return custom_mode


available_batches = list()
with open("./results/batches_binned.txt", "r") as f:
    available_batches.extend(int(x) for x in f.readlines())
train_seq_ids, test_seq_ids = model_selection.train_test_split(
    available_batches, train_size=0.75, random_state=666
)
os.makedirs("./results/splits", exist_ok=True)
df = pd.read_csv("./results/all_batches_binned.csv")
df.sort_values(by="Batch", inplace=True)
df_targets = pd.read_excel(
    "./datasets/produzione_CStOA_2021_ed12.xlsx", sheet_name="dati-produzione", header=1
)
df_targets.sort_values(by="O.D.P.", inplace=True)

df_train = df[df["Batch"].isin(train_seq_ids)]
df_test = df[df["Batch"].isin(test_seq_ids)]

X_train = df_train.copy()
# remove duplicates of the same batch
y_train = df_targets[df_targets["O.D.P."].isin(train_seq_ids)]["Resa"]
y_train.to_csv("./results/splits/y_train.csv", index=False)

X_test = df_test.copy()
# remove duplicates of the same Batch
y_test = df_targets[df_targets["O.D.P."].isin(test_seq_ids)]["Resa"]
y_test.to_csv("./results/splits/y_test.csv", index=False)

# extract excluded/categorical/numerical variables
excluded_vars = (
    "Batch",
    "Elapsed Minutes",
)
categorical_vars = list()
numerical_vars = [
    x for x in df.columns if x not in excluded_vars and x not in categorical_vars
]
# create dictionary for fill-forwarding with the appropriate method
fillna_dict = {
    x: get_fillna_method(HYPER_categ_fillna)(X_train[x]) for x in categorical_vars
}
fillna_dict.update(
    {x: get_fillna_method(HYPER_numer_fillna)(X_train[x]) for x in numerical_vars}
)

X_train.fillna(fillna_dict, inplace=True)
X_train.to_csv("./results/splits/X_train.csv", index=False)

# impute test with values obtained from train
X_test.fillna(fillna_dict, inplace=True)
X_test.to_csv("./results/splits/X_test.csv", index=False)


Imputation Strategies:
Categorical: mode
Numerical: median


# Linearise Datasets


In [7]:
HYPER_max_pad = 15

for dataset in ("train", "test"):
    X = pd.read_csv(f"./results/splits/X_{dataset}.csv")

    new_columns = list()
    for i in range(HYPER_max_pad):
        new_columns.extend([f"{x}_{i}" for x in X.columns])

    grouped = X.groupby(X["Batch"])

    rows = list()
    for k, group in grouped.groups.items():
        # add row containing both real rows and zero rows
        if len(group) > HYPER_max_pad:
            group = group[:HYPER_max_pad]
        X_sub = X.loc[group]

        row = list()
        for i in range(len(group)):
            row.extend(X_sub.iloc[i].tolist())
        for i in range(HYPER_max_pad - len(group)):
            row.extend([0] * len(X.columns))
        rows.append(row)

    new_X = pd.DataFrame(data=rows, columns=new_columns)
    new_X.to_csv(f"./results/splits/X_{dataset}_padded_translated.csv", index=False)
