<div class="alert alert-block alert-info">
This script <b>cleans the raw data by dropping some columns of the <code>df_master_raw</code></b>. 
    <hr> 
    Note: <br>
    <i><b>Input file(s)' name(s) and metadata</b></i> (if available) are <b>printed out (below 👇🏼) in 'read data to df' section.</b>
</div>

In [None]:
# %env
# %who_ls
# %who
# %who int
# %pinfo <var name>

# Imports

In [None]:
%config IPCompleter.use_jedi = False # disable jedi autocompleter (https://stackoverflow.com/a/65734178/14485040)

import os

import project_path  # makes possible the access to `src` directory using relative path
from src.utils import explore_dir, make_readme_info
from src.utils import read_excel_to_pandas as r_excel
from src.utils import set_outputs_dir
from src.utils import write_pandas_to_excel as w_excel

%run init_nb.ipynb

# INPUTS: Identify file(s) and read data to df

In [None]:
# Explore the directory to find the file(s).
# The path is assembled with os.path.join so the separator matches the host OS
# (on Windows the result is still r"..\data\interim"; elsewhere "../data/interim").
inputs_dir, files_list = explore_dir(
    path_to_dir=os.path.join(os.pardir, "data", "interim"),
    file_extension="xlsx",
    print_files_list=True,
)

<div class="alert alert-block alert-danger">
    <strong> pending (possible) improvements: </strong> <br>


1. Find the method columns with a regular expression: the pattern is the tuple-like naming of the method labels, e.g. <code>('method', 'category', 'unit')</code>.


</div>

In [None]:
# Load the raw inputs from the mapped LCIA results workbook

# Raw master table, read from "Sheet1"
df_master_raw = r_excel(inputs_dir, "mapped-lcia-results.xlsx", sheets="Sheet1")
print(
    "df of the master data (raw) ".ljust(40, "."),
    f"{df_master_raw.shape}\n".rjust(13, "."),
)

# LCIA method names, taken from the "Method" column of the "df_lcia_labels" sheet
LCIA_METHODS = r_excel(
    inputs_dir, "mapped-lcia-results.xlsx", sheets="df_lcia_labels", show_readme=False
)["Method"].tolist()

# One quoted name per line, so the names can be copy-pasted into a list
print("Unique names of LCIA methods ({} in total):".format(len(LCIA_METHODS)))
print("".join('\n\t"{}", '.format(name) for name in LCIA_METHODS))

# Operations 
- drop redundant and unnecessary columns
<div class="alert alert-block alert-info">
created: <code>df_analysis_prev</code>
</div>

## Identify columns w/ method labels and list "non-method" columns

In [None]:
# a. Group the df's column labels by LCIA method.
#    Result: {'method': [method labels in df]}, e.g.
#        {'method1': ["('method1', 'category1', 'unit1')", "('method1', 'category2', 'unit2')", ...],
#         'method2': [...]}
#    A label is assigned to a method when the method name occurs inside the label
#    (plain `in` test), so a label is listed under every method whose name it contains.
dict_fullMethods = {}
for method in LCIA_METHODS:
    matching_labels = [label for label in df_master_raw.columns if method in label]
    # .extend() (not .append()) keeps one flat list of labels per method
    dict_fullMethods.setdefault(method, []).extend(matching_labels)

# b. Flat list of the df's labels that belong to any method (in dictionary order)
LCIA_METHODS_PER_CATEGORY = [
    label for labels in dict_fullMethods.values() for label in labels
]

# Show up to 3 random labels; random.sample() raises ValueError when asked for
# more items than the list holds, so the sample size is capped.
n_sampled = min(3, len(LCIA_METHODS_PER_CATEGORY))
sampled_lines = "".join(
    "\n\t- {}".format(label)
    for label in random.sample(LCIA_METHODS_PER_CATEGORY, n_sampled)
)
print(
    "df_master_raw (consisting of {} columns) contains a list of {} methods."
    "\n\nHere is a sample of {} randomly shown methods:"
    "{}"
    "\n\n*Check the full list of methods by printing 'LCIA_METHODS_PER_CATEGORY',\n"
    "or using 'dict_fullMethods' dictionary with keys in 'LCIA_METHODS'.".format(
        len(df_master_raw.columns),
        len(LCIA_METHODS_PER_CATEGORY),
        n_sampled,
        sampled_lines,
    )
)

# c. Rest of the columns in df_master_raw (everything that is not a method label).
#    A set makes the membership test constant-time; column order is preserved.
method_labels = set(LCIA_METHODS_PER_CATEGORY)
rest_of_columns = [col for col in df_master_raw.columns if col not in method_labels]
print(
    "\nThe rest of the {} columns, shown below, "
    "may contain redundant or unnecessary information,"
    "\nfeel free to select only required columns.".format(
        len(rest_of_columns)
    )
)
print("".join(map('\n\t"{}", '.format, rest_of_columns)))

## Select columns w/ non-method labels
<div class="alert alert-block alert-danger">
    <strong> <code>METADATA</code> has to be populated manually ❗ </strong>
</div>

In [None]:
# 2. Pick from the rest of the columns
# Number of distinct values per non-method column, to help decide which ones to keep
print(df_master_raw[rest_of_columns].nunique())

# list of df's non-method labels (select manually from the list printed above)
METADATA = [
    "Activity",
    "activity_comment",
    "type",
    "referenceProduct",
    "category",
    "inline_comment",
    # 👆🏼 above columns are originally from _SP,
    # 👇🏼 below from _EI
    "geo",
    "activity_ISICclass",
    "activity_ecoSpold01class",
    "technologyLevel",
    "referenceProductAmount",
    "referenceProductUnit",
    "referenceProduct_prodVolume",
    "referenceProduct_prodVolumeComment",
    "referenceProduct_price",
    "referenceProduct_priceUnit",
    "referenceProduct_priceComment",
    "referenceProduct_casNumber",
    "referenceProduct_CPCclass",
    "activity_generalComment",
    "sourceFilename",
]
print(
    "\nTotal ºn of non-method columns (above) is {}, you selected {} of them.".format(
        len(rest_of_columns), len(METADATA)
    )
)

# The list above is typed by hand, and selecting columns with
# DataFrame.filter(items=...) ignores labels that do not exist, so a misspelled
# name would be dropped without any error. Report such labels here.
missing_metadata = [col for col in METADATA if col not in rest_of_columns]
if missing_metadata:
    print(
        "\nWARNING: {} selected label(s) are not among the non-method columns "
        "of df_master_raw (check the spelling):".format(len(missing_metadata))
    )
    print("".join(map('\n\t"{}", '.format, missing_metadata)))

In [None]:
# One-column frame ("METADATA") listing the selected metadata labels, for later export

df_metadata = pd.DataFrame({"METADATA": METADATA})
# df_metadata

## Select columns w/ method labels
<div class="alert alert-block alert-danger">
    <strong> <code>METHODS</code> is generated here 👇🏼</strong>  <br>
     Will be used throughout the script for calculations and plotting
</div>

In [None]:
# Show the method names once more, one quoted name per line
# (LCIA_METHODS_PER_CATEGORY holds the complete list of methods per category if needed)
print("Here is the list of method names (again): ")
print("".join('\n\t"{}", '.format(name) for name in LCIA_METHODS))

In [None]:
# Method names picked from the list printed above (change manually if needed)
select_keys = [
    "IPCC 2013 GWP 100a V1.03",
    "PBs-LCIA (baseline) V0.72",
]

# Concatenate the column labels of every picked method, in the order picked
METHODS = [label for key in select_keys for label in dict_fullMethods[key]]
print("{} methods have been selected:".format(len(METHODS)))
del select_keys
METHODS

In [None]:
# One-column frame ("METHODS") listing the selected method labels, for later export

df_methods = pd.DataFrame({"METHODS": METHODS})
# df_methods

## Combine selected methods and metadata
- Generate the ``df_analysis_prev`` df (and delete ``df_master_raw``?)

In [None]:
# 3. Keep only the selected metadata and method columns, ordered by "Activity"

# .filter() and .sort_values() each return a new frame, so df_master_raw is untouched
## (alternative selection: df_master_raw.loc[:, list(METADATA + METHODS)])
df_analysis_prev = (
    df_master_raw
    .filter(items=METADATA + METHODS, axis=1)
    .sort_values(by="Activity")
)

# del df_master_raw # delete to free memory

# Show every column when a frame is displayed (applies to the whole session)
pd.set_option("display.max_columns", None)

print(
    "Created **df_analysis_prev** dataframe is of {} shape.".format(
        df_analysis_prev.shape
    )
)
df_analysis_prev.sample(5)

# OUTPUTS: Export data to excel

In [None]:
%%time

# Export the cleaned data and the two label lists to one Excel workbook.
# The cell magic above reports how long the export takes.

# Set output directory
outputs_dir = set_outputs_dir(use_default=True)  # default `..\data\interim`

## Export dataframe to excel
excelName = "raw-data-chosen-lcia-methods-and-metadata.xlsx"

# Readme entry for the workbook: the file name plus a description of each data sheet
df_readme = make_readme_info(
    excelName,
    "Sheet1: Raw data with chosen LCIA methods and important metadata "
    "(redundant columns and extra methods were dropped)."
    "\nMETADATA: list of relevant metadata used in Sheet1."
    "\nMETHODS: list of LCIA methods used in Sheet1."
    "\n[METHODS + METADATA have to be the only column labels in Sheet1]",
)

# The keys of dict_data_to_write match the sheet names described in the readme text
# (presumably each key becomes a sheet -- confirm against src.utils.write_pandas_to_excel).
w_excel(
    path_to_file=outputs_dir,
    filename=excelName,
    dict_data_to_write={
        "Sheet1": df_analysis_prev,
        "METADATA": df_metadata,
        "METHODS": df_methods,
    },
    readme_info=("readme", df_readme),
    # Optional arguments, currently disabled:
    #     ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
    #     startrow=0
)