In [2]:
import pandas as pd
from abc import ABC, abstractmethod

In [19]:
DATA_FOLDER = "../data/"

### Set Up Items

In [8]:
ITEMS_PATH = DATA_FOLDER + "CFTP Test Item Inventory with Dimensions - All Trials.xlsx"
items = pd.read_excel(ITEMS_PATH, sheet_name=0, skiprows=3)
items.head(2)

Unnamed: 0,Item ID,Item Format,Brand,Manufacturer,Item SKU,Item Name,Item Description Refined,Item Description From Trial,Item ID.1,Material Class I,...,Material Composition,Certification @ time of testing,Kit,Initial Weight 1,Initial Weight 2,Initial Weight 3,"Average Initial Weight, g",Item Dimensions Compiled,"Item Capacity, mL",Notes
0,A1,Bowl,BÉSICS®,WeiMon,WM-W270,PLA-lined Paper Bowl 12oz,BÉSICS® 12 oz Soup bowl,BESICS Bowl,,Fiber,...,"Paper, PLA lining, adhesive, ink",BPI,Baseline,8.12,8.1,8.12,8.113333,"3’’ diameter, 2.5’’ H",,
1,A2,Hot Cup Lid,BÉSICS®,Multiple,,CPLA Hot Cup Lid,BÉSICS® 12oz CPLA Hot Cup Lid,Hot cup lid,,Biopolymer,...,Crystallized PLA,BPI,Baseline,3.58,3.58,3.56,3.573333,"3.5’’ diameter, 0.5’’ H",,


In [11]:
items['Start Weight'] = items['Average Initial Weight, g']

In [9]:
items_cols = [
    'Item ID',
    'Item Name',
    'Item Description Refined',
    'Material Class I',
    'Material Class II',
    'Material Class III',
    'Start Weight'
    ]

In [12]:
items_clean = items[items_cols]
items_clean.head(2)

Unnamed: 0,Item ID,Item Name,Item Description Refined,Material Class I,Material Class II,Material Class III,Start Weight
0,A1,PLA-lined Paper Bowl 12oz,BÉSICS® 12 oz Soup bowl,Fiber,Lined Fiber,PLA lined Paper,8.113333
1,A2,CPLA Hot Cup Lid,BÉSICS® 12oz CPLA Hot Cup Lid,Biopolymer,Rigid Biopolymer (< 0.75mm),CPLA,3.573333


In [13]:
item2id = {key.strip(): value for key, value in items_clean.set_index('Item Description Refined')['Item ID'].to_dict().items()}
item2id

{'BÉSICS® 12 oz Soup bowl': 'A1',
 'BÉSICS® 12oz CPLA Hot Cup Lid': 'A3',
 'BÉSICS® 250mL PLA-lined Bagasse Leaf Bowl': 'A4',
 'BÉSICS® 8oz CPLA Hot cup lid': 'A16',
 'BÉSICS® Box with Lid AND Fry Tray': 'A6',
 'BÉSICS® Lined Paper Box with Lid': 'A7',
 'BÉSICS® Spoon 6"': 'A9',
 'BÉSICS® Uncoated paper fry tray': 'Q',
 'Fabrikal 16 oz PLA cold cup': 'A11',
 'Kraft Control 10"x5" 1-ply': 'A12',
 'Bagasse Clamshell 9x9': 'O',
 'BÉSICS® 12oz Hot Cup': 'A14',
 'BÉSICS® 6" Spoon': 'A15',
 'Bin Liner Bag 2.5 gal': 'A17',
 'Cellulose bag CONTROL': 'A18',
 'Coffee Pod': 'A19',
 'Kraft butcher paper CONTROL': 'A20',
 'Moulded fiber bowl 16oz': 'A21',
 'Navel orange peel': 'A22',
 'PLA Cold Cup 12oz': 'A23',
 'PLA Cold Cup Lid 8oz': 'V',
 'Spoon PLA 6"': 'A25',
 'Straw PHA 8"': 'A26',
 '16oz NoTree Paper Hot Cup - World Centric': 'P',
 '3Gallon Food Scrap Bag - World Centric': 'K3',
 '8" Kraft straw ST-PA-8-K': 'B',
 'Alter Eco Quinoa SUP 2018': 'A30',
 'BÉSICS® Leaf Plate': 'A31',
 'Elk Packag

### Define Pipeline Class

In [16]:
class AbstractDataPipeline(ABC):
    @abstractmethod
    def load_data(self):
        pass

    @abstractmethod
    def process_data(self, data):
        pass

    @abstractmethod
    def save_data(self, data):
        pass

    def run(self):
        data = self.load_data()
        processed_data = self.process_data(data)
        self.save_data(processed_data)

In [17]:
import pandas as pd

class ClosedLoopPipeline(AbstractDataPipeline):
    def load_data(self, excel_path, sheet_name, skiprows):
        """Loads data from a specific sheet in an Excel file."""
        return pd.read_excel(excel_path, sheet_name=sheet_name, skiprows=skiprows)

    def process_data(self, weight_df, area_df, items_clean):
        """Processes the weight and area DataFrames, then merges with items_clean."""
        # Filter for "Second Removal"
        weight = weight_df[weight_df['Trial Stage'] == "Second Removal"]
        area = area_df[area_df['Trial Stage'] == "Second Removal"]
        
        # Melt the DataFrames
        weight_melted = self.melt_data_frame(weight, '% Residuals (Weight)')
        area_melted = self.melt_data_frame(area, '% Residuals (Area)')

        # Merge melted DataFrames
        observations_closed_loop = pd.merge(weight_melted, area_melted, 
                                            on=['Facility Name', 'Trial Stage', 'Bag Set', 'Bag Number', 'Item ID'], 
                                            how='outer')
        
        # Rename and select relevant columns
        observations_closed_loop.rename(columns={'Facility Name': 'Trial'}, inplace=True)
        observations_closed_loop = observations_closed_loop[['Trial', 'Item ID', '% Residuals (Weight)', '% Residuals (Area)']]
        
        # Join with items_clean and select columns
        joined_cl = pd.merge(items_clean, observations_closed_loop, on="Item ID")
        keep_cols = ["Trial", "Item ID", "Item Name", "Item Description Refined",
                     "Material Class I", "Material Class II", "Material Class III",
                     "Start Weight", "% Residuals (Weight)", "% Residuals (Area)"]
        return joined_cl[keep_cols]

    def melt_data_frame(self, df, value_name):
        """Helper method to melt DataFrames."""
        return df.melt(id_vars=['Facility Name', 'Trial Stage', 'Bag Set', 'Bag Number'],
                       value_vars=['N', 'O', 'Q', 'V', 'B', 'D', 'H', 'I', 'J', 'K', 'K1', 'K2', 'K3', 'N', 'O', 'P', 'Q', 'S', 'V'],
                       var_name='Item ID',
                       value_name=value_name).dropna(subset=[value_name]).reset_index(drop=True)

    def save_data(self, df, output_path):
        """Saves the DataFrame to a CSV file."""
        df.to_csv(output_path, index=False)


In [20]:
closed_loop_pipeline = ClosedLoopPipeline()
TEN_TRIALS_PATH = DATA_FOLDER + "Donated Data 2023 - Compiled Field Results for DSI.xlsx"
items_clean = pd.DataFrame()  # Assuming this is prepared elsewhere

observations_weight = closed_loop_pipeline.load_data(TEN_TRIALS_PATH, sheet_name=3, skiprows=2)
observations_sa = closed_loop_pipeline.load_data(TEN_TRIALS_PATH, sheet_name=4, skiprows=2)
processed_data = closed_loop_pipeline.process_data(observations_weight, observations_sa, items_clean)

KeyError: 'Item ID'

In [21]:
observations_weight.head()

Unnamed: 0,Facility Name,Trial ID,Trial Stage,Bag Set,Bag Number,N,O,Q,V,B,...,K,K1,K2,K3,N.1,O.1,P,Q.1,S,V.1
0,Facility 1 ( Windrow),WR004-01,First Removal,A (blue),10,,,,,0.059,...,,0.986,,0.546,,,,,,
1,Facility 1 ( Windrow),WR004-01,First Removal,A (blue),6,,,,,0.022,...,,0.696,0.007,0.572,,,,,,
2,Facility 1 ( Windrow),WR004-01,First Removal,A (blue),7,,,,,0.018,...,,0.933,0.023,0.313,,,,,,
3,Facility 1 ( Windrow),WR004-01,First Removal,A (blue),8,,,,,0.22,...,,0.909,0.0,0.412,,,,,,
4,Facility 1 ( Windrow),WR004-01,First Removal,A (blue),9,,,,,0.028,...,,0.928,0.05,0.65,,,,,,


In [None]:
# Save data
closed_loop_pipeline.save_data(processed_data, "processed_data.csv")