In [1]:
import pandas as pd

In [2]:
DATA_FOLDER = "../data/"

### Clean Items

In [3]:
ITEMS_PATH = DATA_FOLDER + "CFTP Test Item Inventory with Dimensions - All Trials.xlsx"
items = pd.read_excel(ITEMS_PATH, sheet_name=0, skiprows=3)
items.head(2)

Unnamed: 0,Item ID,Item Format,Brand,Manufacturer,Item Name,Item Description Refined,Item Description From Trial,Material Class I,Material Class II,Material Class III,Material Description,Material Composition,Certification @ time of testing,Kit,Initial Weight 1,Initial Weight 2,Initial Weight 3,"Average Initial Weight, g",Item Dimensions Compiled,"Item Capacity, mL"
0,,Bag,BÉSICS®,Pak-Sel,Cellulose bag 5x7in,BESICS Cellulose bag 5x7in,,Biopolymer,Biopolymer Film/Bag,Cellulose,Cellulose,Cellulose,BPI,Pilot,,,,2.5,,0.36
1,,Bag,World Centric™,World Centric™,PBAT Bin Liner 3 gallon,3Gallon Food Scrap Bag - World Centric,3Gallon Food Scrap Bag BG-CS-3,Biopolymer,Biopolymer Film/Bag,PBAT and corn starch,PBAT and corn starch,"70% PBAT, 30% starch",BPI,Custom,7.16,7.16,7.15,7.156667,"8.75""x15""x0.1""",


In [4]:
for col in items.columns:
    print(col)

Item ID
Item Format
Brand
Manufacturer
Item Name
Item Description Refined
Item Description From Trial
Material Class I
Material Class II
Material Class III
Material Description
Material Composition
Certification @ time of testing
Kit 
Initial Weight 1
Initial Weight 2
Initial Weight 3
Average Initial Weight, g
Item Dimensions Compiled
Item Capacity, mL


In [5]:
items['Start Weight'] = items['Average Initial Weight, g']

In [6]:
items_cols = [
    'Item ID',
    'Item Name',
    'Item Description Refined',
    'Material Class I',
    'Material Class II',
    'Material Class III',
    'Start Weight'
    ]

In [7]:
items_clean = items[items_cols]
items_clean.head(2)

Unnamed: 0,Item ID,Item Name,Item Description Refined,Material Class I,Material Class II,Material Class III,Start Weight
0,,Cellulose bag 5x7in,BESICS Cellulose bag 5x7in,Biopolymer,Biopolymer Film/Bag,Cellulose,2.5
1,,PBAT Bin Liner 3 gallon,3Gallon Food Scrap Bag - World Centric,Biopolymer,Biopolymer Film/Bag,PBAT and corn starch,7.156667


In [8]:
# ITEMS_SAVE_PATH = ""
# items.to_csv(ITEMS_SAVE_PATH, index=False)

### Clean Facilities

In [9]:
FACILITIES_PATH = DATA_FOLDER + "Compiled Facility Conditions for DSI - 2023 trials.xlsx"
facilities = pd.read_excel(FACILITIES_PATH, sheet_name=1, skiprows=1)
facilities.head(2)

Unnamed: 0,Facility ID CFTP,Trial ID CFTP,Public Trial ID,Trial Facility Name
0,45184,45184-01,WR004-01,Facility 1 (Windrow)
1,50361,50361-01,CASP005-01,Facility 2 (CASP)


### Clean Facility Observations

In [10]:
facility2id = {
    "Facility 1 (Windrow)": "WR004-01",
    "Facility 2 (CASP)": "CASP005-01",
    "Facility 3 (EASP)": "EASP001-01",
    "Facility 4 (In-Vessel)": "IV002-01",
    "Facility 5 (EASP)": "EASP002-01",
    "Facility 6 (CASP)": "CASP006-01",
    "Facility 7 (CASP)": "CASP004-02",
    "Facility 8 (ASP)": "ASP001-01",
    "Facility 9 (EASP)": "EASP003-01", 
    "Facility 10 (Windrow)": "WR005-01" 
}

### Clean Trial Observations

In [11]:
TEN_TRIALS_PATH = DATA_FOLDER + "Compiled Field Results  for DSI - 2023 Bulk 10 Trial Data.xlsx"
observations_weight = pd.read_excel(TEN_TRIALS_PATH, sheet_name=3, skiprows=2)
observations_sa = pd.read_excel(TEN_TRIALS_PATH, sheet_name=4, skiprows=2)
observations_sa.head(2)

Unnamed: 0,Facility Name,Trial Stage,Bag Set,Bag Number,N,O,Q,V,B,D,...,K,K1,K2,K3,N.1,O.1,P,Q.1,S,V.1
0,Facility 1 (Windrow),First Removal,A (blue),10,,,,,0.244,0.039,...,,0.618,0.233,0.225,,,,,,
1,Facility 1 (Windrow),First Removal,A (blue),6,,,,,0.075,0.237,...,,0.579,0.023,0.197,,,,,,


In [12]:
# only use second removal
weight = observations_weight[observations_weight['Trial Stage'] == "Second Removal"]
area = observations_sa[observations_sa['Trial Stage'] == "Second Removal"]

In [13]:
weight_melted = weight.melt(id_vars=['Facility Name', 'Trial Stage', 'Bag Set', 'Bag Number'],
                    value_vars=['N', 'O', 'Q', 'V', 'B', 'D', 'H', 'I', 'J', 'K', 'K1', 'K2', 'K3', 'N', 'O', 'P', 'Q', 'S', 'V'],
                    var_name='Item ID',
                    value_name='% Residuals (Weight)')
weight_melted = weight_melted.dropna(subset=['% Residuals (Weight)']).reset_index(drop=True)
weight_melted.head(2)

Unnamed: 0,Facility Name,Trial Stage,Bag Set,Bag Number,Item ID,% Residuals (Weight)
0,Facility 1 (Windrow),Second Removal,B (green),1,N,0.0
1,Facility 1 (Windrow),Second Removal,B (green),2,N,1.002848


In [14]:
area_melted = area.melt(id_vars=['Facility Name', 'Trial Stage', 'Bag Set', 'Bag Number'],
                    value_vars=['N', 'O', 'Q', 'V', 'B', 'D', 'H', 'I', 'J', 'K', 'K1', 'K2', 'K3', 'N', 'O', 'P', 'Q', 'S', 'V'],
                    var_name='Item ID',
                    value_name='% Residuals (Area)')
area_melted = area_melted.dropna(subset=['% Residuals (Area)']).reset_index(drop=True)
area_melted.head(2)

Unnamed: 0,Facility Name,Trial Stage,Bag Set,Bag Number,Item ID,% Residuals (Area)
0,Facility 1 (Windrow),Second Removal,B (green),1,N,0.0
1,Facility 1 (Windrow),Second Removal,B (green),2,N,0.720347


In [15]:
observations = pd.merge(weight_melted, area_melted, on=['Facility Name', 'Trial Stage', 'Bag Set', 'Bag Number', 'Item ID'], how='outer')
observations.head(2)

Unnamed: 0,Facility Name,Trial Stage,Bag Set,Bag Number,Item ID,% Residuals (Weight),% Residuals (Area)
0,Facility 1 (Windrow),Second Removal,B (green),1,N,0.0,0.0
1,Facility 1 (Windrow),Second Removal,B (green),2,N,1.002848,0.720347


In [16]:
# TODO: We have some missing observations
len(observations), len(weight_melted), len(area_melted)

(787, 781, 772)

In [17]:
observations[observations['% Residuals (Weight)'].isna() | observations['% Residuals (Area)'].isna()]

Unnamed: 0,Facility Name,Trial Stage,Bag Set,Bag Number,Item ID,% Residuals (Weight),% Residuals (Area)
571,Facility 9 (EASP),Second Removal,A (blue),1,K2,0.052,
572,Facility 9 (EASP),Second Removal,A (blue),3,K2,0.134,
573,Facility 9 (EASP),Second Removal,A (blue),5,K2,0.148,
574,Facility 9 (EASP),Second Removal,A (blue),7,K2,0.048,
575,Facility 9 (EASP),Second Removal,A (blue),8,K2,0.075,
576,Facility 9 (EASP),Second Removal,B (green),10,K2,0.04,
577,Facility 9 (EASP),Second Removal,B (green),3,K2,0.043,
578,Facility 9 (EASP),Second Removal,B (green),7,K2,0.059,
579,Facility 9 (EASP),Second Removal,B (green),8,K2,0.166,
580,Facility 9 (EASP),Second Removal,B (green),9,K2,0.124,


### Join With Items

In [46]:
joined = pd.merge(items, observations, on="Item ID")
joined.head(2)

Unnamed: 0,Item ID,Item Format,Brand,Manufacturer,Item Name,Item Description Refined,Item Description From Trial,Material Class I,Material Class II,Material Class III,...,"Average Initial Weight, g",Item Dimensions Compiled,"Item Capacity, mL",Start Weight,Facility Name,Trial Stage,Bag Set,Bag Number,% Residuals (Weight),% Residuals (Area)
0,Q,Bowl,BÉSICS®,GQPP,PLA-lined Bagasse Bowl 300mL,BESICS 250mL PLA-lined Bagasse Leaf Bowl,"PLA-lined fibre bowl, white",Fiber,Lined Fiber,PLA lined Bagasse,...,10.88,5.5’’ x 5.5’’ x 2’’,300.0,10.88,Facility 1 (Windrow),Second Removal,B (green),1,0.734354,0.52637
1,Q,Bowl,BÉSICS®,GQPP,PLA-lined Bagasse Bowl 300mL,BESICS 250mL PLA-lined Bagasse Leaf Bowl,"PLA-lined fibre bowl, white",Fiber,Lined Fiber,PLA lined Bagasse,...,10.88,5.5’’ x 5.5’’ x 2’’,300.0,10.88,Facility 1 (Windrow),Second Removal,B (green),2,0.0,0.0


### Create Visualizations

In [88]:
import plotly.graph_objects as go
import numpy as np
import matplotlib.colors as mcolors

class2color = {
    'Positive Control': '#70AD47',
    'Mixed Materials': '#48646A',
    'Fiber': '#298FC2',
    'Biopolymer': '#FFB600',
}

In [20]:
joined['Material Class I'].unique(), joined['Material Class II'].unique()

(array(['Fiber', 'Biopolymer', 'Positive Control', 'Mixed Materials'],
       dtype=object),
 array(['Lined Fiber', 'Rigid biopolymer', 'Rigid Biopolymer (> 0.75mm)',
        'Positive Control - Fiber', 'Positive Control - Food Scraps',
        'Positive Control - Film', 'Mixed Materials', 'Unlined Fiber',
        'Rigid Biopolymer (< 0.75mm)', 'Biopolymer Film/Bag'], dtype=object))

In [133]:
class_I_order = ['Fiber', 'Biopolymer', 'Mixed Materials', 'Positive Control']
class_II_order = ['Unlined Fiber', 'Lined Fiber', 'Biopolymer Film/Bag', 'Rigid Biopolymer (< 0.75mm)', 'Rigid Biopolymer (> 0.75mm)', 'Rigid biopolymer', 'Mixed Materials', 'Positive Control - Fiber', 'Positive Control - Food Scraps', 'Positive Control - Film']

In [134]:
def box_and_whisker(df_input, column, class_I=None, cap=False, height=800, width=1000):
    df = df_input.copy() # prevent modifying actual dataframe
    
    data = []
    x_labels = []

    if cap:
        df[column] = df[column].clip(upper=1)
    if class_I:
        df = df[df['Material Class I'] == class_I]
        
    max_value = df[column].max()

    for class_II in class_II_order:
        group = df[df['Material Class II'] == class_II]
        if not group.empty:
            count = group[column].count()
            class_I_name = group['Material Class I'].iloc[0]
            color = class2color.get(class_I_name, '#000')
            trace = go.Box(y=group[column], name=class_II, boxpoints='outliers', marker_color=color, width=.2)
            data.append(trace)
            x_labels.append(f"{class_II}<br>n={count}")

    layout = go.Layout( 
        title_font=dict(size=14, family='Roboto'),
        font=dict(family='Roboto', size=11),
        height=height,
        width=width,
        showlegend=False,
        xaxis=dict(
            tickmode='array',
            tickvals=list(range(len(x_labels))),
            ticktext=x_labels,
            title_font=dict(size=14),
            tickfont=dict(size=11),
            tickangle=90
        ),
        yaxis=dict(
            title=column,
            tickformat=".0%",
            tickmode='array',
            tickvals=np.arange(0, max_value, 0.25),  # Adjust this range if your data is not percentage-based
            title_font=dict(size=12),
            tickfont=dict(size=9),
        ),
    )

    fig = go.Figure(data=data, layout=layout)
    fig.show()


In [135]:
def residuals_bar(df_input, class_I=None, cap=False, height=800, width=1000):
    df = df_input.copy() # prevent modifying actual dataframe
    
    # Create weight columns
    column = '% Residuals (Weight)'
    df['End Weight'] = df[column] * df['Start Weight']

    data = []
    x_labels = []

    if cap:
        df[column] = df[column].clip(upper=1)
    if class_I:
        df = df[df['Material Class I'] == class_I]

    # Weird setup to handle null start weights and still plot correctly
    filtered_class_II_order = [class_II for class_II in class_II_order if not df[df['Material Class II'] == class_II]['Start Weight'].isna().all()]
    x_positions = np.arange(len(filtered_class_II_order))  # Numeric x-axis positions

    for i, class_II in enumerate(filtered_class_II_order):
        group = df[df['Material Class II'] == class_II]
        if not group.empty:
            count = group[column].count()
            class_I_name = group['Material Class I'].iloc[0]
            color = class2color.get(class_I_name, '#000')
            if not group['Start Weight'].isna().all():
                trace_start = go.Bar(x=[x_positions[i] - 0.2], y=[group['Start Weight'].sum()], marker_color=color, width=.3)
                trace_end = go.Bar(x=[x_positions[i] + 0.2], y=[group['End Weight'].sum()], marker_color=color, width=.3, opacity=.6)
                data.append(trace_start)
                data.append(trace_end)
                x_labels.append(f"{class_II}<br>n={count}")

    layout = go.Layout(
        barmode='group',
        title_font=dict(size=14, family='Roboto'),
        font=dict(family='Roboto', size=11),
        height=height,
        width=width,
        showlegend=False,
        xaxis=dict(
            tickmode='array',
            tickvals=list(range(len(x_labels))),
            ticktext=x_labels,
            title_font=dict(size=14),
            tickfont=dict(size=11),
            tickangle=90
        ),
        yaxis=dict(
            title="Total Weight in Grams (Start and End)",
            # tickformat=".0%",  # Formats the tick labels as percentages
            tickmode='array',
            title_font=dict(size=12),
            tickfont=dict(size=9),
        ),
    )

    fig = go.Figure(data=data, layout=layout)
    fig.show()

In [136]:
def residuals_bar(df_input, class_I=None, cap=False, height=800, width=1000):
    df = df_input.copy() # prevent modifying actual dataframe
    
    # Create weight columns
    column = '% Residuals (Weight)'
    df['End Weight'] = df[column] * df['Start Weight']

    data = []
    x_labels = []

    if cap:
        df[column] = df[column].clip(upper=1)
    if class_I:
        df = df[df['Material Class I'] == class_I]

    # Weird setup to handle null start weights and still plot correctly
    filtered_class_II_order = [class_II for class_II in class_II_order if not df[df['Material Class II'] == class_II]['Start Weight'].isna().all()]
    x_positions = np.arange(len(filtered_class_II_order))  # Numeric x-axis positions

    for i, class_II in enumerate(filtered_class_II_order):
        group = df[df['Material Class II'] == class_II]
        if not group.empty:
            count = group[column].count()
            class_I_name = group['Material Class I'].iloc[0]
            color = class2color.get(class_I_name, '#000')
            if not group['Start Weight'].isna().all():
                trace_start = go.Bar(x=[x_positions[i] - 0.2], y=[group['Start Weight'].sum()], marker_color=color, width=.3)
                trace_end = go.Bar(x=[x_positions[i] + 0.2], y=[group['End Weight'].sum()], marker_color=color, width=.3, opacity=.6)
                data.append(trace_start)
                data.append(trace_end)
                x_labels.append(f"{class_II}<br>n={count}")

    layout = go.Layout(
        barmode='group',
        title_font=dict(size=14, family='Roboto'),
        font=dict(family='Roboto', size=11),
        height=height,
        width=width,
        showlegend=False,
        xaxis=dict(
            tickmode='array',
            tickvals=list(range(len(x_labels))),
            ticktext=x_labels,
            title_font=dict(size=14),
            tickfont=dict(size=11),
            tickangle=90
        ),
        yaxis=dict(
            title="Total Weight in Grams (Start and End)",
            tickmode='array',
            title_font=dict(size=12),
            tickfont=dict(size=9),
        ),
    )

    fig = go.Figure(data=data, layout=layout)
    fig.show()

In [142]:
def mean_residuals_bar(df_input, class_I=None, cap=False, height=800, width=1000):
    df = df_input.copy()  # prevent modifying actual dataframe
    
    column = '% Residuals (Weight)'  # Column for mean residuals

    if cap:
        df[column] = df[column].clip(upper=1)
    if class_I:
        df = df[df['Material Class I'] == class_I]

    data = []
    x_labels = []
    x_positions = np.arange(len(class_II_order))  # Numeric x-axis positions

    for i, class_II in enumerate(class_II_order):
        group = df[df['Material Class II'] == class_II]
        if not group.empty:
            mean_residual = group[column].mean()
            class_I_name = group['Material Class I'].iloc[0]
            color = class2color.get(class_I_name, '#000')

            trace = go.Bar(x=[x_positions[i]], y=[mean_residual], marker_color=color, name=class_II, width=.3)
            data.append(trace)
            x_labels.append(f"{class_II}<br>n={len(group)}")

    layout = go.Layout(
        barmode='group',
        title_font=dict(size=14, family='Roboto'),
        font=dict(family='Roboto', size=11),
        height=height,
        width=width,
        showlegend=False,
        xaxis=dict(
            tickmode='array',
            tickvals=x_positions,
            ticktext=x_labels,
            title_font=dict(size=14),
            tickfont=dict(size=11),
            tickangle=90
        ),
        yaxis=dict(
            title="Mean % Residuals (Weight)",
            tickformat=".0%",
            tickmode='array',
            title_font=dict(size=12),
            tickfont=dict(size=9),
        ),
    )

    fig = go.Figure(data=data, layout=layout)
    fig.show()

In [143]:
joined['Material Class I'].unique()

array(['Fiber', 'Biopolymer', 'Positive Control', 'Mixed Materials'],
      dtype=object)

In [144]:
column = '% Residuals (Weight)'
# column = '% Residuals (Area)'
box_and_whisker(joined, column)
residuals_bar(joined)
mean_residuals_bar(joined)