## 1. Initial setup

In [30]:
import os

os.chdir('/Users/cecilia/2024-winter-compostable')


In [31]:
import pandas as pd

all_sheets = pd.read_excel(
    'data/compiled_results/facility_condition/Donated Data 2023 - Compiled Facility Conditions for DSI.xlsx',
    sheet_name=None,
    skiprows=1 # most of the sheets have one empty row for condition names
)

In [32]:
# show all sheet names for later use
list(all_sheets.keys())

['Index',
 'Facility ID Correlation',
 'TrialDuration',
 ' Temperature',
 'Moisture In Field Table',
 'MoistureInFieldandLab',
 'O2 in field',
 'BD in field',
 ' BD field and lab',
 'OP12a - CN',
 'OP13a - pH',
 'Lab-Abridged-AllStages',
 'Lab-Full-SecondRemoval']

## 2. Data cleaning and reshape for each sheet

In [33]:
def clean_column_names(column):
    column = column.lower()
    column = column.replace(' ', '_')
    column = column.replace('/', '_per_')
    column = column.replace(':', '_to_')
    for char in ['(', ')', '*', '-', '&', '?', '\n', '#', '.']:
        column = column.replace(char, '')
    
    # remove leading and trailing underscore
    column = column.strip('_')

    return column

### 1. facility_id table

In [34]:
facility_id = all_sheets['Facility ID Correlation']
facility_id.columns = [clean_column_names(col) for col in facility_id.columns]
facility_id['trial_facility_name'] = facility_id['trial_facility_name'].str.split().str[0:2].str.join(' ')

facility_id.head()

Unnamed: 0,facility_id_cftp,trial_id_cftp,public_trial_id,trial_facility_name
0,45184,45184-01,WR004-01,Facility 1
1,50361,50361-01,CASP005-01,Facility 2
2,51137,51137-01,EASP001-01,Facility 3
3,51267,51267-01,IV002-01,Facility 4
4,53183,53183-01,EASP002-01,Facility 5


### 2. trail_duration table

In [35]:
# skip empty rows (I only skip 1 row when reading the excel, but trial_duration table starts from row 3)
column_names = all_sheets['TrialDuration'].iloc[1]
trial_duration = all_sheets['TrialDuration'].loc[2:]
trial_duration.columns = column_names
trial_duration.columns = [clean_column_names(col) for col in trial_duration.columns]
trial_duration['facility_designation'] = trial_duration['facility_designation'].str.split().str[0:2].str.join(' ')

trial_duration.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trial_duration['facility_designation'] = trial_duration['facility_designation'].str.split().str[0:2].str.join(' ')


Unnamed: 0,facility_designation,midpoint_analysis,endpoint_analysis_trial_length,typical_process_from_start_to_sale,%_trial_length_to_typical_process
2,Facility 1,43,67,85.0,0.788235
3,Facility 2,45,71,75.0,0.946667
4,Facility 3,45,72,72.5,0.993103
5,Facility 4,19,49,50.0,0.98
6,Facility 5,48,93,140.0,0.664286


### 3. condition tables that measured daily

- Only *temperature* was measured daily.

In [36]:
temperature = all_sheets[' Temperature']
temperature.columns = temperature.columns.str.replace('*', '', regex=False) # remove * in facility name in order to better merge with other dataset

temperature.head()

Unnamed: 0,Day #,Facility 1 (Windrow),Facility 2 (CASP),Facility 3 (EASP),Facility 4 (In-Vessel),Facility 5 (EASP),Facility 6 (CASP),Facility 7 (CASP),Facility 8 (ASP),Facility 9 (EASP),Facility 10 (Windrow)
0,1,112.8,128.8,92.163333,103.625,89.166667,103.0,172.352,127.8,120.4,146.5
1,2,117.0,137.6,133.09,118.25,114.666667,,169.172,133.8,116.2,136.0
2,3,125.2,140.8,116.846667,117.875,129.666667,109.0,164.956,136.4,124.2,141.0
3,4,128.0,,156.8,118.428571,,,161.138,,,142.0
4,5,,,161.97,118.0,,,144.674,,,142.333333


- As can be seen above, the current table is a wide table. 

In [37]:
import numpy as np

temperature_long = pd.melt(temperature, id_vars='Day #', var_name='facility_name', value_name='temperature_f')
temperature_long.columns = [clean_column_names(col) for col in temperature_long.columns]
temperature_long['facility_name'] = temperature_long['facility_name'].str.split().str[0:2].str.join(' ') # standardize the name for later merge

temperature_long.head()


Unnamed: 0,day,facility_name,temperature_f
0,1,Facility 1,112.8
1,2,Facility 1,117.0
2,3,Facility 1,125.2
3,4,Facility 1,128.0
4,5,Facility 1,


- Change the wide table into long one: each facility each day has an obervation for temperation, which makes it easier to merge the condition tables with mass and surface area change for the 10 trials.

In [38]:
temperature_long['condition'] = 'temperature'
temperature_long['unit'] = 'F'
temperature_long['method'] = 'field'
temperature_long['trial_stage'] = pd.NA
temperature_long.rename(columns={'temperature_f': 'value'}, inplace=True)

temperature_long.head()

Unnamed: 0,day,facility_name,value,condition,unit,method,trial_stage
0,1,Facility 1,112.8,temperature,F,field,
1,2,Facility 1,117.0,temperature,F,field,
2,3,Facility 1,125.2,temperature,F,field,
3,4,Facility 1,128.0,temperature,F,field,
4,5,Facility 1,,temperature,F,field,


### 4. condition tables that measured weekly

- In-field *Moisture* and *O2* are measured weekly.

    Although lab result observations for *moisture*, *bulk density*, *C:N ratio*, and *pH* are recorded on separate sheets for different weeks, each facility only has three observations corresponding to different trial stages. So I only keep these conditions in stage-specific tables.

In [39]:
def load_wide_transform_long(sheetname, id_vars, var_name, value_name, end_row=14):
    condition = all_sheets[sheetname].iloc[:end_row]
    condition.columns = condition.columns.str.replace('*', '', regex=False)
    condition_long = pd.melt(condition, id_vars=id_vars, var_name=var_name, value_name=value_name)
    condition_long.columns = [clean_column_names(col) for col in condition_long.columns]

    return condition_long


##### moisture_field

In [40]:
moisture_field_long = load_wide_transform_long('Moisture In Field Table', 'Week', 'facility_name', '%_moisture_field')
moisture_field_long.head()

Unnamed: 0,week,facility_name,%_moisture_field
0,1,Facility 1,0.5
1,2,Facility 1,0.5
2,3,Facility 1,0.5
3,4,Facility 1,0.5
4,5,Facility 1,0.5


##### o2_field

In [41]:
o2_field_long = load_wide_transform_long('O2 in field', 'Week', 'facility_name', '%_o2_field')
o2_field_long['week'] = o2_field_long['week'].astype(int)
o2_field_long.head()

Unnamed: 0,week,facility_name,%_o2_field
0,1,Facility 1,0.075
1,2,Facility 1,0.06
2,3,Facility 1,0.07
3,4,Facility 1,0.06
4,5,Facility 1,0.07


##### Merge the two together

In [42]:
merged_weekly_condition = pd.merge(moisture_field_long, o2_field_long, on=['week', 'facility_name'])
merged_weekly_condition.head()

Unnamed: 0,week,facility_name,%_moisture_field,%_o2_field
0,1,Facility 1,0.5,0.075
1,2,Facility 1,0.5,0.06
2,3,Facility 1,0.5,0.07
3,4,Facility 1,0.5,0.06
4,5,Facility 1,0.5,0.07


In [43]:
merged_weekly_condition_long = pd.melt(merged_weekly_condition, id_vars=['week', 'facility_name'], var_name='condition', value_name='value')

# Replace condition names
condition_mapping = {
    '%_moisture_field': 'moisture',
    '%_o2_field': 'o2'
}
merged_weekly_condition_long['condition'] = merged_weekly_condition_long['condition'].replace(condition_mapping)

merged_weekly_condition_long['day'] = (merged_weekly_condition_long['week'] - 1) * 7 + 1
merged_weekly_condition_long['unit'] = '%'
merged_weekly_condition_long['method'] = 'field'
merged_weekly_condition_long['trial_stage'] = pd.NA
merged_weekly_condition_long = merged_weekly_condition_long.drop('week', axis=1)

merged_weekly_condition_long.head()

Unnamed: 0,facility_name,condition,value,day,unit,method,trial_stage
0,Facility 1,moisture,0.5,1,%,field,
1,Facility 1,moisture,0.5,8,%,field,
2,Facility 1,moisture,0.5,15,%,field,
3,Facility 1,moisture,0.5,22,%,field,
4,Facility 1,moisture,0.5,29,%,field,


### 5. condition tables that measured at 3 stages

##### field bulk density

In [44]:
BD_field = all_sheets[' BD field and lab']
BD_field = BD_field[['Facility Name', 'Timeframe', 'Bulk Density (BD) in-field (lbs/cu. yard)']]
BD_field=BD_field.rename(columns = {'Facility Name':'facility_name',
                                    'Timeframe':'trial_stage',
                                    'Bulk Density (BD) in-field (lbs/cu. yard)':'bulk_density'})

BD_field.head()

Unnamed: 0,facility_name,trial_stage,bulk_density
0,Facility 1,Loading,826.8
1,Facility 1,First Removal,1350.0
2,Facility 1,Second Removal,1410.0
3,Facility 2,Loading,810.0
4,Facility 2,First Removal,852.0


In [45]:
BD_field['condition'] = 'bulk_density'
BD_field['unit'] = 'lbs/cu. yard'
BD_field['method'] = 'field'
BD_field['day'] = pd.NA
BD_field.rename(columns={'bulk_density': 'value'}, inplace=True)

BD_field.head()

Unnamed: 0,facility_name,trial_stage,value,condition,unit,method,day
0,Facility 1,Loading,826.8,bulk_density,lbs/cu. yard,field,
1,Facility 1,First Removal,1350.0,bulk_density,lbs/cu. yard,field,
2,Facility 1,Second Removal,1410.0,bulk_density,lbs/cu. yard,field,
3,Facility 2,Loading,810.0,bulk_density,lbs/cu. yard,field,
4,Facility 2,First Removal,852.0,bulk_density,lbs/cu. yard,field,


##### lab data for all stages

In [46]:
lab_data_all_stages = all_sheets['Lab-Abridged-AllStages']
lab_data_all_stages.columns = [clean_column_names(col) for col in lab_data_all_stages.columns]
lab_data_all_stages=lab_data_all_stages.rename(columns = {'facility_number_new':'facility_name'})

lab_data_all_stages.head()

Unnamed: 0,facility_name,trial_stage,%_total_n_as_rcvd,%_total_n_dry_wt,%_organic_c_as_rcvd,%_organic_c_dry_wt,c_to_n_dry_wt,bulk_density_lbs_per_yd_as_rcvd,%_solids_as_rcvd,%_moisture_as_rcvd,ph_as_rcvd,ph_dry_wt,stability,co2_evolution_1_mg_co2c_per_g_om_per_day,co2_evolution_2_mg_co2c_per_g_ts_per_day,overall_notes
0,Facility 1,First Removal,0.0045,0.0068,0.108,0.163,24.0,1382.0,0.665,0.335,7.6,,Very Stable,0.4,0.2,
1,Facility 2,First Removal,0.0096,0.0146,0.251,0.382,26.0,652.0,0.657,0.343,7.9,,Stable,1.8,2.8,
2,Facility 3,First Removal,,,,,,,,,,,,,,45 day sample was lost by mail carrier
3,Facility 4,First Removal,0.0072,0.0135,0.224,0.421,31.0,832.0,0.532,0.468,5.0,,Stable,3.8,10.9,*18 day removal
4,Facility 5,First Removal,0.0074,0.0169,0.141,0.325,19.0,963.0,0.435,0.565,6.0,,Stable,3.9,8.2,


In [47]:
lab_data_all_stages['trial_stage'].unique()

array(['First Removal', 'Loading', 'Second Removal'], dtype=object)

- Note that loading means start.

In [48]:
# lab_data_all_stages = lab_data_all_stages.drop('overall_notes', axis=1) # drop notes
lab_data_all_stages_long = pd.melt(lab_data_all_stages, id_vars=['facility_name', 'trial_stage'], var_name='condition', value_name='value')

# Replace condition names
lab_data_all_stages_long['condition'] = lab_data_all_stages_long['condition'].str.replace(r'^%_', '', regex=True)
condition_mapping2 = {
    'bulk_density_lbs_per_yd_as_rcvd': 'bulk_density',
    'co2_evolution_1_mg_co2c_per_g_om_per_day': 'co2_evolution_1mg_co2_to_c',
    'co2_evolution_2_mg_co2c_per_g_ts_per_day': 'co2_eolution_2mg_co2_to_c'
}
lab_data_all_stages_long['condition'] = lab_data_all_stages_long['condition'].replace(condition_mapping2)

# Add unit to each condition
def determine_unit(condition):
    if condition in ('total_n_as_rcvd', 'total_n_dry_wt', 'organic_c_as_rcvd', 'organic_c_dry_wt', 'solids_as_rcvd', 'moisture_as_rcvd'):
        return '%'  
    elif condition == 'bulk_density':
        return 'lbs/cu.yard' 
    elif condition == 'co2_evolution_1mg_co2_to_c':
        return 'g OM/day'  
    elif condition == 'co2_eolution_2mg_co2_to_c':
        return 'g TS/day'  
    else:
        return pd.NA

lab_data_all_stages_long['unit'] = lab_data_all_stages_long['condition'].apply(determine_unit)

lab_data_all_stages_long['method'] = 'lab'
lab_data_all_stages_long['day'] = pd.NA

lab_data_all_stages_long


Unnamed: 0,facility_name,trial_stage,condition,value,unit,method,day
0,Facility 1,First Removal,total_n_as_rcvd,0.0045,%,lab,
1,Facility 2,First Removal,total_n_as_rcvd,0.0096,%,lab,
2,Facility 3,First Removal,total_n_as_rcvd,,%,lab,
3,Facility 4,First Removal,total_n_as_rcvd,0.0072,%,lab,
4,Facility 5,First Removal,total_n_as_rcvd,0.0074,%,lab,
...,...,...,...,...,...,...,...
415,Facility 6,Second Removal,overall_notes,,,lab,
416,Facility 7,Second Removal,overall_notes,,,lab,
417,Facility 8,Second Removal,overall_notes,,,lab,
418,Facility 9,Second Removal,overall_notes,,,lab,


## 3. Add CASP004-01 trial condition

In [49]:
casp4 = pd.read_csv('data/CASP004-01/masterfile.csv')
casp4_trial_condition = casp4[['trial_ID', 'temp_F', 'weather', 'moisture_%', 
                               'bulk_density', 'C_to_N_ratio', 'maturity']]

casp4_trial_condition.rename(columns={'trial_ID': 'trial_id',
                                      'temp_F': 'temperature',
                                      'moisture_%': 'moisture',
                                      'C_to_N_ratio': 'c_to_n'}, 
                                      inplace=True)

# Only keep 1 row because all data are repetitive
casp4_trial_condition = casp4_trial_condition.head(1)
casp4_trial_condition['trial_id'] = 'CASP004-01'

casp4_trial_condition

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  casp4_trial_condition.rename(columns={'trial_ID': 'trial_id',


Unnamed: 0,trial_id,temperature,weather,moisture,bulk_density,c_to_n,maturity
0,CASP004-01,65-80,Windy,51.0,754 lbs/CY (manual) \n0.29 g/cc (lab),34.0,0.0


Note: Process bulk density first because it has two values in one column and has unit inside

In [50]:
bulk_density = casp4_trial_condition[['trial_id', 'bulk_density']]

trial_stages = ['Loading', 'First Removal', 'Second Removal']

# Function to process each row and create detailed rows for each trial stage
def process_row(row, stages):
    # Splitting the bulk_density into two parts: manual and lab
    parts = row['bulk_density'].split('\n')
    manual_part, lab_part = parts[0], parts[1]
    
    # Extracting value and unit for manual
    manual_value, manual_unit = manual_part.split(' ')[0], ' '.join(manual_part.split(' ')[1:-1])
    manual_method = "field"
    
    # Extracting value and unit for lab
    lab_value, lab_unit = lab_part.split(' ')[0], ' '.join(lab_part.split(' ')[1:-1])
    lab_method = "lab"
    
    new_rows = []
    for stage in stages:
        # Creating two new rows for each stage
        new_rows.extend([
            {'trial_id': row['trial_id'], 'condition': 'bulk_density', 
             'value': manual_value, 'method': manual_method, 'unit': manual_unit, 
             'trial_stage': stage, 'day': np.nan},
            {'trial_id': row['trial_id'], 'condition': 'bulk_density', 
             'value': lab_value, 'method': lab_method, 'unit': lab_unit, 
             'trial_stage': stage, 'day': np.nan}
        ])
    
    return new_rows

# Applying the function and expanding the DataFrame to include stages
expanded_rows = process_row(bulk_density.iloc[0], trial_stages)
expanded_bulk_density = pd.DataFrame(expanded_rows)

expanded_bulk_density

Unnamed: 0,trial_id,condition,value,method,unit,trial_stage,day
0,CASP004-01,bulk_density,754.0,field,lbs/CY (manual),Loading,
1,CASP004-01,bulk_density,0.29,lab,g/cc,Loading,
2,CASP004-01,bulk_density,754.0,field,lbs/CY (manual),First Removal,
3,CASP004-01,bulk_density,0.29,lab,g/cc,First Removal,
4,CASP004-01,bulk_density,754.0,field,lbs/CY (manual),Second Removal,
5,CASP004-01,bulk_density,0.29,lab,g/cc,Second Removal,


In [51]:
other_conditions = casp4_trial_condition.drop('bulk_density', axis=1)
other_conditions_long = pd.melt(other_conditions, id_vars=['trial_id'], var_name='condition', value_name='value')

def assign_method(condition):
    if condition in ['maturity', 'c_to_n']:
        return 'lab'
    elif condition in ['temperature', 'weather']:
        return 'field'
    else:
        return pd.NA

# Define a function to assign units based on condition
def assign_unit(condition):
    if condition == 'temperature':
        return 'F'
    else:
        return pd.NA

# Apply the functions to assign method and unit
other_conditions_long['method'] = other_conditions_long['condition'].apply(assign_method)
other_conditions_long['unit'] = other_conditions_long['condition'].apply(assign_unit)

# Iterate through each condition and trial stage
trial_stages = ['Loading', 'First Removal', 'Second Removal']
new_rows = []

for _, row in other_conditions_long.iterrows():
    for stage in trial_stages:
        new_row = row.copy()
        new_row['trial_stage'] = stage
        new_row['day'] = np.nan
        new_rows.append(new_row)

result_df = pd.DataFrame(new_rows).reset_index(drop=True)

result_df.head()

Unnamed: 0,trial_id,condition,value,method,unit,trial_stage,day
0,CASP004-01,temperature,65-80,field,F,Loading,
1,CASP004-01,temperature,65-80,field,F,First Removal,
2,CASP004-01,temperature,65-80,field,F,Second Removal,
3,CASP004-01,weather,Windy,field,,Loading,
4,CASP004-01,weather,Windy,field,,First Removal,


## 4. Combine condition tables for 10 trials and CASP004-01

In [52]:
combined_10_trials = pd.concat([temperature_long, merged_weekly_condition_long, BD_field, lab_data_all_stages_long])

# Add trail_id based on facility name
id_matching = facility_id[['public_trial_id', 'trial_facility_name']]
combined_10_trials = pd.merge(combined_10_trials, id_matching, left_on='facility_name', right_on='trial_facility_name', how='left')

combined_10_trials = combined_10_trials.drop(['facility_name', 'trial_facility_name'], axis=1)
combined_10_trials.rename(columns={'public_trial_id': 'trial_id'}, inplace=True)
new_order = ['trial_id', 'trial_stage', 'day', 'condition', 'value', 'unit', 'method']
combined_10_trials = combined_10_trials[new_order]

combined_10_trials

Unnamed: 0,trial_id,trial_stage,day,condition,value,unit,method
0,WR004-01,,1,temperature,112.8,F,field
1,WR004-01,,2,temperature,117.0,F,field
2,WR004-01,,3,temperature,125.2,F,field
3,WR004-01,,4,temperature,128.0,F,field
4,WR004-01,,5,temperature,,F,field
...,...,...,...,...,...,...,...
1695,CASP006-01,Second Removal,,overall_notes,,,lab
1696,CASP004-02,Second Removal,,overall_notes,,,lab
1697,ASP001-01,Second Removal,,overall_notes,,,lab
1698,EASP003-01,Second Removal,,overall_notes,,,lab


In [53]:
# include CASP004-01
combined_all = pd.concat([combined_10_trials, expanded_bulk_density, result_df])
combined_all.to_csv('/Users/cecilia/Documents/GitHub/2024-winter-compostable/data/finalized_datasets/trial_conditions.csv', index=False)

combined_all

OSError: Cannot save file into a non-existent directory: '/Users/cecilia/Documents/GitHub/2024-winter-compostable/data/finalized_datasets'

## 5. Add facility table

In [None]:
facility = facility_id[['public_trial_id']]
facility['facility_name'] = facility['public_trial_id'].str.split('-').str[0]
facility = facility[['facility_name']]

# manually add 5 facilities
new_rows = pd.DataFrame({'facility_name': ['AD001', 
                                           'WR001',
                                           'CASP001',
                                           'CASP003',
                                           'WR003']})
facility = pd.concat([facility, new_rows], ignore_index=True)

facility['facility_id'] = range(1, len(facility) + 1)
facility = facility[['facility_id', 'facility_name']]

facility.to_csv('/Users/cecilia/Desktop/2024-winter-compostable/data/finalized_datasets/facilities.csv', index=False)

facility
# Note: no need to add CASP004 because it is already included.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facility['facility_name'] = facility['public_trial_id'].str.split('-').str[0]


Unnamed: 0,facility_id,facility_name
0,1,WR004
1,2,CASP005
2,3,EASP001
3,4,IV002
4,5,EASP002
5,6,CASP006
6,7,CASP004
7,8,ASP001
8,9,EASP003
9,10,WR005


## 6. Add trial table

In [57]:
trial = pd.merge(trial_duration, id_matching, left_on='facility_designation', right_on='trial_facility_name', how='left')
trial = trial.drop(['facility_designation', 'trial_facility_name'], axis=1)
trial.rename(columns={'public_trial_id': 'trial_name'}, inplace=True)
trial['facility_name'] = trial['trial_name'].str.split('-').str[0]
trial['trial_id'] = range(1, len(trial) + 1)
trial = pd.merge(trial, facility, on='facility_name', how='left')


new_order = ['trial_id', 'trial_name', 'facility_id'] + [col for col in trial.columns if col not in ['trial_id', 'trial_name', 'facility_id']]
trial = trial[new_order]

trial = trial.drop('facility_name', axis=1)

trial

Unnamed: 0,trial_id,trial_name,facility_id,midpoint_analysis,endpoint_analysis_trial_length,typical_process_from_start_to_sale,%_trial_length_to_typical_process
0,1,WR004-01,1,43,67,85.0,0.788235
1,2,CASP005-01,2,45,71,75.0,0.946667
2,3,EASP001-01,3,45,72,72.5,0.993103
3,4,IV002-01,4,19,49,50.0,0.98
4,5,EASP002-01,5,48,93,140.0,0.664286
5,6,CASP006-01,6,44,90,100.0,0.9
6,7,CASP004-02,7,44,68,62.0,1.096774
7,8,ASP001-01,8,45,90,180.0,0.5
8,9,EASP003-01,9,45,90,120.0,0.75
9,10,WR005-01,10,46,95,180.0,0.527778


In [58]:
# Add CASO004-01
new_row_data = {
    'trial_id': [11],
    'trial_name': ['CASP004-01'], 
    'facility_id': [7],  
    'midpoint_analysis': [pd.NA], 
    'endpoint_analysis_trial_length': [pd.NA], 
    'typical_process_from_start_to_sale': [pd.NA],  
    '%_trial_length_to_typical_process': [pd.NA] 
}
new_row_df = pd.DataFrame(new_row_data)

trial = pd.concat([trial, new_row_df], ignore_index=True)

trial.to_csv('/Users/cecilia/Desktop/2024-winter-compostable/data/finalized_datasets/trials.csv', index=False)

trial

Unnamed: 0,trial_id,trial_name,facility_id,midpoint_analysis,endpoint_analysis_trial_length,typical_process_from_start_to_sale,%_trial_length_to_typical_process
0,1,WR004-01,1,43.0,67.0,85.0,0.788235
1,2,CASP005-01,2,45.0,71.0,75.0,0.946667
2,3,EASP001-01,3,45.0,72.0,72.5,0.993103
3,4,IV002-01,4,19.0,49.0,50.0,0.98
4,5,EASP002-01,5,48.0,93.0,140.0,0.664286
5,6,CASP006-01,6,44.0,90.0,100.0,0.9
6,7,CASP004-02,7,44.0,68.0,62.0,1.096774
7,8,ASP001-01,8,45.0,90.0,180.0,0.5
8,9,EASP003-01,9,45.0,90.0,120.0,0.75
9,10,WR005-01,10,46.0,95.0,180.0,0.527778


## 7. Other condition table (not included)

##### more detailed lab data on the last stage

In [None]:
lab_data_detailed_end = all_sheets['Lab-Full-SecondRemoval']
lab_data_detailed_end.columns = [clean_column_names(col) for col in lab_data_detailed_end.columns]
lab_data_detailed_end=lab_data_detailed_end.rename(columns = {'facility_number_new':'facility_name',
                                                              'pass_per_fail':'pass_vs_fail',
                                                              '3_per_4__%_passing':'3_divide_4_%_passing',
                                                              '5_per_8__%_passing':'5_divide_8_%_passing',
                                                              '1_per_2__%_passing':'1_divide_2_%_passing',
                                                              '3_per_8__%_passing':'3_divide_8_%_passing',
                                                              '1_per_4__%_passing':'1_divide_4_%_passing'
                                                              }
                                                    )

# change 'end of curing' in the original data set into 'second removal' in order to merge using trial stage later
lab_data_detailed_end['trial_stage'] = lab_data_detailed_end['trial_stage'].replace('End of Curing', 'Second Removal')                       

pd.set_option('display.max_columns', None)
lab_data_detailed_end.head()

Unnamed: 0,facility_name,trial_stage,total_n_as_rcvd,total_n_dry_wt,organic_c_as_rcvd,organic_c_dry_wt,c_to_n_dry_wt,bulk_density_as_rcvd,solids_as_rcvd,moisture_as_rcvd,ph_as_rcvd,ph_dry_wt,stability_mg_co2c_per_g_om_per_day,stability_rating,emergence_%,vigor_%,maturity_rating,fecal_coliforms_mpn_per_g_dw,pass_per__fail,organic_matter_%dw,ash_%dw,ammonium_n_mg_per_kg_dw,nitraten_mg_per_kg_dw,chloride_mg_per_kg_dw,sulfates_mg_per_kg_dw,caco3_lbs_per_t,phosphorus_%dw,p2o5_%dw,potassium_%dw,k2o_%dw,calcium_%dw,magnesium_%dw,sodium_%dw,sulfur_%dw,boron_mg_per_kg_dw,zinc_mg_per_kg_dw,manganese_mg_per_kg_dw,copper__mg_per_kg_dw,iron_mg_per_kg_dw,arsenic_mg_per_kg_dw,cadmium_mg_per_kg_dw,chromium_mg_per_kg_dw,cobalt_mg_per_kg_dw,mercury_mg_per_kg_dw,molybdenum_mg_per_kg_dw,nickel_mg_per_kg_dw,lead_mg_per_kg_dw,selenium_mg_per_kg_dw,3__%_passing,2__%_passing,1__%_passing,3_divide_4_%_passing,5_divide_8_%_passing,1_divide_2_%_passing,3_divide_8_%_passing,1_divide_4_%_passing,total_plastic_%_by_wt,film_plastic_%_by_wt,glass_%_by_wt,metal_%_by_wt,sharps_%_by_wt,total_inerts_%_by_wt
0,Facility 1,Second Removal,0.432702,0.73,6.875806,11.6,15.890411,1474.178166,59.274194,40.725806,7.4,,0.117282,Very Stable,100,100,Mature,388.027211,Fail,22.929724,77.070276,386.340136,4.04898,110,13.2,190,0.1031,0.236099,0.2573,0.30876,3.465,1.63,0.0247,0.065,1.0,74.7,460,23.8,17100,3.1,<MDL,10.4,5.0,0.02,0.5,9.6,5,0.7,100,100.0,100.0,100.0,98.035139,86.225179,72.107288,48.913608,0.0,0.0,0.0,0,0,0.0
1,Facility 2,Second Removal,0.83062,1.23,18.300648,27.1,22.03252,1203.257135,67.530065,32.469935,7.1,,2.256662,Stable,100,100,Mature,34.058904,Pass,42.407775,57.592225,503.479452,23.989315,2817,123.2,154,0.2498,0.572042,0.3524,0.42288,16.77,0.3964,0.2969,0.149,43.3,28.8,117,20.7,4721,<MDL,0.2,7.1,0.8,0.01,0.8,1.7,<MDL,3.4,100,100.0,100.0,100.0,99.68652,98.798328,97.405085,90.613027,0.0,0.0,0.0,0,0,0.0
2,Facility 3,Second Removal,1.11663,1.84,20.512012,33.8,18.369565,842.778,60.686427,39.313573,7.3,,1.885598,Stable,93,100,Mature,23069.40874,Fail,59.871226,40.128774,374.053985,14.006427,813,93.2,134,0.2177,0.498533,0.1965,0.2358,6.944,0.3998,0.0973,0.2486,1.6,57.9,289,23.6,4695,3.6,0.5,8.7,1.0,0.02,0.6,6.5,4.4,4.1,100,97.960159,92.972112,87.840637,83.49004,75.23506,68.031873,51.378486,0.0,0.0,0.0,0,0,0.0
3,Facility 4,Second Removal,1.323821,2.2,26.957816,44.8,20.363636,656.600678,60.173697,39.826303,6.5,,3.139574,Stable,93,100,Mature,598.268041,Fail,83.246655,16.753345,789.381443,16.452371,3426,88.0,74,0.2535,0.580515,0.7345,0.8814,2.369,0.2471,0.2645,0.1566,17.2,51.3,113,34.2,2213,2.2,0.4,4.3,0.3,0.02,0.7,2.1,1.6,2.6,100,100.0,100.0,100.0,100.0,100.0,100.0,87.069409,0.0,0.0,0.0,0,0,0.0
4,Facility 5,Second Removal,0.45,0.96,9.4,19.9,21.0,827.0,47.1,52.9,8.4,,1.607838,VERY STABLE,100,100,Mature,3800.0,Fail,36.1,63.9,1644.9,9.8,1965,47.2,76,0.21,0.47,0.71,0.85,1.78,0.51,0.14,0.12,9.7,76.6,303,19.3,13460,5.3,<MDL,24.7,4.6,0.03,1.5,13.8,16.9,1.0,100,94.14,74.51,66.21,61.5,48.64,40.6,26.44,0.0,0.0,0.0,0,0,0.0
