### Initial setup

In [149]:
import os

print("Current Working Directory:", os.getcwd())
os.chdir('/Users/cecilia/Documents/GitHub/2024-winter-compostable')

Current Working Directory: /Users/cecilia/Documents/GitHub/2024-winter-compostable


In [164]:
import pandas as pd 

# load the data for sheet 2 and 3 (masterfile.csv has a cleaned and merged version of them)
masterfile = pd.read_csv('data/CASP004-01/masterfile.csv')

# load the data for sheet 1 
facility = pd.read_excel('data/CASP004-01/raw/CASP004-01 - Results Pre-Processed for Analysis from PDF Tables.xlsx', sheet_name=0)


In [165]:
# lowercase all column names in masterfile
masterfile.columns = [col.lower() for col in masterfile.columns]

# rename the facility_ID to match with facility table
masterfile.rename(columns={'org_id': 'facility_id'}, inplace=True)

### Write into csv files

**1. Format facility**

- primary key: `facility_id`
- connect with observation table through `facility_id`
- connect with all other tables through observation table

In [152]:
# only keep the first 19 columns because others are repetitive in the obervation table
facility = facility.iloc[:, :20]

# clean colnames for facility
def clean_column_names(column):
    column = column.lower()  
    column = column.replace(' ', '_') 
    column = column.replace('/', '_per_') 
    column = column.replace(':', '_to_') 
    for char in ['(', ')', '*', '-', '&', '?']:
        column = column.replace(char, '') 
    return column

facility.columns = [clean_column_names(col) for col in facility.columns]

facility.rename(columns={'how_ofen': 'temp_checked_frequency', 
                         'how_often.1': 'moisture_checked_frequency'}, 
                         inplace=True)

# reorder cols to make facility_id appears first
columns = facility.columns.tolist() 
columns = [columns[1], columns[0]] + columns[2:] 
facility = facility[columns]

facility.head()

Unnamed: 0,facility_id,facility_name,facility_type,volume_per_year,volume_per_year_unit,calculated_volume_per_year_range,min_pfrp_duration_days,max_pfrp_duration_days,min_active_composting,max_active_composting,min_curing,max_curing,comments_on_process__duration,how_is_temp_checked,how_often,moisture,moisture_checked_frequency,c_to_n,maturity,moisture_control
0,44547,CASP004-01,,90000,tons,,22,30,22,30,30,40,Covered Aerated Static Pile (CASP) with positi...,Instrumentation,"Daily during PFRP, weekly thereafter",Sprinkling,"Entering CASP, or between CASP and curing",Lab,Lab,Squeeze test


In [159]:
facility.to_csv('data/CASP004-01/processed/facility.csv', index=False)

**2. Format trial**

- primary key: `trial_id`
- connect with facility table through `facility_id`
- connect with observation table through `trial_id`
- connect with all other tables through observation table

In [166]:
trial_columns_to_keep = ['trial_id', 'facility_id', 'trial_stage', 'temp_f', 'weather', 'moisture_%', 
                        'bulk_density', 'ph', 'c_to_n_ratio', 'maturity', 'trial_notes']

trial = masterfile[trial_columns_to_keep]

# remove duplicates according to trail stages (only keep one row for start, one row for intervel and one row for end)
trial = trial.drop_duplicates(subset='trial_stage')

trial.head()


Unnamed: 0,trial_id,facility_id,trial_stage,temp_f,weather,moisture_%,bulk_density,ph,c_to_n_ratio,maturity,trial_notes
0,44547-01-21,44547.0,Start,65-80,Windy,51.0,754 lbs/CY (manual) \n0.29 g/cc (lab),4.85,34.0,0.0,Moisture assessed via oven drying. Lab feedsto...
6,44547-01-21,44547.0,Interval,65-80,Windy,51.0,754 lbs/CY (manual) \n0.29 g/cc (lab),4.85,34.0,0.0,Moisture assessed via oven drying. Lab feedsto...
12,44547-01-21,44547.0,End,65-80,Windy,51.0,754 lbs/CY (manual) \n0.29 g/cc (lab),4.85,34.0,0.0,Moisture assessed via oven drying. Lab feedsto...


In [162]:
trial.to_csv('data/CASP004-01/processed/trial.csv', index=False)

**3. Format bag**

- primary key: `bag_id`
- connect with facility table through trial table
- connect with trial table through `trial_id`
- connect with item table through observation table
- connect with observation table through `bag_id`

In [167]:
def modify_bag_id(df):
    # change the bag_id into trail_id_bag_id to make it unique across the data set
    # (There might be cases that for each trial, the bag_IDs are all A/B/C-X)
    df['bag_id'] = df.apply(lambda row: str(row['trial_id']) + '_' + str(row['bag_id']), axis=1)

    # replace other characters with '_'
    chars_to_replace = ['/', ':', '(', ')', '*', '-', '&', '?']
    for char in chars_to_replace:
        df['bag_id'] = df['bag_id'].str.replace(char, '_', regex=False)

    return df

masterfile = modify_bag_id(masterfile)

bag_columns_to_keep = ['bag_id', 'trial_id', 'bag_color', 'bag_set', 'bag_content', 'bag_type', 'bag_placement']

bag = masterfile[bag_columns_to_keep].drop_duplicates(subset='bag_id')

bag.head()

Unnamed: 0,bag_id,trial_id,bag_color,bag_set,bag_content,bag_type,bag_placement
0,44547_01_21_A_1,44547-01-21,Green,A,Baseline,Standard,Top depth
1,44547_01_21_A_2,44547-01-21,Green,A,Baseline,Standard,Top depth
2,44547_01_21_A_3,44547-01-21,Green,A,Baseline,Standard,Top depth
3,44547_01_21_A_4,44547-01-21,Green,A,Baseline,Standard,Top depth
4,44547_01_21_A_5,44547-01-21,Green,A,Baseline,Standard,Bottom depth


In [168]:
bag.to_csv('data/CASP004-01/processed/bag.csv', index=False)

**4. Format item**

- primary key: `item_id`
- connect with observation table through `item_id`
- connect with all other tables through observation table

In [170]:
item_columns_to_keep = ['product_name', 'product_brand', 'product_material', 'material_i', 'material_ii',
                        'material_iii', 'certification', 'product_size', 'product_weight_init_g']

item = masterfile[item_columns_to_keep].drop_duplicates(subset='product_name')

# generate item_id as the primary key for item table
item['item_id'] = range(1, len(item) + 1)

# reorder cols to make item_id appears first
columns = ['item_id'] + [col for col in item.columns if col != 'item_id']
item = item[columns]

item.head()

Unnamed: 0,item_id,product_name,product_brand,product_material,material_i,material_ii,material_iii,certification,product_size,product_weight_init_g
0,1,12 oz Hot cup / Soup bowl,BÉSICS®,"Paper, PLA lining, adhesive, ink",Fiber,Lined fiber,PLA lined Paper,BPI,"3’’ diameter, 2.5’’ H",8.11
36,2,Hot cup lid,BÉSICS®,CPLA based bio-polymer,Biopolymer,Rigid biopolymer,CPLA,BPI,"3.5’’ diameter, 0.5’’ H",3.57
72,3,16 oz PLA cold cup,Greenware®,"PLA, ink",Biopolymer,Rigid biopolymer,PLA,BPI,"2.5’’ diameter at base, 4’’ diameter at mouth,...",14.58
108,4,Cutlery,BÉSICS®,"CPLA, talc",Biopolymer,Rigid biopolymer,CPLA,BPI,6.5’’ x 1.53’’ (max) x 3/8’’,4.75
144,5,"PLA-lined fibre bowl, white",BÉSICS®,"PLA, bagasse (sugarcane fibre)",,,,,5.5’’ x 5.5’’ x 2’’,10.52


In [172]:
item.to_csv('data/CASP004-01/processed/item.csv', index=False)

5. Format observation
- primary key: `obs_id`
- connect with facility table through `facility_id`
- connect with trial table through `trial_id`
- connect with bag table through `bag_id`
- connect with item table through `item_id`

In [157]:
# put back item_id in the masterfile
masterfile = pd.merge(masterfile, item[['item_id', 'product_name']], on='product_name', how='left')

obs_columns_to_keep = ['facility_id', 'trial_id', 'bag_id', 'item_id', 
                        'trial_stage', 'weight1', 'weight2', 'weight3', 'mean_weight_final_g',
                        '%_not_decomposed', '%_decomposed', 'bag_intact', 'photo_available', 
                        'notes', 'outlier_alert']

observation = masterfile[obs_columns_to_keep]

# generate obs_id as the primary key for observation table
observation['obs_id'] = range(1, len(observation) + 1)

# reorder columns to make obs_id appear first
columns2 = ['obs_id'] + [col for col in observation.columns if col != 'obs_id']
observation = observation[columns2]

observation.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  observation['obs_id'] = range(1, len(observation) + 1)


Unnamed: 0,obs_id,facility_id,trial_id,bag_id,item_id,trial_stage,weight1,weight2,weight3,mean_weight_final_g,%_not_decomposed,%_decomposed,bag_intact,photo_available,notes,outlier_alert
0,1,44547.0,44547-01-21,44547_01_21_A_1,1,Start,8.12,8.1,8.12,8.11,100.0,0.0,True,True,,False
1,2,44547.0,44547-01-21,44547_01_21_A_2,1,Start,8.12,8.1,8.12,8.11,100.0,0.0,True,True,,False
2,3,44547.0,44547-01-21,44547_01_21_A_3,1,Start,8.12,8.1,8.12,8.11,100.0,0.0,True,True,,False
3,4,44547.0,44547-01-21,44547_01_21_A_4,1,Start,8.12,8.1,8.12,8.11,100.0,0.0,True,True,,False
4,5,44547.0,44547-01-21,44547_01_21_A_5,1,Start,8.12,8.1,8.12,8.11,100.0,0.0,True,True,,False


In [173]:
observation.to_csv('data/CASP004-01/processed/observation.csv', index=False)