In [1]:
import pandas as pd 

# Load the raw files (note they will have to be converted
# to .csv manually --> maybe we can do a bit of research on this later
# Also depends on how coherent the raw format is across different facilities

sheet_2 = pd.read_csv('../data/CASP004-01/raw/Test summary per bag & stage.csv')
print(sheet_2.shape)
sheet_3 = pd.read_csv('../data/CASP004-01/raw/Bag&Product Setup.csv')
print(sheet_3.shape)

# Merge on 'Product Name'
merged = pd.merge(sheet_3, sheet_2, on='Product Name', how='outer')
print(merged.shape)

# Drop Junk
columns_to_remove = [col for col in merged.columns if 'Unnamed' in col]
merged.drop(columns_to_remove, axis=1, inplace=True)

columns_to_remove = ['Bag Colour_x', 'Bag Set', 'Replicates', 'Weight 1_x', 'Description',
                     'Weight 2_x', 'Weight 3_x', 'Weight units', 'Org Id_y', 'Trial Id_y',
                     'Temp Units', 'Density Units', 'Weight Units']
merged.drop(columns_to_remove, axis=1, inplace=True)

# Rename columns
new_variable_names = {'Org Id_x': 'org_ID', 
                      'Trial Id_x': 'trial_ID', 
                      'TrialBagType': 'bag_content',
                      'Product Name': 'product_name',
                      'Brand/Manufacturer': 'product_brand', # later: correct misspelled names
                      'Composition': 'product_material', # later: pull individual materials (material_1, material_2, material_3)?
                      'Certifications': 'certification',
                      'Size': 'product_size',
                      'Weight (average)': 'product_weight_init_g',
                      'Bag Colour_y': 'bag_color',
                      'Bag Id': 'bag_ID',
                      'Stage': 'trial_stage',
                      'Weather': 'weather',
                      'Moisture': 'moisture_%',
                      'Temp': 'temp_F',
                      'Bulk Density': 'bulk_density', 
                      'C:N': 'C_to_N_ratio',
                      'Maturity': 'maturity',
                      'Notes': 'trial_notes',
                      'Bag Type': 'bag_type',
                      'Bag Intact? (Y/N)': 'bag_intact',
                      'Bag Notes': 'bag_placement',
                      'Photo': 'photo_available',
                      'Fragment Size': 'fragment_size',
                      'Weight 1_y': 'weight1',
                      'Weight 2_y': 'weight2',
                      'Weight 3_y': 'weight3',
                      'Product Weight Avg': 'mean_weight_final_g',
                      'Product Notes': 'notes'}
merged = merged.rename(columns=new_variable_names)

# Enforce types
merged = merged.astype({'product_weight_init_g': 'float',
                        'pH': 'float',
                        'C_to_N_ratio': 'float',
                        'maturity': 'float',
                        'weight1': 'float',
                        'weight2': 'float',
                        'weight3': 'float',
                        'mean_weight_final_g': 'float'})

# Reorder columns
new_order = ['org_ID', 'trial_ID', 'bag_color', 'bag_ID','bag_content', 'bag_type', 'bag_placement', 'trial_stage', #facility
             'product_name', 'product_brand', 'product_material', 'certification', 'product_size', 'product_weight_init_g', #product
             'temp_F', 'weather', 'moisture_%', 'bulk_density', 'pH', 'C_to_N_ratio', 'maturity', 'trial_notes', #conditions
             'bag_intact', 'fragment_size', 'photo_available', 'weight1', 'weight2', 'weight3', 'mean_weight_final_g', 'notes'] # results
merged = merged[new_order]

# Calculate missing means
merged[['weight1', 'weight2', 'weight3']] = merged[['weight1', 'weight2', 'weight3']].apply(
    pd.to_numeric, errors='coerce'
).fillna(0)
merged['mean_weight_final_g'] = merged[['weight1', 'weight2', 'weight3']].mean(axis=1).round(2)

# Standardize variables
merged['moisture_%'] = merged['moisture_%'].str.replace('%', '', regex=False) # remove '%' from moisture_%
merged['moisture_%'] = pd.to_numeric(merged['moisture_%'], errors='coerce')

merged['bag_intact'] = merged['bag_intact'].map({'Y': True, 'N': False})
merged['photo_available'] = merged['photo_available'].map({'Y': True, 'N': False})

# Find % of composed and % of not composed
merged['%_not_decomposed'] = ((merged['mean_weight_final_g'] / merged['product_weight_init_g'])*100).round(2)
merged['%_decomposed'] = (100 - merged['%_not_decomposed']).round(2)
print(merged.shape[0])

# Create an alert variable for potential outiars
merged['outlier_alert'] = merged['mean_weight_final_g'] > merged ['product_weight_init_g']

### Note: it would make sense for the Compostable people to create a column 'bag_missing' and simply answer Y/N
### rather than placing this into notes because it cannot really be generalized from notes to all input raw data files. 

# Crop the df
last_valid_index = merged['org_ID'].last_valid_index()
merged = merged.iloc[:last_valid_index + 1]
print(merged.shape)

# save csv
merged.to_csv('../data/CASP004-01/masterfile.csv', index=False)

(999, 27)
(999, 27)
(558549, 53)
558549
(1044, 33)
