This Jupyter Notebook translates the output of xCP (xCropProtection) from an HDF5 file to a .csv file for easier analysis. The input and output paths for this script are set in the first code cell and should be changed for each user. By default, the script writes the columns ApplicationDates, ApplicationDayMonth, FeatureID, FeatureLULC, ApplicationRates(g/ha), AppliedPPP, AppliedArea(ha), and TechnologyDriftReductions. The applied mass, AppliedMass(g), can also be written by uncommenting the corresponding line in the final cell ("Write data to csv").

Version 2.0 - 11/15/2024 Added feature LULC to output

Version 1.0 - 5/21/2024

Input and output file paths - change these

In [None]:
# Input HDF5 file location (the store that is opened read-only with h5py below).
# Raw string so the Windows backslashes are kept literally.
xcrop_arrdat_path = r'C:\path\to\arr.dat'
# Output file location and name of the csv written by the final cell
output_path = r'C:\path\to\xCropProtection_output.csv'

In [19]:
import h5py
import datetime
import pandas
import geopandas

In [20]:
# Validate that every retrieved subgroup really is an h5py dataset
def checkInstance(datasets):
    """Return True when every item of ``datasets`` is an ``h5py.Dataset``.

    The scan stops at the first item that is not a dataset: that item is
    printed together with a short message and False is returned.
    An empty iterable is considered valid.
    """
    for candidate in datasets:
        if isinstance(candidate, h5py.Dataset):
            continue
        # First offender found: report it and give up.
        print(candidate, "is not a h5py dataset.")
        return False
    return True

# Open the HDF5 store read-only. If it cannot be opened, report the path and
# re-raise: every statement below needs a valid `arr_file`, so continuing
# would only fail later with an unrelated NameError.
try:
    arr_file = h5py.File(xcrop_arrdat_path, 'r')
except OSError:  # FileNotFoundError is a subclass of OSError
    print("The file", xcrop_arrdat_path, "could not be accessed")
    raise

# Look up the groups and their members. A missing key must not leave the file
# handle open, otherwise the cell cannot be re-run cleanly.
try:
    dataset = arr_file['xCropProtection']
    landscape_dataset = arr_file['LandscapeScenario']

    # Get data for subgroups
    application_dates_subgroup = dataset['ApplicationDates']
    application_rates_subgroup = dataset['ApplicationRates']
    application_areas = dataset['AppliedAreas']
    applied_features = dataset['AppliedFields']
    application_PPP = dataset['AppliedPPP']
    xcrop_file_path = dataset['xCropProtectionFilePath']
    drift_reduction = dataset['TechnologyDriftReductions']

    feature_ids = landscape_dataset['FeatureIds']
    feature_type_ids = landscape_dataset['FeatureTypeIds']
    epsg = landscape_dataset['EPSG']
except KeyError:
    arr_file.close()
    raise

# Check that subgroups are h5py datasets. Every object that is sliced in the
# following cell is validated here, not only the xCropProtection members.
required_datasets = [
    application_dates_subgroup,
    application_rates_subgroup,
    application_areas,
    applied_features,
    application_PPP,
    xcrop_file_path,
    drift_reduction,
    feature_ids,
    feature_type_ids,
    epsg,
]
if not checkInstance(required_datasets):
    arr_file.close()
    # A bare `quit` is only a name lookup and does not stop execution;
    # raising aborts the cell without killing the notebook kernel.
    raise TypeError("Error retrieving subgroup data.")

In [21]:
# Read every dataset fully into memory so the derived columns below are built
# from in-memory arrays rather than from repeated reads of the open file.
application_dates_data = application_dates_subgroup[:]
application_rates_data = application_rates_subgroup[:]
applied_features_data = applied_features[:]
application_PPP_data = application_PPP[:]
application_areas_data = application_areas[:]
drift_reduction_data = drift_reduction[:]
epsg_data = epsg[()]  # `[()]` reads the whole dataset (the way a scalar dataset is read)
feature_ids_data = feature_ids[:]
feature_type_ids_data = feature_type_ids[:]

# Create dictionary of feature ids and their lulc type.
# The two arrays are paired by position.
feature_id_lulc_dict = dict(zip(feature_ids_data, feature_type_ids_data))

# LULC type of each applied feature (None when the id is not in the landscape)
feature_lulc = [feature_id_lulc_dict.get(x) for x in applied_features_data]

# Dates are stored as ordinals: convert each one once and derive the
# day-month label from the resulting date object.
application_dates = [datetime.date.fromordinal(x) for x in application_dates_data]
application_dates_day_month = [d.strftime('%d-%B') for d in application_dates]

# Convert application area arrays to bytes for geometry creation.
# Uses the array loaded above instead of iterating the h5py dataset again.
app_areas_bytes = [x.tobytes() for x in application_areas_data]

# Geometries in the CRS of the landscape scenario (as stored in the file)
source_crs = "EPSG:" + str(epsg_data)
native_geometries = geopandas.GeoDataFrame(
    geometry=geopandas.GeoSeries.from_wkb(app_areas_bytes),
    crs=source_crs,
)
native_geometries["field_idx"] = native_geometries.reset_index().index

# Geographic (lon/lat) copy, kept available for inspection
field_geometries = native_geometries.to_crs(crs="EPSG:4326")

# Calculate area directly in the source CRS. Going to EPSG:4326 and back to
# the same CRS first only adds transformation error to the result.
# NOTE(review): the m2 -> ha conversion assumes the source CRS uses metres.
geom_project = native_geometries
geom_project_area_m = geom_project.area
geom_project_area_ha = geom_project_area_m / 10000

# Convert from bytes to string
decode_PPP = [x.decode() for x in application_PPP_data]

In [22]:
# Look-up table from feature id to its feature type id, paired by position;
# the csv cell uses it to fill the FeatureLULC column.
feature_id_type_dict = {
    fid: feature_type_ids_data[position]
    for position, fid in enumerate(feature_ids_data)
}

Write data to csv

In [23]:
# Each entry is (csv column header, column values), in output order.
# Comment out any entry which is not desired in the output csv
output_columns = [
    ("ApplicationDates", application_dates),
    ("ApplicationDayMonth", application_dates_day_month),
    ("FeatureID", applied_features_data),
    ("FeatureLULC", [feature_id_type_dict.get(fid) for fid in applied_features_data]),
    ("ApplicationRates(g/ha)", application_rates_data),
    ("AppliedPPP", decode_PPP),
    ("AppliedArea(ha)", geom_project_area_ha),
    #("AppliedMass(g)", application_rates_data * geom_project_area_ha),
    ("TechnologyDriftReductions", drift_reduction_data),
]

# One single-column frame per entry, joined side by side on the row index
dfs = [pandas.DataFrame(values, columns=[header]) for header, values in output_columns]
merged_df = pandas.concat(dfs, axis=1)
merged_df.to_csv(output_path, index=False)

# All data has been written; release the HDF5 file handle
arr_file.close()