### Import libraries

In [1]:
from osgeo import gdal
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from osgeo import ogr
import geopandas as gpd
import openpyxl
import seaborn as sns
import json
import os
import re
from datetime import datetime

### Variables and paths

In [2]:
# Define the year variable of the data used
year = 2021
# Day of the year IrriWatch data was selected
dayofyear = 30102021

# Percent threshold for cleaning the AVR incomplete sensor data
percent_thresh = 75

# Assumed Harvest Index value for potatoes
HI = 0.75

# Dictionary for Dry Matter content per potato variety
dic_DM = {
  "Fontane": 0.23,
  "Ivory Russet": 0.21,
  "Challenger": 0.221,
    "Zorba" : 0.233,
    'Edison' : 0.214
}

# Input files paths
# Actual combined harvester potato field data from AVR connect portal
AVR_field = "/home/ucfaya1@ad.ucl.ac.uk/Downloads/HI_Data_yara_new/code_data/AVR_data/allFields_AVR_{}.xlsx".format(year)

# IW field level data
all_fields = "/home/ucfaya1@ad.ucl.ac.uk/Downloads/HI_Data_yara_new/code_data/IW_Data/allFields_IW_{}.xlsx".format(dayofyear)

# Geometry shapefile path
geometry_fields = "/home/ucfaya1@ad.ucl.ac.uk/Downloads/HI_Data_yara_new/allFields_boundaries_IW/field_basics/field_basics.shp"

# Output files paths
csv_output_file_name = 'DMP_AVR_geometry{}.csv'.format(dayofyear)
csv_output_file_path = "/home/ucfaya1@ad.ucl.ac.uk/Downloads/HI_Data_yara_new/code_data/save_code_data/" + csv_output_file_name

## AVR actual yield dataset

In [3]:
# Read the Excel file into a DataFrame
df_AVR = pd.read_excel(AVR_field)
selected_columns = df_AVR.loc[:,['FieldName','Variety','StartDate','AmountYieldTonsDone','AmountHectareDonePercentage','AverageTonsHa']]
df_AVR_Selec=df_AVR.drop(df_AVR.columns.difference(selected_columns.columns), axis=1)

# Apply threshold to filter out remove AVR incomplete field data
df_AVR_cleaned = df_AVR_Selec.drop(df_AVR_Selec[df_AVR_Selec['AmountHectareDonePercentage'] <= percent_thresh].index)

# Remove numbers and () to match the field names
pattern = r'[0-9,()]+'
df_AVR_cleaned['FieldName'] = df_AVR_cleaned['FieldName'].str.replace(pattern, '',regex=True)

# Remove space from the end of the string in field name
df_AVR_cleaned['FieldName'] = df_AVR_cleaned['FieldName'].str.strip()

# Replace FO values in varieties with fontane
df_AVR_cleaned['Variety'] = df_AVR_cleaned['Variety'].replace('FO','Fontane')

## IW all_fields data

In [4]:
# Read the Excel file into a df
df_all_fields = pd.read_excel(all_fields)
df_all_fields = df_all_fields.rename(columns={'Name': 'FieldName'})
selected_col = df_all_fields.loc[:,['FieldName','Area','Dry Matter Production Cumulative (kg/ha)']]
df_DMP=df_all_fields.drop(df_all_fields.columns.difference(selected_col.columns), axis=1)

# Remove numbers and () to match the field names
pattern = r'[0-9,()]+'
df_DMP['FieldName'] = df_DMP['FieldName'].str.replace(pattern, '',regex=True)

# Remove spaces from the end of the string in field name
df_DMP['FieldName'] = df_DMP['FieldName'].str.strip()

## Merge AVR and IW dataset by FieldName

In [5]:
# Merge the dataframes based on the FieldName column
df_AVR_DMP = df_AVR_cleaned.merge(df_DMP[['FieldName','Area', 'Dry Matter Production Cumulative (kg/ha)']], on='FieldName', how='left')
s_col = df_AVR_DMP.loc[:,['FieldName','Variety','Area','StartDate','AverageTonsHa','AmountYieldTonsDone','AmountHectareDonePercentage','Dry Matter Production Cumulative (kg/ha)']]
df_act_DMP=df_AVR_DMP.drop(df_AVR_DMP.columns.difference(s_col.columns), axis=1)

# Unit conversion tons to kg by Multiplying by 1000
df_act_DMP['Average_kgha']= df_act_DMP['AverageTonsHa']*1000 # Average wet actual weight per ha
df_act_DMP['Wet_Actual_Yield_kgfield']= df_act_DMP['AmountYieldTonsDone']*1000 # Total wet actual weight per field

# Create a new column with DM of each planted varieties per field
df_act_DMP['DM'] = df_act_DMP['Variety'].map(dic_DM)

# Wet to dry weight by multiplying it by Dry Matter content of corresponding planted variety 
df_act_DMP['dry_actual_average_kgha']= df_act_DMP['Average_kgha']*df_act_DMP['DM'] # Average dry actual weight per ha
df_act_DMP['dry_actual_kgfield']= df_act_DMP['Wet_Actual_Yield_kgfield']*df_act_DMP['DM'] # Total dry actual weight per field

# IrriWatch predicted dry yield = DMP * fixed Harvest Index 
df_act_DMP['IW_Predicted_yield_kgha']= df_act_DMP['Dry Matter Production Cumulative (kg/ha)']*HI # Average predicted dry yield per ha
df_act_DMP['IW_Predicted_yield_kgfield']= df_act_DMP['IW_Predicted_yield_kgha']* df_act_DMP ['Area'] # Total predicted dry yield per field

# Rename columns
df_act_DMP = df_act_DMP.rename(columns={'AmountYieldTonsDone': 'Wet_Actual_Yield_tonsfield'})
df_act_DMP = df_act_DMP.rename(columns={'AmountHectareDonePercentage': 'HaDone%'})
df_act_DMP = df_act_DMP.rename(columns={'Dry Matter Production Cumulative (kg/ha)': 'DMPCumulative_kgha'})
df_act_DMP = df_act_DMP.rename(columns={'Area': 'TotalArea_Ha'})
df_act_DMP = df_act_DMP.rename(columns={'StartDate': 'HarvestDate'})
df_act_DMP.head()

Unnamed: 0,FieldName,HarvestDate,Variety,HaDone%,Wet_Actual_Yield_tonsfield,AverageTonsHa,TotalArea_Ha,DMPCumulative_kgha,Average_kgha,Wet_Actual_Yield_kgfield,DM,dry_actual_average_kgha,dry_actual_kgfield,IW_Predicted_yield_kgha,IW_Predicted_yield_kgfield
0,achter schuur fert aardappel,2021-09-27 05:09:37,Ivory Russet,86,136.56,39.810418,3.3,10908.0,39810.418334,136560.0,0.21,8360.18785,28677.6,8181.0,26997.3
1,anny cuypers achter stal,2021-11-02 10:28:59,Fontane,82,229.04,56.437783,4.9,14745.0,56437.783139,229040.0,0.23,12980.690122,52679.2,11058.75,54187.875
2,bart nijs achter paul stessens,2021-11-04 13:56:23,Challenger,98,86.02,94.190042,0.9,13522.0,94190.042009,86020.0,0.221,20815.999284,19010.42,10141.5,9127.35
3,bart tormans achter stal links,2021-10-23 14:34:03,Fontane,91,82.963636,59.968189,1.5,14674.0,59968.18852,82963.63595,0.23,13792.68336,19081.636268,11005.5,16508.25
4,bart torreman bremelhoef,2021-10-24 07:04:06,Fontane,95,211.456398,73.486086,2.6,15330.0,73486.085605,211456.39795,0.23,16901.799689,48634.971528,11497.5,29893.5


In [6]:
# Check for incomplete data, if data not available remove the row (remove the fields with incomplete information)
nan_count = df_act_DMP['DMPCumulative_kgha'].isna().sum()
print(nan_count)
df_act_DMP.dropna(subset=['DMPCumulative_kgha'], inplace=True)
df_act_DMP.head()

22


Unnamed: 0,FieldName,HarvestDate,Variety,HaDone%,Wet_Actual_Yield_tonsfield,AverageTonsHa,TotalArea_Ha,DMPCumulative_kgha,Average_kgha,Wet_Actual_Yield_kgfield,DM,dry_actual_average_kgha,dry_actual_kgfield,IW_Predicted_yield_kgha,IW_Predicted_yield_kgfield
0,achter schuur fert aardappel,2021-09-27 05:09:37,Ivory Russet,86,136.56,39.810418,3.3,10908.0,39810.418334,136560.0,0.21,8360.18785,28677.6,8181.0,26997.3
1,anny cuypers achter stal,2021-11-02 10:28:59,Fontane,82,229.04,56.437783,4.9,14745.0,56437.783139,229040.0,0.23,12980.690122,52679.2,11058.75,54187.875
2,bart nijs achter paul stessens,2021-11-04 13:56:23,Challenger,98,86.02,94.190042,0.9,13522.0,94190.042009,86020.0,0.221,20815.999284,19010.42,10141.5,9127.35
3,bart tormans achter stal links,2021-10-23 14:34:03,Fontane,91,82.963636,59.968189,1.5,14674.0,59968.18852,82963.63595,0.23,13792.68336,19081.636268,11005.5,16508.25
4,bart torreman bremelhoef,2021-10-24 07:04:06,Fontane,95,211.456398,73.486086,2.6,15330.0,73486.085605,211456.39795,0.23,16901.799689,48634.971528,11497.5,29893.5


In [7]:
# Check for duplicate values in the 'FieldName' column
dup = df_act_DMP['FieldName'].duplicated()
num_dup = dup.sum()
print(f"Number of duplicate field names: {num_dup}")

# Print the duplicate field names
if num_dup > 0:
    print("Duplicate field names:")
    print(df_act_DMP[df_act_DMP['FieldName'].duplicated(keep=False)])

# Remove duplicates
df_act_DMP = df_act_DMP.drop_duplicates(subset=['FieldName'], keep='first')

Number of duplicate field names: 9
Duplicate field names:
                    FieldName         HarvestDate  Variety  HaDone%  \
82    paul bens koekoekstraat 2021-10-20 13:22:42  Fontane       89   
83    paul bens koekoekstraat 2021-10-20 13:22:42  Fontane       89   
85        paul biermans nelis 2021-10-29 15:43:33  Fontane       94   
86        paul biermans nelis 2021-10-29 15:43:33  Fontane       94   
87        paul biermans nelis 2021-10-29 17:46:10  Fontane       88   
88        paul biermans nelis 2021-10-29 17:46:10  Fontane       88   
122  tim keysers  hoge rielen 2021-10-08 05:53:14  Fontane       82   
123  tim keysers  hoge rielen 2021-10-08 05:53:14  Fontane       82   
124  tim keysers  hoge rielen 2021-10-08 06:16:06  Fontane       96   
125  tim keysers  hoge rielen 2021-10-08 06:16:06  Fontane       96   
147     war gijs kastelsedijk 2021-10-01 15:41:14  Fontane       96   
148     war gijs kastelsedijk 2021-10-01 15:41:14  Fontane       96   
149     war gijs ka

## Field Geometry

In [8]:
# Read the boundaries shapefile into a geopanda dataframe
field_basics_shp = gpd.read_file(geometry_fields)
column_headers = list(field_basics_shp.columns)

# Remove numbers and () to match the field names and space at the end of the string
pattern = r'[0-9,()]+'
field_basics_shp['name'] =field_basics_shp['name'].str.replace(pattern, '',regex=True)
field_basics_shp = field_basics_shp.rename(columns={'name': 'FieldName'})
field_basics_shp = field_basics_shp.rename(columns={'soil_name_': 'SoilType'})

# Check for duplicate values in the 'FieldName' column
duplicate_fieldnames = field_basics_shp['FieldName'].duplicated()
num_duplicate_fieldnames = duplicate_fieldnames.sum()
print(f"Number of duplicate field names: {num_duplicate_fieldnames}")

# Print the duplicate field names
if num_duplicate_fieldnames > 0:
    print("Duplicate field names:")
    print(field_basics_shp[field_basics_shp['FieldName'].duplicated(keep=False)])
    
# Remove duplicates based on the 'FieldName' column
field_basics_shp = field_basics_shp.drop_duplicates(subset=['FieldName'], keep='first')

# Merge the dataframes based on the 'FieldName' column
df_merged = df_act_DMP.merge(field_basics_shp[['FieldName', 'geometry','SoilType']], on='FieldName', how='left')
df_merged.head()

Number of duplicate field names: 99
Duplicate field names:
     crop  irrigation                       FieldName  soil  priority  \
0     101           5              cools stenen kruis   NaN         1   
1     101           5            harm fabri voor huis   NaN         1   
2     101           5                 cools professor   NaN         1   
3     101           5      jan blokseschuur tegen bos   NaN         1   
4     101           5                jacob watermolen   NaN         1   
..    ...         ...                             ...   ...       ...   
260   101           5                                   NaN         0   
263   101           5      harm fabri naast huis niek   NaN         0   
282   101           5       tim keysers  hoge rielen    NaN         0   
285   101           5       tim keysers  hoge rielen    NaN         0   
290   101           5  van gompel agro herdersdreef B   NaN         0   

     ksat_bias  eff_root_d  sat_soil_m  stress_typ  irrigati_1  

Unnamed: 0,FieldName,HarvestDate,Variety,HaDone%,Wet_Actual_Yield_tonsfield,AverageTonsHa,TotalArea_Ha,DMPCumulative_kgha,Average_kgha,Wet_Actual_Yield_kgfield,DM,dry_actual_average_kgha,dry_actual_kgfield,IW_Predicted_yield_kgha,IW_Predicted_yield_kgfield,geometry,SoilType
0,achter schuur fert aardappel,2021-09-27 05:09:37,Ivory Russet,86,136.56,39.810418,3.3,10908.0,39810.418334,136560.0,0.21,8360.18785,28677.6,8181.0,26997.3,"POLYGON ((5.17598 51.32037, 5.17583 51.32066, ...",Loamy Sand
1,anny cuypers achter stal,2021-11-02 10:28:59,Fontane,82,229.04,56.437783,4.9,14745.0,56437.783139,229040.0,0.23,12980.690122,52679.2,11058.75,54187.875,"POLYGON ((5.01931 51.27656, 5.01929 51.27657, ...",Sandy Loam
2,bart nijs achter paul stessens,2021-11-04 13:56:23,Challenger,98,86.02,94.190042,0.9,13522.0,94190.042009,86020.0,0.221,20815.999284,19010.42,10141.5,9127.35,"POLYGON ((5.04998 51.28358, 5.05013 51.28376, ...",Sandy Loam
3,bart tormans achter stal links,2021-10-23 14:34:03,Fontane,91,82.963636,59.968189,1.5,14674.0,59968.18852,82963.63595,0.23,13792.68336,19081.636268,11005.5,16508.25,"POLYGON ((5.02239 51.26871, 5.02240 51.26869, ...",Sandy Loam
4,bart torreman bremelhoef,2021-10-24 07:04:06,Fontane,95,211.456398,73.486086,2.6,15330.0,73486.085605,211456.39795,0.23,16901.799689,48634.971528,11497.5,29893.5,"POLYGON ((5.02565 51.27869, 5.02667 51.27937, ...",Sandy Loam


In [9]:
# Check for incomplete geometry values and remove the rows with nan values
nan_count = df_merged['geometry'].isna().sum()
print(f'NAN values count before removal:{nan_count}')
df_merged.dropna(subset=['geometry'], inplace=True)
nan_count = df_merged['geometry'].isna().sum()

# Check for duplicate 'FieldName'
duplicate_fieldnames = df_merged['FieldName'].duplicated()

# Count the number of duplicate field names
num_duplicate_fieldnames = duplicate_fieldnames.sum()
print(f"Number of duplicate field names: {num_duplicate_fieldnames}")
if num_duplicate_fieldnames > 0:
    print("Duplicate field names:")
    print(df_merged[df_merged['FieldName'].duplicated(keep=False)])
    
# Remove duplicates based on the 'FieldName' column
df_merged_no_duplicates = df_merged.drop_duplicates(subset=['FieldName'], keep='first')
df_merged_no_duplicates.head()

NAN values count before removal:10
Number of duplicate field names: 0


Unnamed: 0,FieldName,HarvestDate,Variety,HaDone%,Wet_Actual_Yield_tonsfield,AverageTonsHa,TotalArea_Ha,DMPCumulative_kgha,Average_kgha,Wet_Actual_Yield_kgfield,DM,dry_actual_average_kgha,dry_actual_kgfield,IW_Predicted_yield_kgha,IW_Predicted_yield_kgfield,geometry,SoilType
0,achter schuur fert aardappel,2021-09-27 05:09:37,Ivory Russet,86,136.56,39.810418,3.3,10908.0,39810.418334,136560.0,0.21,8360.18785,28677.6,8181.0,26997.3,"POLYGON ((5.17598 51.32037, 5.17583 51.32066, ...",Loamy Sand
1,anny cuypers achter stal,2021-11-02 10:28:59,Fontane,82,229.04,56.437783,4.9,14745.0,56437.783139,229040.0,0.23,12980.690122,52679.2,11058.75,54187.875,"POLYGON ((5.01931 51.27656, 5.01929 51.27657, ...",Sandy Loam
2,bart nijs achter paul stessens,2021-11-04 13:56:23,Challenger,98,86.02,94.190042,0.9,13522.0,94190.042009,86020.0,0.221,20815.999284,19010.42,10141.5,9127.35,"POLYGON ((5.04998 51.28358, 5.05013 51.28376, ...",Sandy Loam
3,bart tormans achter stal links,2021-10-23 14:34:03,Fontane,91,82.963636,59.968189,1.5,14674.0,59968.18852,82963.63595,0.23,13792.68336,19081.636268,11005.5,16508.25,"POLYGON ((5.02239 51.26871, 5.02240 51.26869, ...",Sandy Loam
4,bart torreman bremelhoef,2021-10-24 07:04:06,Fontane,95,211.456398,73.486086,2.6,15330.0,73486.085605,211456.39795,0.23,16901.799689,48634.971528,11497.5,29893.5,"POLYGON ((5.02565 51.27869, 5.02667 51.27937, ...",Sandy Loam


In [10]:
df_merged_no_duplicates.columns

Index(['FieldName', 'HarvestDate', 'Variety', 'HaDone%',
       'Wet_Actual_Yield_tonsfield', 'AverageTonsHa', 'TotalArea_Ha',
       'DMPCumulative_kgha', 'Average_kgha', 'Wet_Actual_Yield_kgfield', 'DM',
       'dry_actual_average_kgha', 'dry_actual_kgfield',
       'IW_Predicted_yield_kgha', 'IW_Predicted_yield_kgfield', 'geometry',
       'SoilType'],
      dtype='object')

In [11]:
# Save as csv
df_merged_no_duplicates.to_csv(csv_output_file_path, index=False)