In [1]:
# Import packages
import geopandas as gpd
import numpy as np
import pandas as pd
import os
import re
from shapely.geometry import Polygon, MultiPolygon
import rasterio
import rasterstats

In [2]:
# Load the two GeoJSON inputs: the boundary geometries and the crop yield records
geometries = gpd.read_file(
    r'C:\Users\mieke\Documents\Msc Thesis\Datasets\Yield Data\Karnataka\geometry.geojson'
)
yields = gpd.read_file(
    r'C:\Users\mieke\Documents\Msc Thesis\Datasets\Yield Data\Karnataka\yields_geometry.geojson'
)

In [3]:
# This cell adds the previous average yield for a particular insurance unit if available.
#
# For every row, the "previous" yield is the 'Average Yield(Kg/Ha)' of the first row (in file order)
# whose Year equals this row's Year minus one and whose values are identical in all of `match_keys`.
# Rows without such a match keep NaN.
#
# The lookup is done with a single left join instead of re-filtering the whole frame once per row.
# NOTE(review): 'Season' is not part of the match keys, so a row may pick up the previous year's
# yield of a different season - confirm this is intended.
match_keys = ['Insurance Unit', 'IRR_RF', 'Gram Panchayat/Hobli', 'District', 'Taluk', 'Crop']
join_keys = match_keys + ['Year']

yields_extend = yields.copy()

# Lookup table with one row per key combination:
#  - rows with a missing key are dropped, because a missing value must never count as a match;
#  - only the first occurrence of each key combination is kept;
#  - Year is shifted forward by one so that year Y lines up with the rows of year Y + 1.
previous_lookup = (
    pd.DataFrame(yields_extend[join_keys + ['Average Yield(Kg/Ha)']])
    .dropna(subset=join_keys)
    .drop_duplicates(subset=join_keys, keep='first')
    .rename(columns={'Average Yield(Kg/Ha)': 'Previous Average Yield(Kg/Ha)'})
    .assign(Year=lambda lookup: lookup['Year'] + 1)
)

# A left join keeps the rows of yields_extend in their original order; because the lookup keys are
# unique, it also keeps the row count, which the positional assignment below relies on.
matched = pd.DataFrame(yields_extend[join_keys]).merge(previous_lookup, how='left', on=join_keys)
assert len(matched) == len(yields_extend), "left join must not change the number of rows"

yields_extend['Previous Average Yield(Kg/Ha)'] = matched['Previous Average Yield(Kg/Ha)'].to_numpy()

In [4]:
# Write the extended yields dataframe to disk as GeoJSON
yields_extend.to_file(
    r'C:\Users\mieke\Documents\Msc Thesis\Datasets\Yield Data\Karnataka\yields_temp.geojson',
    driver="GeoJSON",
)

  pd.Int64Index,


In [None]:
# This cell calculates the zonal statistics for each of the parameter images.
#
# Every sub-folder of `path` is treated as one parameter; every file in its 'sow_harvest' folder is
# one raster image. For each image the mean value inside each geometry is stored in a new column of
# geometries_adj, named after the file. For the 'LST' parameter the maximum is stored as well, in a
# column prefixed with 'max_'.
geometries_adj = geometries.copy()
periods = ['sow', 'harvest', 'between']
path = 'C:/Users/mieke/Documents/Msc Thesis/random_forest/preprocess/'
folder_names = os.listdir(path) # Contains all parameter names

for i in folder_names:
    file_names = os.listdir(path + i + '/sow_harvest') # Contains all files related to parameter i

    # For Land Surface Temperature, we are also interested in the maximum value;
    # both statistics are requested in one pass over the raster.
    wanted_stats = ['mean', 'max'] if i == 'LST' else ['mean']

    # The loop calculates the requested statistics for each boundary. We loop over each image.
    for file_number, j in enumerate(file_names):
        # The context manager closes the raster file as soon as the statistics are computed.
        with rasterio.open(path + i + '/sow_harvest/' + j, mode='r') as param:
            # Only the geometries are passed (not the ever-growing attribute table), and the
            # result is a plain list with one dict of statistics per geometry, in input order.
            # Only 'SSM' counts every pixel touched by a geometry (all_touched).
            zonal_param = rasterstats.zonal_stats(
                geometries_adj.geometry,
                param.read(1),
                affine=param.transform,
                stats=wanted_stats,
                nodata=param.nodata,
                all_touched=(i == 'SSM'),
            )

        # The column name is the file name without its last five characters
        # (presumably a '.tiff' extension - confirm against the files on disk).
        column_stem = j[:-5]

        # The output is added to the geometries_adj dataframe as a column. dtype=float turns a
        # missing statistic (None) into NaN so the column stays numeric.
        geometries_adj[column_stem] = np.array([stats['mean'] for stats in zonal_param], dtype=float)
        if i == 'LST':
            geometries_adj['max_' + column_stem] = np.array([stats['max'] for stats in zonal_param], dtype=float)

        # Print statements to check the progress
        print('File ' + str(file_number) + ' done.')
    print('Folder ' + i + ' done.')

In [None]:
# Write geometries_adj to disk as GeoJSON
geometries_adj.to_file(
    r'C:\Users\mieke\Documents\Msc Thesis\Datasets\Yield Data\Karnataka\geometries_adj.geojson',
    driver="GeoJSON",
)

In [None]:
# Attach the columns of geometries_adj to every yield record. Rows are matched on the
# location identifiers and the geometry itself; only rows present in both frames are kept.
merge_keys = ['Insurance Unit', 'District', 'Gram Panchayat/Hobli', 'geometry']
merge_temp_adj = yields_extend.merge(geometries_adj, how='inner', on=merge_keys)

In [None]:
# For every datapoint in the yield dataframe, copy the parameter value that belongs to that
# datapoint's own year and season into one column per parameter and period
# (e.g. '<parameter>_sow'). For 'LST' the same is done for the 'max_' columns.
periods = ['sow', 'harvest', 'between']
seasons = ['Kharif', 'Rabi', 'Summer']
path = 'C:/Users/mieke/Documents/Msc Thesis/random_forest/preprocess/'
folder_names = os.listdir(path) # Contains all parameter names

for parameter in folder_names:
    # An empty prefix selects the plain columns; 'LST' additionally has 'max_' columns
    prefixes = ['', 'max_'] if parameter == 'LST' else ['']

    for period in periods:
        for prefix in prefixes:
            target_column = prefix + parameter + '_' + period
            merge_temp_adj[target_column] = np.nan # Start empty, then fill per season and year

            for season in seasons:
                for year in range(2016, 2019):
                    # Source column that holds the values for this particular season and year
                    source_column = prefix + parameter + '_' + str(year) + '_' + season.lower() + '_' + period
                    in_season_and_year = (merge_temp_adj['Year'] == year) & (merge_temp_adj['Season'] == season)
                    merge_temp_adj.loc[in_season_and_year, target_column] = merge_temp_adj[source_column]

In [None]:
# Keep only the columns of interest to obtain the final geodataframe
final_columns = [
    # Identification of the datapoint
    'Year', 'Season', 'Insurance Unit', 'Gram Panchayat/Hobli', 'District', 'Taluk', 'Crop', 'IRR_RF',
    # Yield of this year and of the previous year
    'Average Yield(Kg/Ha)', 'Previous Average Yield(Kg/Ha)',
    # Parameter values per period
    'LAI_sow', 'LAI_between', 'LAI_harvest',
    'NDVI_sow', 'NDVI_between', 'NDVI_harvest',
    'SSM_sow', 'SSM_between', 'SSM_harvest',
    'LST_sow', 'LST_between', 'LST_harvest',
    'max_LST_sow', 'max_LST_between', 'max_LST_harvest',
    'geometry',
]
rf_geodataframe_adj = merge_temp_adj[final_columns].copy()

In [None]:
# Author's note: this dataframe is already a geodataframe with crs 4326.
# The attribute is only accessed here; it is not displayed because it is not the cell's last expression.
rf_geodataframe_adj.crs

# Save the final geodataframe, ready to be used for the model
rf_geodataframe_adj.to_file(
    r'C:\Users\mieke\Documents\Msc Thesis\Datasets\Yield Data\Karnataka\rf_geodataframe1.geojson',
    driver="GeoJSON",
)

In [3]:
# Example of how to reload a previously saved file
#yields_extend = gpd.read_file(r'C:\Users\mieke\Documents\Msc Thesis\Datasets\Yield Data\Karnataka\yields_temp.geojson') 