In [1]:
# Import packages
import geopandas as gpd
import numpy as np
import pandas as pd
import os
import re
from shapely.geometry import Polygon, MultiPolygon
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

In [2]:
# Create a shapeName column containing the names of each Insurance Unit in lowercase and stripped
insurance_names = pd.read_excel(r'C:\Users\mieke\Documents\Msc Thesis\Datasets\Yield data\Karnataka\insurance_names.xlsx', index_col = 0)
insurance_names['shapeName'] = insurance_names['Insurance Unit'].copy()
insurance_names.shapeName = insurance_names.shapeName.str.lower()
insurance_names.shapeName = insurance_names.shapeName.str.strip()

In [3]:
# Import files related to yields and insurance units
yields = pd.read_excel(r'C:\Users\mieke\Documents\Msc Thesis\Datasets\Yield data\Karnataka\yields.xlsx', index_col = 0)

In [4]:
# Het valt op dat harapanahalli vanaf 2018 bij ballari behoort en daarvoor bij davanagere. Als ik naar de administrative boundaries kijk, dan behoort harapanahalli wel tot ballari en niet tot davanagere
# Daarom veranderen we in de yield dataset voor alle datapunten waar Taluk = harapanahalli is gegeven, het district naar Ballari (eerst was dat Davanagere). Op internet lees je dat dit
# Taluk deel is van Vijayanagara
# Aangezien insurance_names gebasseerd is op yields, moeten we het ook daar aanpassen
yields['District'] = np.where(yields['Taluk'] == 'harapanahalli', 'ballari', yields['District'])
insurance_names['District'] = np.where(insurance_names['Taluk'] == 'harapanahalli', 'ballari', insurance_names['District'])

In [5]:
bound_types = ['gp', 'h'] # Different bound types
column_names = ['KGISGPName', 'KGISHobliN'] # Column names containing the name of the Insurance Unit
path_types = ['C:/Users/mieke/Documents/Msc Thesis/Datasets/Shapefiles/Karnataka_gp_shp/', 'C:/Users/mieke/Documents/Msc Thesis/Datasets/Shapefiles/Karnataka_h_shp/'] # Different paths
#non_indices = [[28, 23, 18, 4], [28, 26, 10]] # District names which do not occur in the yields dataframe (descending order is important) (eg 28:vijayanagara)

In [6]:
path = 'C:/Users/mieke/Documents/Msc Thesis/Datasets/Shapefiles/Taluk/Taluk.shp'
df_t = gpd.read_file(path)

path = 'C:/Users/mieke/Documents/Msc Thesis/Datasets/Shapefiles/District_2016/District_2016.shp'
df_d = gpd.read_file(path)
df_d['KGISDist_1'] = df_d['KGISDist_1'].str.lower()
df_district = df_d[['KGISDistri', 'KGISDist_1', 'geometry']]
# Because the district vijayanagara exists since 2020 and was part of ballari in the past, we join these polygons and call the joined polygon ballari
# We do this via QGIS: https://freegistutorial.com/how-to-export-layer-to-shapefile-on-qgis/, https://www.igismap.com/merge-two-polygons-points-polyline-shapefile/
# We should also make sure that we set 31 (vijayanagara) to 12 (ballari) in the other files

In [7]:
## BOUNDARY DATA
df_gp = pd.DataFrame() # create empty dataframe
df_h = pd.DataFrame() # create empty dataframe

# Loop over each of the bound types separately
for t in range(len(bound_types)):
    path = path_types[t] # Set path
    folder_names = os.listdir(path) # contains all foldernames within the folder

    district_names = [] # create list to include all unique district names within the bounds geodatframe

    # This for loop creates one big geodataframe out of all separate district wise geodataframes
    for i in folder_names:
        input_shp = gpd.read_file(path + i + '/' + i + '.shp')
        district_name = i[3:].lower() # district name as used within the bounds geodataframe
        district_names.append(district_name)
        input_shp['District'] = district_name
        input_shp['KGISDistri'] = i[:2]
        if t == 0:
            df_gp = pd.concat([df_gp, input_shp], ignore_index=True)
        elif t == 1:
            df_h = pd.concat([df_h, input_shp], ignore_index=True)

In [8]:
# Set Vijayanagara to ballari and, hence, 31 to 12. We merged them and called it ballari
df_h['District'] = np.where(df_h['District'] == 'vijayanagara', 'ballari', df_h['District'])
df_h['KGISDistri'] = np.where(df_h['KGISDistri'] == '31', '12', df_h['KGISDistri'])
df_t['KGISDistri'] = np.where(df_t['KGISDistri'] == '31', '12', df_t['KGISDistri'])
df_gp['KGISDistri'] = np.where(df_gp['KGISDistri'] == '31', '12', df_gp['KGISDistri'])
df_gp['District'] = np.where(df_gp['District'] == 'vijayanagara', 'ballari', df_gp['District'])

In [9]:
# Create dataframe which combines all taluks with its corresponding district
taluk_district = df_t.merge(df_district, how = 'left', on = 'KGISDistri')
taluk_district = taluk_district[['KGISTalukC', 'KGISTalukN', 'KGISDistri', 'KGISDist_1', 'geometry_x']]
taluk_district['KGISTalukN'] = taluk_district['KGISTalukN'].str.lower().drop_duplicates()
taluk_district = taluk_district.sort_values(['KGISTalukN', 'KGISDist_1'], ascending = [True, True], ignore_index=True)
taluk_district = taluk_district.rename(columns = {'KGISDist_1': 'District', 'geometry_x': 'geometry'})
taluk_names = taluk_district['KGISTalukN'] # List of all taluk names

We zetten nu in df_grampan, df_hobli, insurance_names en taluk_district dezelfde namen voor de districts en passen alles aan, zodat vijayanagara onder ballari valt.

In [10]:
# Create shapeName column containing the Hobli/Gram Panchayat name in lower case
# Create a geodataframe for Hobli/Gram Panchayat with solely the columns of interest

df_hobli = df_h[['KGISHobliN', 'KGISTalukC', 'District', 'SHAPE_STAr', 'geometry']]
df_hobli['shapeName'] = df_hobli['KGISHobliN'].str.lower()

df_grampan = df_gp[['KGISGPName', 'KGISDistri', 'District', 'SHAPE_STAr', 'geometry']]
df_grampan['shapeName'] = df_grampan['KGISGPName'].str.lower()
df_grampan = df_grampan[~df_grampan['shapeName'].isna()] # drop NaN shapeNames

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [11]:
## We would like to set all district names equal to the names used within the taluk_district gdf (containing administrative district names)
district_names = taluk_district['District'].unique() # list of all unique district names within the yields dataframe
district_names = sorted(district_names) # sort the district names in ascending order

## For insurance_names
unique_districts = insurance_names['District'].unique() # list of all unique district names within the yields dataframe
unique_districts = sorted(unique_districts) # sort the district names in ascending order

# Make sure that the district names within the yields dataframe are equal to the spelling of the district names within the bounds geodataframe
for i in range(len(unique_districts)):
    insurance_names['District'] = np.where(insurance_names['District'] == unique_districts[i], district_names[i], insurance_names['District'])

## For yields
unique_districts = yields['District'].unique() # list of all unique district names within the yields dataframe
unique_districts = sorted(unique_districts) # sort the district names in ascending order

# Make sure that the district names within the yields dataframe are equal to the spelling of the district names within the bounds geodataframe
for i in range(len(unique_districts)):
    yields['District'] = np.where(yields['District'] == unique_districts[i], district_names[i], yields['District'])

## For df_hobli
unique_districts = df_hobli['District'].unique() # list of all unique district names within the yields dataframe
unique_districts = sorted(unique_districts) # sort the district names in ascending order

# Make sure that the district names within the yields dataframe are equal to the spelling of the district names within the bounds geodataframe
for i in range(len(unique_districts)):
    df_hobli['District'] = np.where(df_hobli['District'] == unique_districts[i], district_names[i], df_hobli['District'])


## For df_grampan
unique_districts = df_grampan['District'].unique() # list of all unique district names within the yields dataframe
unique_districts = sorted(unique_districts) # sort the district names in ascending order

# Make sure that the district names within the yields dataframe are equal to the spelling of the district names within the bounds geodataframe
for i in range(len(unique_districts)):
    df_grampan['District'] = np.where(df_grampan['District'] == unique_districts[i], district_names[i], df_grampan['District'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [12]:
# This cell checks whether each taluk intersects the corresponding district
# Note: we also used contains instead of intersects, but this did not give the desired result
num_intersects = []
for i in taluk_district.index:
    polya = np.array(df_district[df_district['KGISDist_1'] == taluk_district.loc[i, 'District']]['geometry'])[0] # District polygon
    polyb = np.array(taluk_district[taluk_district['KGISTalukN'] == taluk_district.loc[i, 'KGISTalukN']]['geometry'])[0] # Taluk polygon
    num_intersects.append(polya.intersects(polyb))
sum(num_intersects) # Counts the number of True

235

We hebben nu de taluk dataset samengevoegd met de district names van 2016. Nu willen we ook dat in beide bestanden dezelfde namen voor de taluks wordt gebruikt. Hiervoor kijken we dus naar insurance_names en taluk_district.

In [13]:
## We now check whether the same Taluk names are used within the yield and boundary files
taluk_yield = insurance_names[['Taluk', 'District']].sort_values(['Taluk', 'District'], ignore_index=True).drop_duplicates()
taluk_admin = taluk_district[['KGISTalukN', 'District']].sort_values(['KGISTalukN', 'District'], ignore_index=True).drop_duplicates()

taluk_yield['Taluk_name'] = np.nan
taluk_yield['Similarity'] = np.nan

for i in taluk_yield.index:
    indices = []
    df_temp = taluk_admin[taluk_admin['District'] == taluk_yield['District'].loc[i]]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(taluk_yield.loc[i, 'Taluk'],df_temp.loc[j, 'KGISTalukN'])) # in this case, better than sort_ratio
    taluk_yield.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    taluk_yield.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    taluk_yield.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name

In [14]:
## This cell checks the datapoints for which the taluk names from the yield data are not exactly equal to the matched taluk names within the administrative boundary files  
to_check = taluk_yield[taluk_yield['Taluk'] != taluk_yield['Taluk_name']]

## The taluks are maybe not correctly matched are: hanur (64), hubballi (80), kalaburgi (95), kalaburgi north (96), sandur (156) 
# We use the following website to check which match is correct/most plausible
# https://karnataka.gov.in/district/en
# Besides, we also checked taluk_names to see whether a taluk name within another district is very much alike the taluk name of interest

# hanur (64/1897):
taluk_district[taluk_district['District'] == 'chamarajanagara']
insurance_names[insurance_names['Taluk'] == 'hanur']
# The match does not need to change: hanur == kollegala(hanur) 100% sure

# # hubballi (80/2398) (nagara means city)
taluk_district[taluk_district['District'] == 'dharwad']
insurance_names[insurance_names['Taluk'] == 'hubballi']
# # The match needs to change: hubballi == hubli 100% sure
# https://en.wikipedia.org/wiki/Hubli_Taluk
taluk_yield.loc[2398, 'Taluk_name'] = 'hubli'

# # kalaburgi (95/2878)
taluk_district[taluk_district['District'] == 'kalburgi']
insurance_names[insurance_names['Taluk'] == 'kalaburgi']
# # The match needs to change: kalaburgi == gulbarga 100% sure
# https://vlist.in/sub-district/05582.html
taluk_yield.loc[2878, 'Taluk_name'] = 'gulbarga'

# # kalaburgi north (96/2919)
taluk_district[taluk_district['District'] == 'kalburgi']
insurance_names[insurance_names['Taluk'] == 'kalaburgi north']
# # The match needs to change: kalaburgi north == kamalapura 100% sure
taluk_yield.loc[2919, 'Taluk_name'] = 'kamalapura'

# # sandur (156/4487)
taluk_district[taluk_district['District'] == 'ballari']
insurance_names[insurance_names['Taluk'] == 'sandur']
# # The match does not need to change: sandur == sonduru 100% sure

Unnamed: 0,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,shapeName
58,agrahara,gp,ballari,sandur,agrahara
254,anthapura,gp,ballari,sandur,anthapura
544,bandri,gp,ballari,sandur,bandri
824,bhujanganagara,gp,ballari,sandur,bhujanganagara
911,bommagatta,gp,ballari,sandur,bommagatta
1160,choranuru,h,ballari,sandur,choranuru
1161,choranuru,gp,ballari,sandur,choranuru
1239,devagiri,gp,ballari,sandur,devagiri
1517,gollalingamanahalli,gp,ballari,sandur,gollalingamanahalli
1665,h.k.halli,gp,ballari,sandur,h.k.halli


In [15]:
## Now, also check whether each polygon is used for just one taluk
print(np.size(taluk_yield,0))
print(np.size(taluk_yield[['Taluk_name', 'District']].drop_duplicates(),0))
taluk_yield[taluk_yield[['Taluk_name', 'District']].duplicated(keep=False)]
# This shows that we use one taluk name more than once: kamalapura. This Taluk name is used for kalaburgi north and kamalapura. 
# However, after studying the yield dataset, this seems as a valid choice 

#yields[(yields['Taluk'] == 'kamalapura')]
#yields[(yields['Taluk'] == 'kalaburgi north')]

195
194


Unnamed: 0,Taluk,District,Taluk_name,Similarity,District_name
2919,kalaburgi north,kalburgi,kamalapura,57.0,kalburgi
2954,kamalapura,kalburgi,kamalapura,100.0,kalburgi


In [16]:
# Hence, we now know which taluk name within the administrative boundary files corresponds to the taluk name within the yield data. 
# For ease, we replace the taluk names within the yield file with the taluk names within the administrative boundary files.
for i in taluk_yield.index:
    yields['Taluk'] = np.where(yields['Taluk'] == taluk_yield.loc[i, 'Taluk'], taluk_yield.loc[i, 'Taluk_name'], yields['Taluk'])
    insurance_names['Taluk'] = np.where(insurance_names['Taluk'] == taluk_yield.loc[i, 'Taluk'], taluk_yield.loc[i, 'Taluk_name'], insurance_names['Taluk'])


We hebben nu de taluk dataset samengevoegd met de district names van 2016. Nu willen we de hobli's en gram panchayat's koppelen aan de taluks. We beginnen hierbij met de hobli's.

In [17]:
# Note that for the hobli level the corresponding taluk names are given
df_hobli_taluk = df_hobli.merge(taluk_district, how = 'left', on = ['KGISTalukC'])
df_hobli_taluk = df_hobli_taluk[['KGISHobliN', 'shapeName', 'KGISTalukC', 'KGISTalukN', 'KGISDistri', 'District_x', 'SHAPE_STAr', 'geometry_x']]
df_hobli_taluk = df_hobli_taluk.rename(columns = {'District_x': 'District', 'geometry_x': 'geometry'})

In [18]:
print(df_t.crs) # epsg:32643
print(df_h.crs) # epsg:32643
print(df_gp.crs) # epsg:32643

# Transform the geometry and yields_geometry dataframes to geodataframes
# Then, save the geometry and yields_geometry geodataframes
taluk_district = gpd.GeoDataFrame(taluk_district, geometry = taluk_district.geometry, crs = {'init': 'epsg:32643'}) # 32643 is the original crs (found by using .crs for one of the input shp files)
#taluk_district.geometry = taluk_district.geometry.to_crs(epsg = 4326) # Set crs to 4326

epsg:32643
epsg:32643
epsg:32643


  in_crs_string = _prepare_from_proj_string(in_crs_string)


### HIER ZIJN WE GESTOPT MET OPNIEUW RUNNEN VOOR 'SHAPE_STAr'

In [19]:
gpd_join_temp = gpd.sjoin(df_grampan, taluk_district, how='left', predicate='intersects') # 12189
gpd_join_temp1 = gpd_join_temp[gpd_join_temp['District_left'] == gpd_join_temp['District_right']].reset_index() # 10255
gpd_join_temp1['area'] = np.nan
gpd_join_temp1 = gpd_join_temp1
for i in gpd_join_temp1.index:
    p = np.array(gpd_join_temp1[(gpd_join_temp1['geometry'] == gpd_join_temp1.loc[i, 'geometry'])]['geometry'])[0]
    q = np.array(taluk_district[(taluk_district['District'] == gpd_join_temp1.loc[i, 'District_left']) & (taluk_district['KGISTalukN'] == gpd_join_temp1.loc[i, 'KGISTalukN'])]['geometry'])[0]
    gpd_join_temp1.loc[i, 'area'] = p.intersection(q).area
gpd_join_temp1a = gpd_join_temp1.sort_values(['index', 'area'], ascending = [True, False]) # sort values
gpd_join_temp2 = gpd_join_temp1a.drop_duplicates(subset = 'index', keep = 'first') # only keep the combination for which the intersected area is largest

KeyboardInterrupt: 

In [20]:
gpd_join_temp1.to_file(r'C:\Users\mieke\Documents\Msc Thesis\Notebooks Python\gpd_join_temp1.geojson', driver="GeoJSON") # Save the derived datframe as it takes a long time to run

  pd.Int64Index,


In [24]:
## RUN THIS CELL IF YOU WOULD NOT LIKE TO RECREATE THIS EXTENSIVE DATAFRAME AGAIN
gpd_join_temp1 = gpd.read_file(r'C:\Users\mieke\Documents\Msc Thesis\Notebooks Python\gpd_join_temp1.geojson')
gpd_join_temp1a = gpd_join_temp1.sort_values(['index', 'area'], ascending = [True, False]) # sort values
gpd_join_temp2 = gpd_join_temp1a.drop_duplicates(subset = 'index', keep = 'first') # only keep the combination for which the intersected area is largest


In [25]:
# We select the columns of interest from the derived geodataframe
df_grampan_taluk = gpd_join_temp2[['KGISGPName', 'shapeName', 'KGISTalukC', 'KGISTalukN', 'KGISDistri_left', 'District_left', 'SHAPE_STAr', 'geometry']]
df_grampan_taluk = df_grampan_taluk.rename(columns = {'KGISDistri_left': 'KGISDistri', 'District_left': 'District'})

In [26]:
# Next, we drop all duplicates for which we have multiple polygons with equal Insurance Unit name, taluk name and district name. We keep the polygon with the largest area.
df_hobli_taluk = df_hobli_taluk.sort_values(['shapeName', 'KGISTalukN', 'District', 'SHAPE_STAr'], ascending = [True, True, True, False], ignore_index = True)
df_hobli_taluk_sel = df_hobli_taluk.drop_duplicates(subset = ['shapeName', 'KGISTalukN', 'District'], keep = 'first', ignore_index = True)

df_grampan_taluk = df_grampan_taluk.sort_values(['shapeName', 'KGISTalukN', 'District', 'SHAPE_STAr'], ascending = [True, True, True, False], ignore_index = True)
df_grampan_taluk_sel = df_grampan_taluk.drop_duplicates(subset = ['shapeName', 'KGISTalukN', 'District'], keep = 'first', ignore_index = True)

In [27]:
# Misschien kunnen we beter eerst de insurance_names matchen met de gp's en hoblis. Voor hobli's kunnen we dit doen op district+taluk niveau. Voor gp's kunnen we dit doen op district niveau
hobli = insurance_names[insurance_names['Gram Panchayat/Hobli'] == 'h'].sort_values(['shapeName', 'Taluk', 'District'], ignore_index=True).drop_duplicates(ignore_index=True)
grampan = insurance_names[insurance_names['Gram Panchayat/Hobli'] == 'gp'].sort_values(['shapeName', 'Taluk', 'District'], ignore_index=True).drop_duplicates(ignore_index=True)

In [28]:
hobli['Insurance_name'] = np.nan
hobli['shape_name'] = np.nan
hobli['Taluk_name'] = np.nan
hobli['District_name'] = np.nan
hobli['Similarity'] = np.nan
hobli['geometry'] = np.nan

for i in hobli.index:
    indices = []
    df_temp = df_hobli_taluk_sel[(df_hobli_taluk_sel['District'] == hobli['District'].loc[i]) & (df_hobli_taluk_sel['KGISTalukN'] == hobli['Taluk'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

In [29]:
grampan['Insurance_name'] = np.nan
grampan['shape_name'] = np.nan
grampan['Taluk_name'] = np.nan
grampan['District_name'] = np.nan
grampan['Similarity'] = np.nan
grampan['geometry'] = np.nan

for i in grampan.index:
    indices = []
    df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i]) & (df_grampan_taluk_sel['KGISTalukN'] == grampan['Taluk'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

In [30]:
check_hobli = hobli[hobli['shapeName'] != hobli['shape_name']] # We moeten er 47 handmatig toewijzen
check_hobli_df = check_hobli[['shapeName', 'shape_name']]
hobli_names = sorted(df_hobli_taluk_sel['shapeName'].unique())

check_grampan = grampan[grampan['shapeName'] != grampan['shape_name']] # we moeten er handmatig 1757 toewijzen
check_grampan_df = check_grampan[['shapeName', 'shape_name']]
grampan_names = sorted(df_grampan_taluk_sel['shapeName'].unique())

In [31]:
for i in check_hobli.index:
    indices = []
    df_temp = df_hobli_taluk_sel[(df_hobli_taluk_sel['District'] == hobli['District'].loc[i]) & (df_hobli_taluk_sel['KGISTalukN'] == hobli['Taluk'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_hobli2 = hobli[hobli['shapeName'] != hobli['shape_name']] # We moeten er 5 handmatig toewijzen
check_hobli_df2 = check_hobli2[['shapeName', 'shape_name']]

for i in check_hobli2.index:
    indices = []
    df_temp = df_hobli_taluk_sel[(df_hobli_taluk_sel['District'] == hobli['District'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_hobli3 = hobli[hobli['shapeName'] != hobli['shape_name']] # We moeten er 5 handmatig toewijzen
check_hobli_df3 = check_hobli3[['shapeName', 'shape_name']]

for i in check_hobli3.index:
    indices = []
    df_temp = df_hobli_taluk_sel[(df_hobli_taluk_sel['District'] == hobli['District'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_hobli4 = hobli[hobli['shapeName'] != hobli['shape_name']] # We moeten er 5 handmatig toewijzen
check_hobli_df4 = check_hobli4[['shapeName', 'shape_name']]

# There are just 2 hoblis left which should be matched manually: bidadi and kailancha
# The boundary data has options: bidadi 1 and bidadi 2, kailancha-1 and kailancha-2
# These options do not already occur in the hobli names within the yield data
hobli_yield = hobli['shapeName'].unique()
# Hence, we select the ones with the number 1 (also because they are slightly bigger)
# df_hobli_taluk_sel[df_hobli_taluk_sel['shapeName'] == 'kailancha-1']

# We do this by changing the boundary name within the geodataframe
df_hobli_taluk_sel['shapeName'] = np.where(df_hobli_taluk_sel['shapeName'] == 'bidadi 1', 'bidadi', df_hobli_taluk_sel['shapeName'])
df_hobli_taluk_sel['shapeName'] = np.where(df_hobli_taluk_sel['shapeName'] == 'kailancha-1', 'kailancha', df_hobli_taluk_sel['shapeName'])

for i in check_hobli4.index:
    indices = []
    df_temp = df_hobli_taluk_sel[(df_hobli_taluk_sel['District'] == hobli['District'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_hobli5 = hobli[hobli['shapeName'] != hobli['shape_name']] 
check_hobli_df5 = check_hobli5[['shapeName', 'shape_name']] # empty

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hobli_taluk_sel['shapeName'] = np.where(df_hobli_taluk_sel['shapeName'] == 'bidadi 1', 'bidadi', df_hobli_taluk_sel['shapeName'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hobli_taluk_sel['shapeName'] = np.where(df_hobli_taluk_sel['shapeName'] == 'kailancha-1', 'kailancha', df_hobli_taluk_sel['shapeName'])


In [32]:
for i in check_grampan.index:
    indices = []
    df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i]) & (df_grampan_taluk_sel['KGISTalukN'] == grampan['Taluk'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan2 = grampan[grampan['shapeName'] != grampan['shape_name']] # We moeten er 5 handmatig toewijzen
check_grampan_df2 = check_grampan2[['shapeName', 'shape_name']]

for i in check_grampan2.index:
    indices = []
    df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan3 = grampan[grampan['shapeName'] != grampan['shape_name']] # We moeten er 5 handmatig toewijzen
check_grampan_df3 = check_grampan3[['shapeName', 'shape_name']]

for i in check_grampan3.index:
    indices = []
    df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan4 = grampan[grampan['shapeName'] != grampan['shape_name']] # We moeten er 5 handmatig toewijzen
check_grampan_df4 = check_grampan4[['shapeName', 'shape_name']]

# There are just 2 hoblis left which should be matched manually: bidadi and kailancha
# The boundary data has options: bidadi 1 and bidadi 2, kailancha-1 and kailancha-2
# These options do not already occur in the hobli names within the yield data
grampan_yield = grampan['shapeName'].unique()
# Hence, we select the ones with the number 1 (also because they are slightly bigger)
# df_hobli_taluk_sel[df_hobli_taluk_sel['shapeName'] == 'kailancha-1']

# We do this by changing the boundary name within the geodataframe
# df_grampan_taluk_sel['shapeName'] = np.where(df_grampan_taluk_sel['shapeName'] == 'bidadi 1', 'bidadi', df_grampan_taluk_sel['shapeName'])
# df_grampan_taluk_sel['shapeName'] = np.where(df_grampan_taluk_sel['shapeName'] == 'kailancha-1', 'kailancha', df_grampan_taluk_sel['shapeName'])

# for i in check_grampan4.index:
#     indices = []
#     df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i])]
#     for j in df_temp.index:
#        indices.append(fuzz.token_sort_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
#     grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
#     grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
#     grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
#     grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
#     grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
#     grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

# check_grampan5 = grampan[grampan['shapeName'] != grampan['shape_name']] 
# check_grampan_df5 = check_grampan5[['shapeName', 'shape_name']] # empty

In [33]:
for i in check_grampan4.index:
    indices = []
    df_temp = df_grampan_taluk_sel.copy()#[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan5 = grampan[grampan['shapeName'] != grampan['shape_name']] 
check_grampan_df5 = check_grampan5[['shapeName', 'shape_name']] # empty

for i in check_grampan5.index:
    indices = []
    df_temp = df_grampan_taluk_sel.copy()#[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan6 = grampan[grampan['shapeName'] != grampan['shape_name']] 
check_grampan_df6 = check_grampan6[['shapeName', 'shape_name']] # empty

In [34]:
hobli_names_taluk = df_hobli_taluk[['shapeName', 'KGISTalukN']].sort_values(['shapeName', 'KGISTalukN'])
grampan_names_taluk = df_grampan_taluk[['shapeName', 'KGISTalukN']].sort_values(['shapeName', 'KGISTalukN'])

In [42]:
hobli_gdf = gpd.GeoDataFrame(hobli, geometry = hobli.geometry, crs = {'init': 'epsg:32643'}) # 32643 is the original crs (found by using .crs for one of the input shp files)
grampan_gdf = gpd.GeoDataFrame(grampan, geometry = grampan.geometry, crs = {'init': 'epsg:32643'}) # 32643 is the original crs (found by using .crs for one of the input shp files)

hobli_gdf.to_file(r'C:\Users\mieke\Documents\Msc Thesis\Notebooks Python\hobli.geojson', driver="GeoJSON") # Save the derived datframe as it takes a long time to run
grampan_gdf.to_file(r'C:\Users\mieke\Documents\Msc Thesis\Notebooks Python\grampan.geojson', driver="GeoJSON") # Save the derived datframe as it takes a long time to run

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  pd.Int64Index,
  pd.Int64Index,


TROEP!!

In [41]:
len(yields[yields['Crop'] == 'paddy']['Insurance Unit'].unique())
len(yields[yields['Crop'] == 'maize (makka)']['Insurance Unit'].unique())
len(yields[yields['Crop'] == 'sorghum (jowar/great millet)']['Insurance Unit'].unique())

1962

In [30]:
# i=225
# gdp_temp = taluk_district[taluk_district['District'] == df_grampan.loc[i, 'District']] # select taluks within the district for i
# selected = df_grampan[(df_grampan['shapeName'] == df_grampan.loc[i,'shapeName']) & (df_grampan['District'] == df_grampan.loc[i,'District']) & (df_grampan['geometry'] == df_grampan.loc[i,'geometry'])]
# gpd_join_temp = gpd.overlay(selected, gpd_temp, how='intersection')
# gpd_join_temp
# # df_join_temp = gpd.sjoin(selected, gdp_temp, how='left', predicate='intersects')
# # df_join_temp

In [31]:
#geopandas.overlay(df1, df2, how='intersection', keep_geom_type=None, make_valid=True)
# Misschien kunnen we een loop schrijven die per gram panchayat checkt welke taluks ie intersect. Maar dan wel alleen voor de taluks binnen het bijbehorende district
# df_gp_taluk = gpd.GeoDataFrame()
# lengths = []
# for i in df_grampan.index:
#     gpd_temp = taluk_district[taluk_district['District'] == df_grampan.loc[i, 'District']] # select taluks within the district for i
#     selected = df_grampan[(df_grampan['shapeName'] == df_grampan.loc[i,'shapeName']) & (df_grampan['District'] == df_grampan.loc[i,'District'])]
#     gpd_join_temp = gpd.overlay(selected, gpd_temp, how='intersection')
#     df_gp_taluk = pd.concat([df_gp_taluk, gpd_join_temp], ignore_index = True)
#     lengths.append(np.size(selected,0))

# df_gp_taluk

In [None]:
# Misschien kunnen we een loop schrijven die per gram panchayat checkt welke taluks ie intersect. Maar dan wel alleen voor de taluks binnen het bijbehorende district
# df_gp_taluk = gpd.GeoDataFrame()
# lengths = []
# for i in df_grampan.index:
#     gpd_temp = taluk_district[taluk_district['District'] == df_grampan.loc[i, 'District']] # select taluks within the district for i
#     selected = df_grampan[(df_grampan['shapeName'] == df_grampan.loc[i,'shapeName']) & (df_grampan['District'] == df_grampan.loc[i,'District']) & (df_grampan['geometry'] == df_grampan.loc[i,'geometry'])]
#     gpd_join_temp = gpd.sjoin(selected, gpd_temp, how='left', predicate='intersects')
#     df_gp_taluk = pd.concat([df_gp_taluk, gpd_join_temp], ignore_index = True)
#     lengths.append(np.size(selected,0))

# df_gp_taluk

In [None]:
# i = 0
# p = np.array(gpd_join_temp1[(gpd_join_temp1['geometry'] == gpd_join_temp1.loc[i, 'geometry'])]['geometry'])[0]
# q = np.array(taluk_district[(taluk_district['District'] == gpd_join_temp1.loc[i, 'District_left']) & (taluk_district['KGISTalukN'] == gpd_join_temp1.loc[i, 'KGISTalukN'])]['geometry'])[0]

# p.intersection(q).area

24414552.645020902

Onthoud ff dat er ook spelfouten kunnen zijn binnen de insurance unit names in de crop yield dataset.