In [1]:
# Import packages
import geopandas as gpd
import numpy as np
import pandas as pd
import os
import re
from shapely.geometry import Polygon, MultiPolygon
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

In [2]:
# Import files related to yields and insurance units
yields = pd.read_excel(r'D:\other_thesis\codes_latestversion\general\files\yields.xlsx', index_col = 0)
unit_names = pd.read_excel(r'D:\other_thesis\codes_latestversion\general\files\unit_names.xlsx', index_col = 0)

In [3]:
unit_names['shapeName'] = unit_names['Insurance Unit'].copy()
unit_names.shapeName = unit_names.shapeName.str.lower()
unit_names.shapeName = unit_names.shapeName.str.strip()

In [4]:
# Het valt op dat harapanahalli vanaf 2018 bij ballari behoort en daarvoor bij davanagere. Als ik naar de administrative boundaries kijk, dan behoort harapanahalli wel tot ballari en niet tot davanagere
# Daarom veranderen we in de yield dataset voor alle datapunten waar Taluk = harapanahalli is gegeven, het district naar Ballari (eerst was dat Davanagere). Op internet lees je dat dit
# Taluk deel is van Vijayanagara
# Aangezien insurance_names gebasseerd is op yields, moeten we het ook daar aanpassen
yields['District'] = np.where(yields['Taluk'] == 'harapanahalli', 'ballari', yields['District'])
unit_names['District'] = np.where(unit_names['Taluk'] == 'harapanahalli', 'ballari', unit_names['District'])

Create dataframes containing all geometries on either gram panchayat or hobli level

In [5]:
bound_types = ['gp', 'h'] # Different bound types
column_names = ['KGISGPName', 'KGISHobliN'] # Column names containing the name of the Insurance Unit
path_types = ['C:/Users/mieke/Documents/Msc Thesis/Datasets/Shapefiles/Karnataka_gp_shp/', 'C:/Users/mieke/Documents/Msc Thesis/Datasets/Shapefiles/Karnataka_h_shp/'] # Different paths
#non_indices = [[28, 23, 18, 4], [28, 26, 10]] # District names which do not occur in the yields dataframe (descending order is important) (eg 28:vijayanagara)

In [6]:
path = 'C:/Users/mieke/Documents/Msc Thesis/Datasets/Shapefiles/Taluk/Taluk.shp'
df_t = gpd.read_file(path)

path = 'C:/Users/mieke/Documents/Msc Thesis/Datasets/Shapefiles/District_2016/District_2016.shp'
df_d = gpd.read_file(path)
df_d['KGISDist_1'] = df_d['KGISDist_1'].str.lower()
df_district = df_d[['KGISDistri', 'KGISDist_1', 'geometry']]
# Because the district vijayanagara exists since 2020 and was part of ballari in the past, we join these polygons and call the joined polygon ballari
# We do this via QGIS: https://freegistutorial.com/how-to-export-layer-to-shapefile-on-qgis/, https://www.igismap.com/merge-two-polygons-points-polyline-shapefile/
# We should also make sure that we set 31 (vijayanagara) to 12 (ballari) in the other files

In [7]:
## BOUNDARY DATA
df_gp = pd.DataFrame() # create empty dataframe
df_h = pd.DataFrame() # create empty dataframe

# Loop over each of the bound types separately
for t in range(len(bound_types)):
    path = path_types[t] # Set path
    folder_names = os.listdir(path) # contains all foldernames within the folder

    district_names = [] # create list to include all unique district names within the bounds geodatframe

    # This for loop creates one big geodataframe out of all separate district wise geodataframes
    for i in folder_names:
        input_shp = gpd.read_file(path + i + '/' + i + '.shp')
        district_name = i[3:].lower() # district name as used within the bounds geodataframe
        district_names.append(district_name)
        input_shp['District'] = district_name
        input_shp['KGISDistri'] = i[:2]
        if t == 0:
            df_gp = pd.concat([df_gp, input_shp], ignore_index=True)
        elif t == 1:
            df_h = pd.concat([df_h, input_shp], ignore_index=True)

In [8]:
# Set Vijayanagara to ballari and, hence, 31 to 12. We merged them and called it ballari
df_h['District'] = np.where(df_h['District'] == 'vijayanagara', 'ballari', df_h['District'])
df_h['KGISDistri'] = np.where(df_h['KGISDistri'] == '31', '12', df_h['KGISDistri'])
df_t['KGISDistri'] = np.where(df_t['KGISDistri'] == '31', '12', df_t['KGISDistri'])
df_gp['KGISDistri'] = np.where(df_gp['KGISDistri'] == '31', '12', df_gp['KGISDistri'])
df_gp['District'] = np.where(df_gp['District'] == 'vijayanagara', 'ballari', df_gp['District'])

In [9]:
df_t.head(1)

Unnamed: 0,KGISTalukC,LGD_TalukC,KGISTalukN,KGISDistri,created_us,created_da,last_edite,last_edi_1,SHAPE_STAr,SHAPE_STLe,geometry
0,101,5433,Chikkodi,1,SA,2021-08-21,SURESHBV,2022-05-30,843157000.0,227290.811742,"POLYGON ((461944.526 1803587.439, 461941.512 1..."


In [10]:
df_t['KGISTalukN'] = df_t['KGISTalukN'].str.lower()
df_t['neighbors'] = np.nan
df_t['neighbors'] = df_t['neighbors'].astype('object')
for index, row in df_t.iterrows():
    neighbors = np.array(df_t[df_t.geometry.touches(row['geometry'])].KGISTalukN)
    #overlapping neighbors use if discrepances found with touches
    overlap = np.array(df_t[df_t.geometry.overlaps(row['geometry'])].KGISTalukN)

    df_t.at[index, 'neighbors'] = np.union1d(neighbors, overlap)

In [11]:
# Create dataframe which combines all taluks with its corresponding district
taluk_district = df_t.merge(df_district, how = 'left', on = 'KGISDistri')
taluk_district = taluk_district[['KGISTalukC', 'KGISTalukN', 'KGISDistri', 'KGISDist_1', 'geometry_x', 'neighbors']]
taluk_district['KGISTalukN'] = taluk_district['KGISTalukN'].str.lower().drop_duplicates()
taluk_district = taluk_district.sort_values(['KGISTalukN', 'KGISDist_1'], ascending = [True, True], ignore_index=True)
taluk_district = taluk_district.rename(columns = {'KGISDist_1': 'District', 'geometry_x': 'geometry'})
taluk_names = taluk_district['KGISTalukN'] # List of all taluk names

In [12]:
# Create shapeName column containing the Hobli/Gram Panchayat name in lower case
# Create a geodataframe for Hobli/Gram Panchayat with solely the columns of interest

df_hobli = df_h[['KGISHobliN', 'KGISTalukC', 'District', 'SHAPE_STAr', 'created_da', 'last_edi_1', 'geometry']]
df_hobli['shapeName'] = df_hobli['KGISHobliN'].str.lower()

df_grampan = df_gp[['KGISGPName', 'KGISDistri', 'District', 'SHAPE_STAr', 'created_da', 'last_edi_1', 'geometry']]
df_grampan['shapeName'] = df_grampan['KGISGPName'].str.lower()
df_grampan = df_grampan[~df_grampan['shapeName'].isna()] # drop NaN shapeNames

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [13]:
## We would like to set all district names equal to the names used within the taluk_district gdf (containing administrative district names)
district_names = taluk_district['District'].unique() # list of all unique district names within the yields dataframe
district_names = sorted(district_names) # sort the district names in ascending order

## For unit_names
unique_districts = unit_names['District'].unique() # list of all unique district names within the yields dataframe
unique_districts = sorted(unique_districts) # sort the district names in ascending order

# Make sure that the district names within the yields dataframe are equal to the spelling of the district names within the bounds geodataframe
for i in range(len(unique_districts)):
    unit_names['District'] = np.where(unit_names['District'] == unique_districts[i], district_names[i], unit_names['District'])

## For yields
unique_districts = yields['District'].unique() # list of all unique district names within the yields dataframe
unique_districts = sorted(unique_districts) # sort the district names in ascending order

# Make sure that the district names within the yields dataframe are equal to the spelling of the district names within the bounds geodataframe
for i in range(len(unique_districts)):
    yields['District'] = np.where(yields['District'] == unique_districts[i], district_names[i], yields['District'])

## For df_hobli
unique_districts = df_hobli['District'].unique() # list of all unique district names within the yields dataframe
unique_districts = sorted(unique_districts) # sort the district names in ascending order

# Make sure that the district names within the yields dataframe are equal to the spelling of the district names within the bounds geodataframe
for i in range(len(unique_districts)):
    df_hobli['District'] = np.where(df_hobli['District'] == unique_districts[i], district_names[i], df_hobli['District'])


## For df_grampan
unique_districts = df_grampan['District'].unique() # list of all unique district names within the yields dataframe
unique_districts = sorted(unique_districts) # sort the district names in ascending order

# Make sure that the district names within the yields dataframe are equal to the spelling of the district names within the bounds geodataframe
for i in range(len(unique_districts)):
    df_grampan['District'] = np.where(df_grampan['District'] == unique_districts[i], district_names[i], df_grampan['District'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [14]:
# This cell checks whether each taluk intersects the corresponding district
# Note: we also used contains instead of intersects, but this did not give the desired result
num_intersects = []
for i in taluk_district.index:
    polya = np.array(df_district[df_district['KGISDist_1'] == taluk_district.loc[i, 'District']]['geometry'])[0] # District polygon
    polyb = np.array(taluk_district[taluk_district['KGISTalukN'] == taluk_district.loc[i, 'KGISTalukN']]['geometry'])[0] # Taluk polygon
    num_intersects.append(polya.intersects(polyb))
sum(num_intersects) # Counts the number of True

235

We hebben nu de taluk dataset samengevoegd met de district names van 2016. Nu willen we ook dat in beide bestanden dezelfde namen voor de taluks wordt gebruikt. Hiervoor kijken we dus naar unit_names en taluk_district.

In [15]:
## We now check whether the same Taluk names are used within the yield and boundary files
taluk_yield = unit_names[['Taluk', 'District']].sort_values(['Taluk', 'District'], ignore_index=True).drop_duplicates()
taluk_admin = taluk_district[['KGISTalukN', 'District']].sort_values(['KGISTalukN', 'District'], ignore_index=True).drop_duplicates()

taluk_yield['Taluk_name'] = np.nan
taluk_yield['Similarity'] = np.nan

for i in taluk_yield.index:
    indices = []
    df_temp = taluk_admin[taluk_admin['District'] == taluk_yield['District'].loc[i]]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(taluk_yield.loc[i, 'Taluk'],df_temp.loc[j, 'KGISTalukN'])) # in this case, better than sort_ratio
    taluk_yield.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    taluk_yield.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    taluk_yield.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name

In [16]:
## This cell checks the datapoints for which the taluk names from the yield data are not exactly equal to the matched taluk names within the administrative boundary files  
to_check = taluk_yield[taluk_yield['Taluk'] != taluk_yield['Taluk_name']]

## The taluks are maybe not correctly matched are: hanur (64), hubballi (80), kalaburgi (95), kalaburgi north (96), sandur (156) 
# We use the following website to check which match is correct/most plausible
# https://karnataka.gov.in/district/en
# Besides, we also checked taluk_names to see whether a taluk name within another district is very much alike the taluk name of interest

# hanur (64/1897):
taluk_district[taluk_district['District'] == 'chamarajanagara']
unit_names[unit_names['Taluk'] == 'hanur']
# The match does not need to change: hanur == kollegala(hanur) 100% sure

# # hubballi (80/2398) (nagara means city)
taluk_district[taluk_district['District'] == 'dharwad']
unit_names[unit_names['Taluk'] == 'hubballi']
# # The match needs to change: hubballi == hubli 100% sure
# https://en.wikipedia.org/wiki/Hubli_Taluk
taluk_yield.loc[3085, 'Taluk_name'] = 'hubli'

# # kalaburgi (95/2878)
taluk_district[taluk_district['District'] == 'kalburgi']
unit_names[unit_names['Taluk'] == 'kalaburgi']
# # The match needs to change: kalaburgi == gulbarga 100% sure
# https://vlist.in/sub-district/05582.html
taluk_yield.loc[3622, 'Taluk_name'] = 'gulbarga'

# # kalaburgi north (96/2919)
taluk_district[taluk_district['District'] == 'kalburgi']
unit_names[unit_names['Taluk'] == 'kalaburgi north']
# # The match needs to change: kalaburgi north == kamalapura 100% sure
taluk_yield.loc[3670, 'Taluk_name'] = 'kamalapura'

# # sandur (156/4487)
taluk_district[taluk_district['District'] == 'ballari']
unit_names[unit_names['Taluk'] == 'sandur']
# # The match does not need to change: sandur == sonduru 100% sure

Unnamed: 0,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,Crop,shapeName
72,agrahara,gp,ballari,sandur,maize (makka),agrahara
75,agrahara,gp,ballari,sandur,sorghum (jowar/great millet),agrahara
324,anthapura,gp,ballari,sandur,maize (makka),anthapura
325,anthapura,gp,ballari,sandur,sorghum (jowar/great millet),anthapura
692,bandri,gp,ballari,sandur,maize (makka),bandri
693,bandri,gp,ballari,sandur,sorghum (jowar/great millet),bandri
1063,bhujanganagara,gp,ballari,sandur,maize (makka),bhujanganagara
1168,bommagatta,gp,ballari,sandur,maize (makka),bommagatta
1169,bommagatta,gp,ballari,sandur,sorghum (jowar/great millet),bommagatta
1481,choranuru,h,ballari,sandur,maize (makka),choranuru


In [17]:
## Now, also check whether each polygon is used for just one taluk
print(np.size(taluk_yield,0))
print(np.size(taluk_yield[['Taluk_name', 'District']].drop_duplicates(),0))
taluk_yield[taluk_yield[['Taluk_name', 'District']].duplicated(keep=False)]
# This shows that we use one taluk name more than once: kamalapura. This Taluk name is used for kalaburgi north and kamalapura. 
# However, after studying the yield dataset, this seems as a valid choice 

#yields[(yields['Taluk'] == 'kamalapura')]
#yields[(yields['Taluk'] == 'kalaburgi north')]

195
194


Unnamed: 0,Taluk,District,Taluk_name,Similarity,District_name
3670,kalaburgi north,kalburgi,kamalapura,57.0,kalburgi
3763,kamalapura,kalburgi,kamalapura,100.0,kalburgi


In [18]:
# Hence, we now know which taluk name within the administrative boundary files corresponds to the taluk name within the yield data. 
# For ease, we replace the taluk names within the yield file with the taluk names within the administrative boundary files.
for i in taluk_yield.index:
    yields['Taluk'] = np.where(yields['Taluk'] == taluk_yield.loc[i, 'Taluk'], taluk_yield.loc[i, 'Taluk_name'], yields['Taluk'])
    unit_names['Taluk'] = np.where(unit_names['Taluk'] == taluk_yield.loc[i, 'Taluk'], taluk_yield.loc[i, 'Taluk_name'], unit_names['Taluk'])

We hebben nu de taluk dataset samengevoegd met de district names van 2016. Nu willen we de hobli's en gram panchayat's koppelen aan de taluks. We beginnen hierbij met de hobli's.

In [19]:
# Note that for the hobli level the corresponding taluk names are given
df_hobli_taluk = df_hobli.merge(taluk_district, how = 'left', on = ['KGISTalukC'])
df_hobli_taluk = df_hobli_taluk[['KGISHobliN', 'shapeName', 'KGISTalukC', 'KGISTalukN', 'KGISDistri', 'District_x', 'SHAPE_STAr', 'created_da', 'last_edi_1', 'geometry_x']]
df_hobli_taluk = df_hobli_taluk.rename(columns = {'District_x': 'District', 'geometry_x': 'geometry'})

In [20]:
print(df_t.crs) # epsg:32643
print(df_h.crs) # epsg:32643
print(df_gp.crs) # epsg:32643

# Transform the geometry and yields_geometry dataframes to geodataframes
# Then, save the geometry and yields_geometry geodataframes
taluk_district = gpd.GeoDataFrame(taluk_district, geometry = taluk_district.geometry, crs = {'init': 'epsg:32643'}) # 32643 is the original crs (found by using .crs for one of the input shp files)
#taluk_district.geometry = taluk_district.geometry.to_crs(epsg = 4326) # Set crs to 4326

epsg:32643
epsg:32643
epsg:32643


  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [21]:
# gpd_join_temp = gpd.sjoin(df_grampan, taluk_district, how='left', predicate='intersects') # 12189
# gpd_join_temp1 = gpd_join_temp[gpd_join_temp['District_left'] == gpd_join_temp['District_right']].reset_index() # 10255
# gpd_join_temp1['area'] = np.nan
# gpd_join_temp1 = gpd_join_temp1
# for i in gpd_join_temp1.index:
#     p = np.array(gpd_join_temp1[(gpd_join_temp1['geometry'] == gpd_join_temp1.loc[i, 'geometry'])]['geometry'])[0]
#     q = np.array(taluk_district[(taluk_district['District'] == gpd_join_temp1.loc[i, 'District_left']) & (taluk_district['KGISTalukN'] == gpd_join_temp1.loc[i, 'KGISTalukN'])]['geometry'])[0]
#     gpd_join_temp1.loc[i, 'area'] = p.intersection(q).area
# gpd_join_temp1a = gpd_join_temp1.sort_values(['index', 'area'], ascending = [True, False]) # sort values
# gpd_join_temp2 = gpd_join_temp1a.drop_duplicates(subset = 'index', keep = 'first') # only keep the combination for which the intersected area is largest

In [22]:
gpd_join_temp1 = gpd.read_file(r'C:\Users\mieke\Documents\Msc Thesis\Notebooks Python\Final\gpd_join_temp1.geojson')

In [23]:
## RUN THIS CELL IF YOU WOULD NOT LIKE TO RECREATE THIS EXTENSIVE DATAFRAME AGAIN
gpd_join_temp1 = gpd.read_file(r'C:\Users\mieke\Documents\Msc Thesis\Notebooks Python\Final\gpd_join_temp1.geojson')
gpd_join_temp1a = gpd_join_temp1.sort_values(['index', 'area'], ascending = [True, False]) # sort values
gpd_join_temp2 = gpd_join_temp1a.drop_duplicates(subset = 'index', keep = 'first') # only keep the combination for which the intersected area is largest

In [24]:
# We select the columns of interest from the derived geodataframe
df_grampan_taluk = gpd_join_temp2[['KGISGPName', 'shapeName', 'KGISTalukC', 'KGISTalukN', 'KGISDistri_left', 'District_left', 'SHAPE_STAr', 'geometry']]
df_grampan_taluk = df_grampan_taluk.rename(columns = {'KGISDistri_left': 'KGISDistri', 'District_left': 'District'})

In [25]:
# Next, we drop all duplicates for which we have multiple polygons with equal Insurance Unit name, taluk name and district name. We keep the polygon with the largest area.
df_hobli_taluk = df_hobli_taluk.sort_values(['shapeName', 'KGISTalukN', 'District', 'SHAPE_STAr'], ascending = [True, True, True, False], ignore_index = True)
df_hobli_taluk_sel = df_hobli_taluk.drop_duplicates(subset = ['KGISHobliN', 'shapeName', 'KGISTalukN', 'KGISDistri', 'District'], keep = 'first', ignore_index = True)

df_grampan_taluk = df_grampan_taluk.sort_values(['shapeName', 'KGISTalukN', 'District', 'SHAPE_STAr'], ascending = [True, True, True, False], ignore_index = True)
df_grampan_taluk_sel = df_grampan_taluk.drop_duplicates(subset = ['KGISGPName', 'shapeName', 'KGISTalukN', 'KGISDistri', 'District'], keep = 'first', ignore_index = True)

In [26]:
df_hobli_taluk_sel = df_hobli_taluk_sel.merge(taluk_district[['KGISDistri', 'KGISTalukN', 'neighbors']], how = 'left', on = ['KGISDistri', 'KGISTalukN'])
df_grampan_taluk_sel = df_grampan_taluk_sel.merge(taluk_district[['KGISDistri', 'KGISTalukN', 'neighbors']], how = 'left', on = ['KGISDistri', 'KGISTalukN'])

In [27]:
# Misschien kunnen we beter eerst de unit_names matchen met de gp's en hoblis. Voor hobli's kunnen we dit doen op district+taluk niveau. Voor gp's kunnen we dit doen op district niveau
hobli = unit_names[unit_names['Gram Panchayat/Hobli'] == 'h'].sort_values(['shapeName', 'Taluk', 'District'], ignore_index=True).drop_duplicates(ignore_index=True) # 915
grampan = unit_names[unit_names['Gram Panchayat/Hobli'] == 'gp'].sort_values(['shapeName', 'Taluk', 'District'], ignore_index=True).drop_duplicates(ignore_index=True) # 6230

In [28]:
hobli['KGISTalukN'] = hobli['Taluk']
hobli = hobli.merge(taluk_district[['District', 'KGISTalukN', 'neighbors']], how = 'left', on = ['District', 'KGISTalukN'])
#hobli[hobli['neighbors'].isna()]
grampan['KGISTalukN'] = grampan['Taluk']
grampan = grampan.merge(taluk_district[['District', 'KGISTalukN', 'neighbors']], how = 'left', on = ['District', 'KGISTalukN'])

In [29]:
hobli['Insurance_name'] = np.nan
hobli['shape_name'] = np.nan
hobli['Taluk_name'] = np.nan
hobli['District_name'] = np.nan
hobli['Similarity'] = np.nan
hobli['geometry'] = np.nan

for i in hobli.index:
   indices = []
   df_temp = df_hobli_taluk_sel[(df_hobli_taluk_sel['District'] == hobli['District'].loc[i]) & ((df_hobli_taluk_sel['KGISTalukN'].isin(hobli['neighbors'].loc[i])) | (df_hobli_taluk_sel['KGISTalukN'] == hobli['Taluk'].loc[i]))]
    # (df_hobli_taluk_sel['KGISTalukN'] == hobli['Taluk'].loc[i])
    # print(hobli.loc[i,:])
    # print(df_temp)
   for j in df_temp.index:
      indices.append(fuzz.token_set_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
   hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
   hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
   hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
   hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
   hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
   hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

## HOBLI

In [30]:
hobli['Insurance_name'] = np.nan
hobli['shape_name'] = np.nan
hobli['Taluk_name'] = np.nan
hobli['District_name'] = np.nan
hobli['Similarity'] = np.nan
hobli['geometry'] = np.nan

for i in hobli.index:
   indices = []
   df_temp = df_hobli_taluk_sel[(df_hobli_taluk_sel['District'] == hobli['District'].loc[i]) & (df_hobli_taluk_sel['KGISTalukN'] == hobli['Taluk'].loc[i])]
   #df_temp = df_hobli_taluk_sel[(df_hobli_taluk_sel['KGISTalukN'] == hobli['Taluk'].loc[i])]
   for j in df_temp.index:
      indices.append(fuzz.token_set_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
   hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
   hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
   hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
   hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
   hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
   hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

In [31]:
check_hobli = hobli[hobli['shapeName'] != hobli['shape_name']] # We moeten er 47 handmatig toewijzen
check_hobli_df = check_hobli[['shapeName', 'shape_name']]
print(np.size(check_hobli,0)) # 89
hobli_names = sorted(df_hobli_taluk_sel['shapeName'].unique())

89


In [32]:
for i in check_hobli.index:
    indices = []
    df_temp = df_hobli_taluk_sel[(df_hobli_taluk_sel['District'] == hobli['District'].loc[i]) & (df_hobli_taluk_sel['KGISTalukN'] == hobli['Taluk'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_hobli2 = hobli[hobli['shapeName'] != hobli['shape_name']] # We moeten er 5 handmatig toewijzen
print(np.size(check_hobli2,0))
check_hobli_df2 = check_hobli2[['shapeName', 'shape_name']]

for i in check_hobli2.index:
    indices = []
    df_temp = df_hobli_taluk_sel[((df_hobli_taluk_sel['KGISTalukN'].isin(hobli['neighbors'].loc[i])) | (df_hobli_taluk_sel['KGISTalukN'] == hobli['Taluk'].loc[i]))]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_hobli3 = hobli[hobli['shapeName'] != hobli['shape_name']] # We moeten er 5 handmatig toewijzen
print(np.size(check_hobli3,0))
check_hobli_df3 = check_hobli3[['shapeName', 'shape_name']]

for i in check_hobli3.index:
    indices = []
    df_temp = df_hobli_taluk_sel[((df_hobli_taluk_sel['KGISTalukN'].isin(hobli['neighbors'].loc[i])) | (df_hobli_taluk_sel['KGISTalukN'] == hobli['Taluk'].loc[i]))]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_hobli4 = hobli[hobli['shapeName'] != hobli['shape_name']] # We moeten er 5 handmatig toewijzen
print(np.size(check_hobli4,0))
check_hobli_df4 = check_hobli4[['shapeName', 'shape_name']]

hobli_yield = hobli['shapeName'].unique()

# There are just 2 hoblis left which should be matched manually: bidadi and kailancha
# The boundary data has options: bidadi 1 and bidadi 2, kailancha-1 and kailancha-2
# These options do not already occur in the hobli names within the yield data
# Hence, we select the ones with the number 1 (also because they are slightly bigger)
# df_hobli_taluk_sel[df_hobli_taluk_sel['shapeName'] == 'kailancha-1']
# # We do this by changing the boundary name within the geodataframe
df_hobli_taluk_sel['shapeName'] = np.where(df_hobli_taluk_sel['shapeName'] == 'bidadi 1', 'bidadi', df_hobli_taluk_sel['shapeName'])
df_hobli_taluk_sel['shapeName'] = np.where(df_hobli_taluk_sel['shapeName'] == 'kailancha-1', 'kailancha', df_hobli_taluk_sel['shapeName'])

for i in check_hobli4.index:
    indices = []
    df_temp = df_hobli_taluk_sel[((df_hobli_taluk_sel['KGISTalukN'].isin(hobli['neighbors'].loc[i])) | (df_hobli_taluk_sel['KGISTalukN'] == hobli['Taluk'].loc[i]))]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(hobli.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    hobli.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISHobliN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    hobli.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    hobli.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_hobli5 = hobli[hobli['shapeName'] != hobli['shape_name']] # We moeten er 5 handmatig toewijzen
print(np.size(check_hobli5,0))
check_hobli_df5 = check_hobli5[['shapeName', 'shape_name']]

78
4
4
0


## GRAM PANCHAYAT

In [33]:
grampan['Insurance_name'] = np.nan
grampan['shape_name'] = np.nan
grampan['Taluk_name'] = np.nan
grampan['District_name'] = np.nan
grampan['Similarity'] = np.nan
grampan['geometry'] = np.nan

for i in grampan.index:
    indices = []
    df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i]) & (df_grampan_taluk_sel['KGISTalukN'] == grampan['Taluk'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

In [34]:
check_grampan = grampan[grampan['shapeName'] != grampan['shape_name']] # we moeten er handmatig 1757 toewijzen
check_grampan_df = check_grampan[['shapeName', 'shape_name']]
print(np.size(check_grampan,0))
grampan_names = sorted(df_grampan_taluk_sel['shapeName'].unique())

1238


In [35]:
for i in check_grampan.index:
    indices = []
    df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i]) & (df_grampan_taluk_sel['KGISTalukN'] == grampan['Taluk'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan2 = grampan[grampan['shapeName'] != grampan['shape_name']] # We moeten er 5 handmatig toewijzen
print(np.size(check_grampan2,0))
check_grampan_df2 = check_grampan2[['shapeName', 'shape_name']]

for i in check_grampan2.index:
    indices = []
    #df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i])]
    #df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i]) & ((df_grampan_taluk_sel['KGISTalukN'].isin(grampan['neighbors'].loc[i])) | (df_grampan_taluk_sel['KGISTalukN'] == grampan['Taluk'].loc[i]))]
    df_temp = df_grampan_taluk_sel[((df_grampan_taluk_sel['KGISTalukN'].isin(grampan['neighbors'].loc[i])) | (df_grampan_taluk_sel['KGISTalukN'] == grampan['Taluk'].loc[i]))]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan3 = grampan[grampan['shapeName'] != grampan['shape_name']] # We moeten er 5 handmatig toewijzen
print(np.size(check_grampan3,0))
check_grampan_df3 = check_grampan3[['shapeName', 'shape_name']]

for i in check_grampan3.index:
    indices = []
    #df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i])]
    df_temp = df_grampan_taluk_sel[((df_grampan_taluk_sel['KGISTalukN'].isin(grampan['neighbors'].loc[i])) | (df_grampan_taluk_sel['KGISTalukN'] == grampan['Taluk'].loc[i]))]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan4 = grampan[grampan['shapeName'] != grampan['shape_name']] # We moeten er 5 handmatig toewijzen
print(np.size(check_grampan4,0))
check_grampan_df4 = check_grampan4[['shapeName', 'shape_name']]
# grampan_yield = grampan['shapeName'].unique()

# for i in check_grampan4.index:
#     indices = []
#     #df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i])]
#     df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['KGISTalukN'].isin(grampan['neighbors'].loc[i]) | (df_grampan_taluk_sel['KGISTalukN'] == grampan['Taluk'].loc[i]))]
#     for j in df_temp.index:
#        indices.append(fuzz.token_sort_ratio(grampan.loc[i, 'shapeName'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
#     grampan.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
#     grampan.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
#     grampan.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
#     grampan.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
#     grampan.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
#     grampan.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

# check_grampan5 = grampan[grampan['shapeName'] != grampan['shape_name']] # We moeten er 5 handmatig toewijzen
# print(np.size(check_grampan5,0))
# check_grampan_df5 = check_grampan5[['shapeName', 'shape_name']]

1230
339
335


In [36]:
#valid_grampan = grampan[grampan['shapeName'] == grampan['shape_name']]
valid_grampan = grampan[(grampan['Similarity'] >= 90) & (grampan['Similarity'] <= 100)]
valid_hobli = hobli[hobli['shapeName'] == hobli['shape_name']]
print(np.size(hobli,0))
print(np.size(valid_hobli,0))
print(np.size(grampan,0))
print(np.size(valid_grampan,0))
print(np.size(grampan,0) - np.size(valid_grampan,0))
print(np.size(check_grampan_df4,0))

915
915
6230
5907
323
335


In [37]:
grampan_all = valid_grampan.copy() ## 2251
grampan_all['geometry_string'] = grampan_all.geometry.astype(str)
#type(grampan_paddy.loc[0,'geometry_string'])
grampan_all = grampan_all.drop(['Crop'], axis=1)
grampan_all = grampan_all.drop_duplicates(['Insurance Unit', 'Gram Panchayat/Hobli', 'District', 'Taluk',\
       'shapeName', 'KGISTalukN', 'Insurance_name', 'shape_name',\
       'Taluk_name', 'District_name'])
grampan_all['geometry_string'].drop_duplicates() # 2250
# grampan_paddy_check = grampan_paddy.drop_duplicates(subset = ['shapeName', 'District', 'Taluk']) # 2188

grampan_all[grampan_all.duplicated(subset = 'geometry_string', keep = False) == True]#['shapeName'].unique() just 18 rows, but delete some

Unnamed: 0,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,shapeName,KGISTalukN,neighbors,Insurance_name,shape_name,Taluk_name,District_name,Similarity,geometry,geometry_string
261,ankalagi,gp,belagavi,belagavi,ankalagi,belagavi,"[bailhongal, gokak, hukkeri, khanapur, kitthuru]",Ankalagi,ankalagi,belagavi,belagavi,100.0,POLYGON ((457644.7317000012 1740349.2443000006...,POLYGON ((457644.7317000012 1740349.2443000006...
263,ankalagi,gp,belagavi,gokak,ankalagi,gokak,"[bailhongal, belagavi, chikkodi, hukkeri, muda...",Ankalagi,ankalagi,belagavi,belagavi,100.0,POLYGON ((457644.7317000012 1740349.2443000006...,POLYGON ((457644.7317000012 1740349.2443000006...
1277,chincholli,gp,kalburgi,chincholi,chincholli,chincholi,"[chittaguppa, kalagi, kamalapura, sedam]",Chincholli (H),chincholli (h),kalagi,kalburgi,91.0,"POLYGON ((724787.7580999996 1926114.849400001,...","POLYGON ((724787.7580999996 1926114.849400001,..."
1278,chincholli,gp,kalburgi,chittapur,chincholli,chittapur,"[gulbarga, jevargi, kalagi, sedam, shahapur, s...",Chincholli (H),chincholli (h),kalagi,kalburgi,91.0,"POLYGON ((724787.7580999996 1926114.849400001,...","POLYGON ((724787.7580999996 1926114.849400001,..."
1468,dhavaleshwar,gp,belagavi,gokak,dhavaleshwar,gokak,"[bailhongal, belagavi, chikkodi, hukkeri, muda...",Dhavaleshwar,dhavaleshwar,mudalagi,belagavi,100.0,POLYGON ((512087.57670000056 1805228.917400000...,POLYGON ((512087.57670000056 1805228.917400000...
1469,dhavaleshwar,gp,bagalkot,mudhol,dhavaleshwar,mudhol,"[badami, bagalkote, bilagi, jamakhandi, mudala...",Dhavaleshwar,dhavaleshwar,mudalagi,belagavi,100.0,POLYGON ((512087.57670000056 1805228.917400000...,POLYGON ((512087.57670000056 1805228.917400000...
2211,hebbale,gp,hassan,arkalgud,hebbale,arkalgud,"[alur, hassan, holenarasipura, k.r.nagar, kush...",Hebbale,hebbale,arkalgud,hassan,100.0,POLYGON ((622139.4625000011 1417369.3452000003...,POLYGON ((622139.4625000011 1417369.3452000003...
2212,hebbale,gp,kodagu,somavarapete,hebbale,somavarapete,"[alur, arkalgud, kadaba, kushalanagara, madike...",Hebbale,hebbale,arkalgud,hassan,100.0,POLYGON ((622139.4625000011 1417369.3452000003...,POLYGON ((622139.4625000011 1417369.3452000003...
2594,hullatti,gp,haveri,hangal,hullatti,hangal,"[byadagi, haveri, hirekerur, mundgod, savanur,...",Hullatti,hullatti,hangal,haveri,100.0,POLYGON ((512482.45480000007 1646814.287699999...,POLYGON ((512482.45480000007 1646814.287699999...
2596,hullatti,gp,haveri,hirekerur,hullatti,hirekerur,"[byadagi, hangal, ranebennur, ratteehalli, shi...",Hullatti,hullatti,hangal,haveri,100.0,POLYGON ((512482.45480000007 1646814.287699999...,POLYGON ((512482.45480000007 1646814.287699999...


# NOW WE USE THE CHANGED PLATFORM

In [38]:
# Misschien kunnen we beter eerst de unit_names matchen met de gp's en hoblis. Voor hobli's kunnen we dit doen op district+taluk niveau. Voor gp's kunnen we dit doen op district niveau
grampan_copy = unit_names[unit_names['Gram Panchayat/Hobli'] == 'gp'].sort_values(['shapeName', 'Taluk', 'District'], ignore_index=True).drop_duplicates(ignore_index=True) # 6230
grampan_copy['KGISTalukN'] = grampan_copy['Taluk']
grampan_copy = grampan_copy.merge(taluk_district[['District', 'KGISTalukN', 'neighbors']], how = 'left', on = ['District', 'KGISTalukN'])

In [39]:
grampan_copy = grampan_copy.drop(grampan_copy[(grampan_copy['Insurance Unit'] == 'ankalagi') & (grampan_copy['Taluk'] == 'gokak') & (grampan_copy['District'] == 'belagavi')].index) # inplace=True
grampan_copy = grampan_copy.drop(grampan_copy[(grampan_copy['Insurance Unit'] == 'chincholli')& (grampan_copy['Taluk'] == 'chincholi') & (grampan_copy['District'] == 'kalburgi')].index) # inplace=True
grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'dhavaleshwar') & (grampan_copy['Taluk'] == 'gokak') & (grampan_copy['District'] == 'belagavi')), 'mudalagi', grampan_copy['Taluk'])
grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'dhavaleshwar') & (grampan_copy['Taluk'] == 'mudhol') & (grampan_copy['District'] == 'bagalkot')), 'rabakavi banahatti', grampan_copy['Taluk'])
grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'hebbale') & (grampan_copy['Taluk'] == 'somavarapete') & (grampan_copy['District'] == 'kodagu')), 'kushalanagara', grampan_copy['Taluk'])
grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'hullatti') & (grampan_copy['Taluk'] == 'hirekerur') & (grampan_copy['District'] == 'haveri')), 'ratteehalli', grampan_copy['Taluk'])

grampan_copy['Insurance Unit'] = np.where(((grampan_copy['Insurance Unit'] == 'k.ayyanahalli') & (grampan_copy['Taluk'] == 'hadagali') & (grampan_copy['District'] == 'ballari')), 'k.ayyenahalli', grampan_copy['Insurance Unit'])
grampan_copy['Insurance Unit'] = np.where(((grampan_copy['Insurance Unit'] == 'k.ayyenahalli') & (grampan_copy['Taluk'] == 'kudligi') & (grampan_copy['District'] == 'ballari')), 'k.ayyanahalli', grampan_copy['Insurance Unit'])
grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'k.ayyanahalli') & (grampan_copy['Taluk'] == 'kudligi') & (grampan_copy['District'] == 'ballari')), 'kotturu', grampan_copy['Taluk'])

grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'kothegala') & (grampan_copy['Taluk'] == 'heggadadevanakote') & (grampan_copy['District'] == 'mysuru')), 'saraguru', grampan_copy['Taluk'])
grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'mangala') & (grampan_copy['Taluk'] == 'kollegala') & (grampan_copy['District'] == 'chamarajanagara')), 'kollegala(hanur)', grampan_copy['Taluk'])
grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'manuganahalli') & (grampan_copy['Taluk'] == 'heggadadevanakote') & (grampan_copy['District'] == 'mysuru')), 'saraguru', grampan_copy['Taluk'])
grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'mulluru') & (grampan_copy['Taluk'] == 'heggadadevanakote') & (grampan_copy['District'] == 'mysuru')), 'saraguru', grampan_copy['Taluk'])

grampan_copy['Insurance Unit'] = np.where(((grampan_copy['Insurance Unit'] == 'naganuru') & (grampan_copy['Taluk'] == 'gokak') & (grampan_copy['District'] == 'belagavi')), 'naganur', grampan_copy['Insurance Unit'])
grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'naganur') & (grampan_copy['Taluk'] == 'gokak') & (grampan_copy['District'] == 'belagavi')), 'bailhongal', grampan_copy['Taluk'])

grampan_copy['Taluk'] = np.where(((grampan_copy['Insurance Unit'] == 'tigadi') & (grampan_copy['Taluk'] == 'gokak') & (grampan_copy['District'] == 'belagavi')), 'mudalagi', grampan_copy['Taluk'])
grampan_copy['Insurance Unit'] = np.where(((grampan_copy['Insurance Unit'] == 'turanura') & (grampan_copy['Taluk'] == 'ramadurg') & (grampan_copy['District'] == 'belagavi')), 'turanur', grampan_copy['Insurance Unit'])


In [40]:
yields_interest = yields[(yields['Crop'] == 'paddy') | (yields['Crop'] == 'maize (makka)') | (yields['Crop'] == 'sorghum (jowar/great millet)')].reset_index(drop=True)
yields_interest_copy = yields_interest.copy()
yields_interest_copy = yields_interest_copy.drop(yields_interest_copy[(yields_interest_copy['Insurance Unit'] == 'ankalagi') & (yields_interest_copy['Taluk'] == 'gokak') & (yields_interest_copy['District'] == 'belagavi')].index) # inplace=True
yields_interest_copy = yields_interest_copy.drop(yields_interest_copy[(yields_interest_copy['Insurance Unit'] == 'chincholli')& (yields_interest_copy['Taluk'] == 'chincholi') & (yields_interest_copy['District'] == 'kalburgi')].index) # inplace=True
yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'dhavaleshwar') & (yields_interest_copy['Taluk'] == 'gokak') & (yields_interest_copy['District'] == 'belagavi')), 'mudalagi', yields_interest_copy['Taluk'])
yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'dhavaleshwar') & (yields_interest_copy['Taluk'] == 'mudhol') & (yields_interest_copy['District'] == 'bagalkot')), 'rabakavi banahatti', yields_interest_copy['Taluk'])
yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'hebbale') & (yields_interest_copy['Taluk'] == 'somavarapete') & (yields_interest_copy['District'] == 'kodagu')), 'kushalanagara', yields_interest_copy['Taluk'])
yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'hullatti') & (yields_interest_copy['Taluk'] == 'hirekerur') & (yields_interest_copy['District'] == 'haveri')), 'ratteehalli', yields_interest_copy['Taluk'])

yields_interest_copy['Insurance Unit'] = np.where(((yields_interest_copy['Insurance Unit'] == 'k.ayyanahalli') & (yields_interest_copy['Taluk'] == 'hadagali') & (yields_interest_copy['District'] == 'ballari')), 'k.ayyenahalli', yields_interest_copy['Insurance Unit'])
yields_interest_copy['Insurance Unit'] = np.where(((yields_interest_copy['Insurance Unit'] == 'k.ayyenahalli') & (yields_interest_copy['Taluk'] == 'kudligi') & (yields_interest_copy['District'] == 'ballari')), 'k.ayyanahalli', yields_interest_copy['Insurance Unit'])
yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'k.ayyanahalli') & (yields_interest_copy['Taluk'] == 'kudligi') & (yields_interest_copy['District'] == 'ballari')), 'kotturu', yields_interest_copy['Taluk'])


yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'kothegala') & (yields_interest_copy['Taluk'] == 'heggadadevanakote') & (yields_interest_copy['District'] == 'mysuru')), 'saraguru', yields_interest_copy['Taluk'])
yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'mangala') & (yields_interest_copy['Taluk'] == 'kollegala') & (yields_interest_copy['District'] == 'chamarajanagara')), 'kollegala(hanur)', yields_interest_copy['Taluk'])
yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'manuganahalli') & (yields_interest_copy['Taluk'] == 'heggadadevanakote') & (yields_interest_copy['District'] == 'mysuru')), 'saraguru', yields_interest_copy['Taluk'])
yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'mulluru') & (yields_interest_copy['Taluk'] == 'heggadadevanakote') & (yields_interest_copy['District'] == 'mysuru')), 'saraguru', yields_interest_copy['Taluk'])

yields_interest_copy['Insurance Unit'] = np.where(((yields_interest_copy['Insurance Unit'] == 'naganuru') & (yields_interest_copy['Taluk'] == 'gokak') & (yields_interest_copy['District'] == 'belagavi')), 'naganur', yields_interest_copy['Insurance Unit'])
yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'naganur') & (yields_interest_copy['Taluk'] == 'gokak') & (yields_interest_copy['District'] == 'belagavi')), 'bailhongal', yields_interest_copy['Taluk'])

yields_interest_copy['Taluk'] = np.where(((yields_interest_copy['Insurance Unit'] == 'tigadi') & (yields_interest_copy['Taluk'] == 'gokak') & (yields_interest_copy['District'] == 'belagavi')), 'mudalagi', yields_interest_copy['Taluk'])
yields_interest_copy['Insurance Unit'] = np.where(((yields_interest_copy['Insurance Unit'] == 'turanura') & (yields_interest_copy['Taluk'] == 'ramadurg') & (yields_interest_copy['District'] == 'belagavi')), 'turanur', yields_interest_copy['Insurance Unit'])

## GRAMPAN CHANGED

In [41]:
grampan_copy['Insurance_name'] = np.nan
grampan_copy['shape_name'] = np.nan
grampan_copy['Taluk_name'] = np.nan
grampan_copy['District_name'] = np.nan
grampan_copy['Similarity'] = np.nan
grampan_copy['geometry'] = np.nan

for i in grampan_copy.index:
    indices = []
    df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan_copy['District'].loc[i]) & (df_grampan_taluk_sel['KGISTalukN'] == grampan_copy['Taluk'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(grampan_copy.loc[i, 'Insurance Unit'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan_copy.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan_copy.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

In [42]:
check_grampan_copy = grampan_copy[grampan_copy['shapeName'] != grampan_copy['shape_name']] # we moeten er handmatig 1757 toewijzen
check_grampan_df = check_grampan[['shapeName', 'shape_name']]
print(np.size(check_grampan_copy,0))
grampan_names_copy = sorted(df_grampan_taluk_sel['shapeName'].unique())

1226


In [43]:
for i in check_grampan_copy.index:
    indices = []
    df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan_copy['District'].loc[i]) & (df_grampan_taluk_sel['KGISTalukN'] == grampan_copy['Taluk'].loc[i])]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(grampan_copy.loc[i, 'Insurance Unit'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan_copy.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan_copy.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan2_copy = grampan_copy[grampan_copy['shapeName'] != grampan_copy['shape_name']] # We moeten er 5 handmatig toewijzen
print(np.size(check_grampan2_copy,0))
check_grampan_df2_copy = check_grampan2_copy[['shapeName', 'shape_name']]

for i in check_grampan2_copy.index:
    indices = []
    #df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i])]
    #df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i]) & ((df_grampan_taluk_sel['KGISTalukN'].isin(grampan['neighbors'].loc[i])) | (df_grampan_taluk_sel['KGISTalukN'] == grampan['Taluk'].loc[i]))]
    df_temp = df_grampan_taluk_sel[((df_grampan_taluk_sel['KGISTalukN'].isin(grampan_copy['neighbors'].loc[i])) | (df_grampan_taluk_sel['KGISTalukN'] == grampan_copy['Taluk'].loc[i]))]
    for j in df_temp.index:
       indices.append(fuzz.token_set_ratio(grampan_copy.loc[i, 'Insurance Unit'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan_copy.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan_copy.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan3_copy = grampan_copy[grampan_copy['shapeName'] != grampan_copy['shape_name']] # We moeten er 5 handmatig toewijzen
print(np.size(check_grampan3_copy,0))
check_grampan_df3_copy = check_grampan3_copy[['shapeName', 'shape_name']]

for i in check_grampan3_copy.index:
    indices = []
    #df_temp = df_grampan_taluk_sel[(df_grampan_taluk_sel['District'] == grampan['District'].loc[i])]
    df_temp = df_grampan_taluk_sel[((df_grampan_taluk_sel['KGISTalukN'].isin(grampan_copy['neighbors'].loc[i])) | (df_grampan_taluk_sel['KGISTalukN'] == grampan_copy['Taluk'].loc[i]))]
    for j in df_temp.index:
       indices.append(fuzz.token_sort_ratio(grampan_copy.loc[i, 'Insurance Unit'],df_temp.loc[j, 'shapeName'])) # in this case, better than sort_ratio
    grampan_copy.loc[i,'shape_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'shapeName'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Insurance_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISGPName'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Taluk_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'KGISTalukN'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'District_name'] = df_temp.loc[df_temp.index[np.argmax(indices)],'District'] # the shapeName corresponding to the highest similarity bound
    grampan_copy.loc[i,'Similarity'] = np.max(indices) # the highest similarity found for the current insurance name
    grampan_copy.loc[i,'geometry'] = df_temp.loc[df_temp.index[np.argmax(indices)],'geometry'] # the shapeName corresponding to the highest similarity bound

check_grampan4_copy = grampan_copy[grampan_copy['shapeName'] != grampan_copy['shape_name']] # We moeten er 5 handmatig toewijzen
print(np.size(check_grampan4_copy,0))
check_grampan_df4_copy = check_grampan4_copy[['shapeName', 'shape_name']]

1218
339
335


In [44]:
#valid_grampan_copy = grampan_copy[grampan_copy['shapeName'] == grampan_copy['shape_name']]
valid_grampan_copy = grampan_copy[(grampan_copy['Similarity'] >= 90) & (grampan_copy['Similarity'] <= 100)]
valid_hobli = hobli[hobli['shapeName'] == hobli['shape_name']]
print(np.size(hobli,0))
print(np.size(valid_hobli,0))
print(np.size(grampan,0))
print(np.size(valid_grampan,0))
print(np.size(grampan,0) - np.size(valid_grampan,0))
print(np.size(check_grampan_df4,0))

915
915
6230
5907
323
335


In [45]:
grampan_all_copy = valid_grampan_copy.copy() ## 2251
grampan_all_copy['geometry_string'] = grampan_all_copy.geometry.astype(str)
#type(grampan_paddy.loc[0,'geometry_string'])
grampan_all_copy = grampan_all_copy.drop(['Crop', 'shapeName'], axis=1)
grampan_all_copy = grampan_all_copy.drop_duplicates(['Insurance Unit', 'Gram Panchayat/Hobli', 'District', 'Taluk',\
       'KGISTalukN', 'Insurance_name', 'shape_name',\
       'Taluk_name', 'District_name'])
grampan_all_copy['geometry_string'].drop_duplicates() # 2250
# grampan_paddy_check = grampan_paddy.drop_duplicates(subset = ['shapeName', 'District', 'Taluk']) # 2188

grampan_all_copy[grampan_all_copy.duplicated(subset = 'geometry_string', keep = False) == True]#['shapeName'].unique() just 18 rows, but delete some

Unnamed: 0,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,KGISTalukN,neighbors,Insurance_name,shape_name,Taluk_name,District_name,Similarity,geometry,geometry_string
4600,naganur,gp,belagavi,bailhongal,bailhongal,"[belagavi, dharwad, gokak, kitthuru, savadatti]",Naganur,naganur,bailhongal,belagavi,100.0,POLYGON ((480837.7021000001 1751139.9185000015...,POLYGON ((480837.7021000001 1751139.9185000015...
4601,naganur,gp,belagavi,bailhongal,gokak,"[bailhongal, belagavi, chikkodi, hukkeri, muda...",Naganur,naganur,bailhongal,belagavi,100.0,POLYGON ((480837.7021000001 1751139.9185000015...,POLYGON ((480837.7021000001 1751139.9185000015...


In [46]:
# df_grampan_taluk_sel[(df_grampan_taluk_sel['shapeName'] == 'turanur') | (df_grampan_taluk_sel['shapeName'] == 'turanura')] # 1 
# # (turanura == turanur voor unit)
# df_grampan_taluk_sel[df_grampan_taluk_sel['shapeName'] == 'tigadi'] # 2 
# # (gokak == mudalagai voor taluk) 
# df_grampan_taluk_sel[(df_grampan_taluk_sel['shapeName'] == 'naganuru') | (df_grampan_taluk_sel['shapeName'] == 'naganur')] # 3 
# # # (alleen bailhongal ligt in de buurt, dus goed gekoppeld. let op 1x naganur == naganuru)
# df_grampan_taluk_sel[df_grampan_taluk_sel['shapeName'] == 'mulluru'] # 3	
# # # heggadadevanakote taluk: match aan mulluru, taluk:saraguru
# # # hunsur taluk: match aan mulluru, taluk:hunsur#
# df_grampan_taluk_sel[df_grampan_taluk_sel['shapeName'] == 'manuganahalli'] # 2
# # # heggadadevanakote taluk: match aan manuganahalli, taluk:saraguru
# # # hunsur taluk: match aan manuganahalli, taluk:hunsur
# df_grampan_taluk_sel[df_grampan_taluk_sel['shapeName'] == 'mangala'] # 3
# # # chamarajanagara taluk: match aan mangala, taluk:chamarajanagara
# # # kollegala taluk: match aan manuganahalli, taluk:kollegala(hanur)
# df_grampan_taluk_sel[df_grampan_taluk_sel['shapeName'] == 'kothegala'] # 2
# # # heggadadevanakote taluk: match aan kothegala, taluk:saraguru
# # # hunsur taluk: match aan kothegala, taluk:hunsur
# df_grampan_taluk_sel[(df_grampan_taluk_sel['shapeName'] == 'k.ayyenahalli') | (df_grampan_taluk_sel['shapeName'] == 'k.ayyanahalli')] # 1
# # # kudligi taluk: match aan k.ayyanahalli, taluk: kotturu (wss andersom)
# # # hadagali taluk: match aan k.ayyenahalli, taluk: hadagali (wss andersom)
# df_grampan_taluk_sel[df_grampan_taluk_sel['shapeName'] == 'hullatti'] # 2 haveri
# # # hangal taluk: match hullatti aan, taluk = hangal
# # # hirekerur taluk: match hullati aan, taluk = rateehalli
# df_grampan_taluk_sel[df_grampan_taluk_sel['shapeName'] == 'hebbale'] # 2
# # # arkalgud taluk: match hebbale aan taluk: arkalgud	
# # # somavarapete taluk: match hebbale aan taluk: kushalanagara	
# df_grampan_taluk_sel[df_grampan_taluk_sel['shapeName'] == 'dhavaleshwar'] # 2
# # # gokak taluk: match dhavaleshwar aan taluk: mudalagi	
# # # mudhol taluk: match dhavaleshwar aan taluk: rabakavi banahatti	
# df_grampan_taluk_sel[df_grampan_taluk_sel['shapeName'] == 'chincholli (h)'] # 1 (eigenlijk zonder h)
# # # Verwijder chincholli voor taluk chincholi (en eventueel ook voor chittapur) voor sorghum
# df_grampan_taluk_sel[df_grampan_taluk_sel['shapeName'] == 'ankalagi'] # 1
# # # Niets veranderen

# OLD DUPLICATES

In [47]:
grampan_paddy = valid_grampan[valid_grampan['Crop'] == 'paddy'] ## 2251
grampan_paddy['geometry_string'] = grampan_paddy.geometry.astype(str)
#type(grampan_paddy.loc[0,'geometry_string'])
grampan_paddy['geometry_string'].drop_duplicates() # 2250
# grampan_paddy_maize = grampan_maize.drop_duplicates(subset = ['shapeName', 'District', 'Taluk']) # 2188
grampan_paddy[grampan_paddy.duplicated(subset = 'geometry_string', keep = False) == True]#['shapeName'].unique() #just 18 rows, but delete some
# delete 3458

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grampan_paddy['geometry_string'] = grampan_paddy.geometry.astype(str)


Unnamed: 0,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,Crop,shapeName,KGISTalukN,neighbors,Insurance_name,shape_name,Taluk_name,District_name,Similarity,geometry,geometry_string
2211,hebbale,gp,hassan,arkalgud,paddy,hebbale,arkalgud,"[alur, hassan, holenarasipura, k.r.nagar, kush...",Hebbale,hebbale,arkalgud,hassan,100.0,POLYGON ((622139.4625000011 1417369.3452000003...,POLYGON ((622139.4625000011 1417369.3452000003...
2212,hebbale,gp,kodagu,somavarapete,paddy,hebbale,somavarapete,"[alur, arkalgud, kadaba, kushalanagara, madike...",Hebbale,hebbale,arkalgud,hassan,100.0,POLYGON ((622139.4625000011 1417369.3452000003...,POLYGON ((622139.4625000011 1417369.3452000003...


In [48]:
grampan_maize = valid_grampan[valid_grampan['Crop'] == 'maize (makka)'] ## 2251
grampan_maize['geometry_string'] = grampan_maize.geometry.astype(str)
#type(grampan_maize.loc[0,'geometry_string'])
grampan_maize['geometry_string'].drop_duplicates() # 2250
# grampan_paddy_maize = grampan_maize.drop_duplicates(subset = ['shapeName', 'District', 'Taluk']) # 2188
grampan_maize[grampan_maize.duplicated(subset = 'geometry_string', keep = False) == True]#['shapeName'].unique() #just 18 rows, but delete some
# delete 3458

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grampan_maize['geometry_string'] = grampan_maize.geometry.astype(str)


Unnamed: 0,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,Crop,shapeName,KGISTalukN,neighbors,Insurance_name,shape_name,Taluk_name,District_name,Similarity,geometry,geometry_string
1468,dhavaleshwar,gp,belagavi,gokak,maize (makka),dhavaleshwar,gokak,"[bailhongal, belagavi, chikkodi, hukkeri, muda...",Dhavaleshwar,dhavaleshwar,mudalagi,belagavi,100.0,POLYGON ((512087.57670000056 1805228.917400000...,POLYGON ((512087.57670000056 1805228.917400000...
1469,dhavaleshwar,gp,bagalkot,mudhol,maize (makka),dhavaleshwar,mudhol,"[badami, bagalkote, bilagi, jamakhandi, mudala...",Dhavaleshwar,dhavaleshwar,mudalagi,belagavi,100.0,POLYGON ((512087.57670000056 1805228.917400000...,POLYGON ((512087.57670000056 1805228.917400000...
2594,hullatti,gp,haveri,hangal,maize (makka),hullatti,hangal,"[byadagi, haveri, hirekerur, mundgod, savanur,...",Hullatti,hullatti,hangal,haveri,100.0,POLYGON ((512482.45480000007 1646814.287699999...,POLYGON ((512482.45480000007 1646814.287699999...
2596,hullatti,gp,haveri,hirekerur,maize (makka),hullatti,hirekerur,"[byadagi, hangal, ranebennur, ratteehalli, shi...",Hullatti,hullatti,hangal,haveri,100.0,POLYGON ((512482.45480000007 1646814.287699999...,POLYGON ((512482.45480000007 1646814.287699999...
2864,k.ayyanahalli,gp,ballari,hadagali,maize (makka),k.ayyanahalli,hadagali,"[hagaribommanahalli, harapanahalli, haveri, ko...",K.Ayyanahalli,k.ayyanahalli,kotturu,ballari,100.0,"POLYGON ((628552.4588000005 1638879.131, 62857...","POLYGON ((628552.4588000005 1638879.131, 62857..."
2865,k.ayyenahalli,gp,ballari,kudligi,maize (makka),k.ayyenahalli,kudligi,"[challakere, hagaribommanahalli, jagaluru, kot...",K.Ayyanahalli,k.ayyanahalli,kotturu,ballari,92.0,"POLYGON ((628552.4588000005 1638879.131, 62857...","POLYGON ((628552.4588000005 1638879.131, 62857..."
4150,mangala,gp,chamarajanagara,chamarajanagara,maize (makka),mangala,chamarajanagara,"[gundlupet, kollegala, kollegala(hanur), nanja...",Mangala,mangala,chamarajanagara,chamarajanagara,100.0,POLYGON ((715249.4564999995 1331456.6883999996...,POLYGON ((715249.4564999995 1331456.6883999996...
4152,mangala,gp,chamarajanagara,kollegala,maize (makka),mangala,kollegala,"[chamarajanagara, kollegala(hanur), malavalli,...",Mangala,mangala,chamarajanagara,chamarajanagara,100.0,POLYGON ((715249.4564999995 1331456.6883999996...,POLYGON ((715249.4564999995 1331456.6883999996...
4601,naganur,gp,belagavi,gokak,maize (makka),naganur,gokak,"[bailhongal, belagavi, chikkodi, hukkeri, muda...",Naganur,naganur,bailhongal,belagavi,100.0,POLYGON ((480837.7021000001 1751139.9185000015...,POLYGON ((480837.7021000001 1751139.9185000015...
4606,naganuru,gp,belagavi,gokak,maize (makka),naganuru,gokak,"[bailhongal, belagavi, chikkodi, hukkeri, muda...",Naganur,naganur,bailhongal,belagavi,93.0,POLYGON ((480837.7021000001 1751139.9185000015...,POLYGON ((480837.7021000001 1751139.9185000015...


In [49]:
grampan_sorghum = valid_grampan[valid_grampan['Crop'] == 'sorghum (jowar/great millet)'] ## 2251
grampan_sorghum['geometry_string'] = grampan_sorghum.geometry.astype(str)
#type(grampan_paddy.loc[0,'geometry_string'])
grampan_sorghum['geometry_string'].drop_duplicates() # 2250
# grampan_paddy_check = grampan_paddy.drop_duplicates(subset = ['shapeName', 'District', 'Taluk']) # 2188
grampan_sorghum[grampan_sorghum.duplicated(subset = 'geometry_string', keep = False) == True]#['shapeName'].unique() just 18 rows, but delete some

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grampan_sorghum['geometry_string'] = grampan_sorghum.geometry.astype(str)


Unnamed: 0,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,Crop,shapeName,KGISTalukN,neighbors,Insurance_name,shape_name,Taluk_name,District_name,Similarity,geometry,geometry_string
1277,chincholli,gp,kalburgi,chincholi,sorghum (jowar/great millet),chincholli,chincholi,"[chittaguppa, kalagi, kamalapura, sedam]",Chincholli (H),chincholli (h),kalagi,kalburgi,91.0,"POLYGON ((724787.7580999996 1926114.849400001,...","POLYGON ((724787.7580999996 1926114.849400001,..."
1278,chincholli,gp,kalburgi,chittapur,sorghum (jowar/great millet),chincholli,chittapur,"[gulbarga, jevargi, kalagi, sedam, shahapur, s...",Chincholli (H),chincholli (h),kalagi,kalburgi,91.0,"POLYGON ((724787.7580999996 1926114.849400001,...","POLYGON ((724787.7580999996 1926114.849400001,..."


In [50]:
# yields_interest = yields[(yields['Crop'] == 'paddy') | (yields['Crop'] == 'maize (makka)') | (yields['Crop'] == 'sorghum (jowar/great millet)')].reset_index(drop=True)
# yields_interest[(yields_interest['Insurance Unit'] == 'chincholli')].sort_values(['Season', 'IRR_RF', 'Year'])

# CHECK FOR NEW DUPLICATES

In [51]:
grampan_paddy_copy = valid_grampan_copy[valid_grampan_copy['Crop'] == 'paddy'] ## 2251
grampan_paddy_copy['geometry_string'] = grampan_paddy_copy.geometry.astype(str)
#type(grampan_paddy_copy.loc[0,'geometry_string'])
grampan_paddy_copy['geometry_string'].drop_duplicates() # 2250
# grampan_paddy_maize = grampan_maize.drop_duplicates(subset = ['shapeName', 'District', 'Taluk']) # 2188
grampan_paddy_copy[grampan_paddy_copy.duplicated(subset = 'geometry_string', keep = False) == True]#['shapeName'].unique() #just 18 rows, but delete some
# delete 3458

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grampan_paddy_copy['geometry_string'] = grampan_paddy_copy.geometry.astype(str)


Unnamed: 0,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,Crop,shapeName,KGISTalukN,neighbors,Insurance_name,shape_name,Taluk_name,District_name,Similarity,geometry,geometry_string


In [52]:
grampan_maize_copy = valid_grampan_copy[valid_grampan_copy['Crop'] == 'maize (makka)'] ## 2251
grampan_maize_copy['geometry_string'] = grampan_maize_copy.geometry.astype(str)
#type(grampan_maize.loc[0,'geometry_string'])
grampan_maize_copy['geometry_string'].drop_duplicates() # 2250
# grampan_paddy_maize = grampan_maize.drop_duplicates(subset = ['shapeName', 'District', 'Taluk']) # 2188
grampan_maize_copy[grampan_maize_copy.duplicated(subset = 'geometry_string', keep = False) == True]#['shapeName'].unique() #just 18 rows, but delete some
# delete 3458

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grampan_maize_copy['geometry_string'] = grampan_maize_copy.geometry.astype(str)


Unnamed: 0,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,Crop,shapeName,KGISTalukN,neighbors,Insurance_name,shape_name,Taluk_name,District_name,Similarity,geometry,geometry_string
4601,naganur,gp,belagavi,bailhongal,maize (makka),naganur,gokak,"[bailhongal, belagavi, chikkodi, hukkeri, muda...",Naganur,naganur,bailhongal,belagavi,100.0,POLYGON ((480837.7021000001 1751139.9185000015...,POLYGON ((480837.7021000001 1751139.9185000015...
4606,naganur,gp,belagavi,bailhongal,maize (makka),naganuru,gokak,"[bailhongal, belagavi, chikkodi, hukkeri, muda...",Naganur,naganur,bailhongal,belagavi,100.0,POLYGON ((480837.7021000001 1751139.9185000015...,POLYGON ((480837.7021000001 1751139.9185000015...
5883,turanur,gp,belagavi,ramadurg,maize (makka),turanur,ramadurg,"[badami, gokak, mudalagi, mudhol, naragund, sa...",Turanur,turanur,ramadurg,belagavi,100.0,POLYGON ((532951.4619000003 1766952.2432999993...,POLYGON ((532951.4619000003 1766952.2432999993...
5884,turanur,gp,belagavi,ramadurg,maize (makka),turanura,ramadurg,"[badami, gokak, mudalagi, mudhol, naragund, sa...",Turanur,turanur,ramadurg,belagavi,100.0,POLYGON ((532951.4619000003 1766952.2432999993...,POLYGON ((532951.4619000003 1766952.2432999993...


In [53]:
yields_interest_copy[(yields_interest_copy['Insurance Unit'] == 'naganur') & (yields_interest_copy['District'] == 'belagavi')]
yields_interest_copy[(yields_interest_copy['Insurance Unit'] == 'turanur') & (yields_interest_copy['District'] == 'belagavi')]

Unnamed: 0,Year,Season,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,Crop,IRR_RF,Average Yield(Kg/Ha)
961,2016,Kharif,turanur,gp,belagavi,ramadurg,maize (makka),irr,2993.98
6309,2017,Kharif,turanur,gp,belagavi,ramadurg,maize (makka),irr,7816.24
10978,2018,Kharif,turanur,gp,belagavi,ramadurg,maize (makka),irr,4879.835


In [54]:
yields_interest[((yields_interest['Insurance Unit'] == 'naganur') | (yields_interest['Insurance Unit'] == 'naganuru')) & (yields_interest['District'] == 'belagavi')]
yields_interest[((yields_interest['Insurance Unit'] == 'turanur') | (yields_interest['Insurance Unit'] == 'turanura')) & (yields_interest['District'] == 'belagavi')]

Unnamed: 0,Year,Season,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,Crop,IRR_RF,Average Yield(Kg/Ha)
961,2016,Kharif,turanur,gp,belagavi,ramadurg,maize (makka),irr,2993.98
6309,2017,Kharif,turanura,gp,belagavi,ramadurg,maize (makka),irr,7816.24
10978,2018,Kharif,turanura,gp,belagavi,ramadurg,maize (makka),irr,4879.835


In [55]:
grampan_sorghum_copy = valid_grampan_copy[valid_grampan_copy['Crop'] == 'sorghum (jowar/great millet)'] ## 2251
grampan_sorghum_copy['geometry_string'] = grampan_sorghum_copy.geometry.astype(str)
#type(grampan_sorghum_copy.loc[0,'geometry_string'])
grampan_sorghum_copy['geometry_string'].drop_duplicates() # 2250
# grampan_paddy_check = grampan_paddy.drop_duplicates(subset = ['shapeName', 'District', 'Taluk']) # 2188
grampan_sorghum_copy[grampan_sorghum_copy.duplicated(subset = 'geometry_string', keep = False) == True]#['shapeName'].unique() just 18 rows, but delete some

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grampan_sorghum_copy['geometry_string'] = grampan_sorghum_copy.geometry.astype(str)


Unnamed: 0,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,Crop,shapeName,KGISTalukN,neighbors,Insurance_name,shape_name,Taluk_name,District_name,Similarity,geometry,geometry_string


## CREATE DATAFRAME

In [56]:
yields_interest_copy['shapeName'] = yields_interest_copy['Insurance Unit'].copy()
yields_interest_copy.shapeName = yields_interest_copy.shapeName.str.lower()
yields_interest_copy.shapeName = yields_interest_copy.shapeName.str.strip()

yields_hobli_copy = yields_interest_copy[yields_interest_copy['Gram Panchayat/Hobli'] == 'h']
yields_grampan_copy = yields_interest_copy[yields_interest_copy['Gram Panchayat/Hobli'] == 'gp']

In [57]:
merge_hobli = yields_hobli_copy.merge(valid_hobli, how = 'left', on = ['shapeName', 'Gram Panchayat/Hobli', 'District', 'Taluk', 'Crop'])
merge_grampan_copy = yields_grampan_copy.merge(valid_grampan_copy, how = 'left', on = ['shapeName', 'Gram Panchayat/Hobli', 'District', 'Taluk', 'Crop'])
print(np.size(merge_grampan_copy,0))
merge_grampan_copy = merge_grampan_copy[~merge_grampan_copy['geometry'].isna()]
print(np.size(merge_grampan_copy,0))
yields_geometries_copy = pd.concat([merge_grampan_copy, merge_hobli], ignore_index=True)
print(26466-25558) # We have 908 observations for which we miss a geometry

19360
18452
908


In [58]:
#yields_geometries_copy[yields_geometries_copy['shapeName'] != yields_geometries_copy['shape_name']]

In [59]:
yields_geometries_copy = yields_geometries_copy[['Year', 'Season', 'Insurance Unit_x', 'shapeName', 'Gram Panchayat/Hobli', 'District', 'Taluk', 'Crop', 'IRR_RF', 'Average Yield(Kg/Ha)', 'geometry']]
yields_geometries_copy = yields_geometries_copy.rename(columns = {'Insurance Unit_x': 'Insurance Unit'})

print(df_t.crs) # epsg:32643
print(df_h.crs) # epsg:32643
print(df_gp.crs) # epsg:32643

# Transform the geometry and yields_geometry dataframes to geodataframes
# Then, save the geometry and yields_geometry geodataframes
yields_geometries_copy = gpd.GeoDataFrame(yields_geometries_copy, geometry = yields_geometries_copy.geometry, crs = {'init': 'epsg:32643'}) # 32643 is the original crs (found by using .crs for one of the input shp files)
print(yields_geometries_copy.crs) # epsg:32643
yields_geometries_copy.geometry = yields_geometries_copy.geometry.to_crs(epsg = 4326) # Set crs to 4326
print(yields_geometries_copy.crs) # epsg:32643
yields_geometries_copy.to_file(r'D:\other_thesis\codes_latestversion\general\files\yields_geometry_largestarea.geojson', driver="GeoJSON") # Save the derived datframe as it takes a long time to run
yields_geometries_copy.head(1)

epsg:32643
epsg:32643
epsg:32643
+init=epsg:32643 +type=crs


  in_crs_string = _prepare_from_proj_string(in_crs_string)


epsg:4326


  pd.Int64Index,


Unnamed: 0,Year,Season,Insurance Unit,shapeName,Gram Panchayat/Hobli,District,Taluk,Crop,IRR_RF,Average Yield(Kg/Ha),geometry
0,2016,Kharif,adagall,adagall,gp,bagalkot,badami,maize (makka),irr,4739.61,"POLYGON ((75.62246 15.97203, 75.62479 15.97126..."


In [60]:
print('Paddy total', np.size(yields_geometries_copy[yields_geometries_copy['Crop'] == 'paddy'],0)) # 8412 (total: 8760)
print('Paddy kharif', np.size(yields_geometries_copy[(yields_geometries_copy['Crop'] == 'paddy') & (yields_geometries_copy['Season'] == 'Kharif')],0)) # 6841 (total: 7115)

print('Maize total', np.size(yields_geometries_copy[yields_geometries_copy['Crop'] == 'maize (makka)'],0)) # 7411 (total: 7710)
print('Maize kharif', np.size(yields_geometries_copy[(yields_geometries_copy['Crop'] == 'maize (makka)') & (yields_geometries_copy['Season'] == 'Kharif')],0)) # 6555 (total: 6803)

print('Sorghum total', np.size(yields_geometries_copy[yields_geometries_copy['Crop'] == 'sorghum (jowar/great millet)'],0)) # 5377 (total: 5644)
print('Sorghum kharif', np.size(yields_geometries_copy[(yields_geometries_copy['Crop'] == 'sorghum (jowar/great millet)') & (yields_geometries_copy['Season'] == 'Rabi')],0)) # 4602 (total: 4862)


Paddy total 8412
Paddy kharif 6841
Maize total 7411
Maize kharif 6555
Sorghum total 5377
Sorghum kharif 4602


In [61]:
yields_interest_copy.head(1)
yields_interest_copy[(yields_interest_copy['Insurance Unit'] == 'k.ayyenahalli') | (yields_interest_copy['Insurance Unit'] == 'k.ayyanahalli')]
yields_interest_copy[(yields_interest_copy['Insurance Unit'] == 'turanur') | (yields_interest_copy['Insurance Unit'] == 'turanuru')]
yields_interest_copy[((yields_interest_copy['Insurance Unit'] == 'naganur') | (yields_interest_copy['Insurance Unit'] == 'naganuru')) & (yields_interest_copy['District'] == 'belagavi')]

Unnamed: 0,Year,Season,Insurance Unit,Gram Panchayat/Hobli,District,Taluk,Crop,IRR_RF,Average Yield(Kg/Ha),shapeName
738,2016,Kharif,naganur,gp,belagavi,bailhongal,maize (makka),irr,5469.97,naganur
6106,2017,Kharif,naganur,gp,belagavi,bailhongal,maize (makka),irr,5658.63,naganur
10811,2018,Kharif,naganur,gp,belagavi,bailhongal,maize (makka),irr,5133.943,naganur
14936,2016,Rabi,naganur,gp,belagavi,bailhongal,sorghum (jowar/great millet),rf,237.29,naganur
15057,2016,Rabi,naganur,gp,belagavi,bailhongal,maize (makka),irr,3878.16,naganur
16757,2017,Rabi,naganur,gp,belagavi,bailhongal,sorghum (jowar/great millet),rf,475.01,naganur
16881,2017,Rabi,naganur,gp,belagavi,bailhongal,maize (makka),irr,3745.21,naganur
19112,2018,Rabi,naganur,gp,belagavi,bailhongal,sorghum (jowar/great millet),rf,356.643,naganur
19207,2018,Rabi,naganur,gp,belagavi,bailhongal,maize (makka),irr,4805.235,naganur


In [62]:
yields_geometries_copy.head(1)
yields_geometries_copy[(yields_geometries_copy['Insurance Unit'] == 'k.ayyenahalli') | (yields_geometries_copy['Insurance Unit'] == 'k.ayyanahalli')]
yields_geometries_copy[(yields_geometries_copy['Insurance Unit'] == 'turanur') | (yields_geometries_copy['Insurance Unit'] == 'turanuru')]
yields_geometries_copy[((yields_geometries_copy['Insurance Unit'] == 'naganur') | (yields_geometries_copy['Insurance Unit'] == 'naganuru')) & (yields_geometries_copy['District'] == 'belagavi')]

Unnamed: 0,Year,Season,Insurance Unit,shapeName,Gram Panchayat/Hobli,District,Taluk,Crop,IRR_RF,Average Yield(Kg/Ha),geometry
574,2016,Kharif,naganur,naganur,gp,belagavi,bailhongal,maize (makka),irr,5469.97,"POLYGON ((74.82104 15.83905, 74.82103 15.83905..."
4967,2017,Kharif,naganur,naganur,gp,belagavi,bailhongal,maize (makka),irr,5658.63,"POLYGON ((74.82104 15.83905, 74.82103 15.83905..."
8951,2018,Kharif,naganur,naganur,gp,belagavi,bailhongal,maize (makka),irr,5133.943,"POLYGON ((74.82104 15.83905, 74.82103 15.83905..."
12506,2016,Rabi,naganur,naganur,gp,belagavi,bailhongal,sorghum (jowar/great millet),rf,237.29,"POLYGON ((74.82104 15.83905, 74.82103 15.83905..."
12608,2016,Rabi,naganur,naganur,gp,belagavi,bailhongal,maize (makka),irr,3878.16,"POLYGON ((74.82104 15.83905, 74.82103 15.83905..."
14045,2017,Rabi,naganur,naganur,gp,belagavi,bailhongal,sorghum (jowar/great millet),rf,475.01,"POLYGON ((74.82104 15.83905, 74.82103 15.83905..."
14152,2017,Rabi,naganur,naganur,gp,belagavi,bailhongal,maize (makka),irr,3745.21,"POLYGON ((74.82104 15.83905, 74.82103 15.83905..."
16001,2018,Rabi,naganur,naganur,gp,belagavi,bailhongal,sorghum (jowar/great millet),rf,356.643,"POLYGON ((74.82104 15.83905, 74.82103 15.83905..."
16083,2018,Rabi,naganur,naganur,gp,belagavi,bailhongal,maize (makka),irr,4805.235,"POLYGON ((74.82104 15.83905, 74.82103 15.83905..."


In [63]:
np.size(yields_geometries_copy,0) # 21200
np.size(yields_interest_copy,0) # 22108
print(22108-21200)
#yields_geometries_copy[yields_geometries_copy['geometry'].isna()] # 0 

908
