In [1]:
from pathlib import Path
import pandas as pd
from fuzzywuzzy import process



In [2]:
# Locate the data folders relative to the directory the kernel was started in.
dir_path = Path.cwd()
raw_data_path = dir_path / "data" / "raw"
interim_data_path = dir_path / "data" / "interim"
processed_data_path = dir_path / "data" / "processed"
ext_data_path = dir_path / "data" / "external"

In [3]:
# Load the district table from the external-data folder.
lgd = pd.read_csv(ext_data_path / "lgd_district.csv")

In [4]:
# Remove the four census-code columns; they are not used anywhere below.
lgd = lgd.drop(
    columns=[
        "St_Cs2011_code",
        "St_Cs2001_code",
        "Dt_Cs2011_code",
        "Dt_Cs2001_code",
    ]
)


In [5]:
# Shorten the verbose name columns to "state" / "district".
lgd_column_names = {
    "State Name(In English)": "state",
    "District Name(In English)": "district",
}
lgd = lgd.rename(columns=lgd_column_names)

In [6]:
# Build the composite merge key "<state><district>" for every row at once.
# Trailing whitespace is stripped from the state name only; the district
# name is appended unchanged.
# This replaces a row-by-row loop that hard-coded the number of rows and wrote
# through chained indexing (lgd["state_dist"][i] = ...), which triggers
# SettingWithCopyWarning and does not update the frame under copy-on-write.
lgd["state_dist"] = lgd["state"].str.rstrip() + lgd["district"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Display lgd to inspect the table, including its state_dist key column.
lgd

Unnamed: 0,St_LGD_code,state,Dt_LGD_code,district,state_dist
0,35,ANDAMAN AND NICOBAR ISLANDS,603,NICOBARS,ANDAMAN AND NICOBAR ISLANDSNICOBARS
1,35,ANDAMAN AND NICOBAR ISLANDS,632,NORTH AND MIDDLE ANDAMAN,ANDAMAN AND NICOBAR ISLANDSNORTH AND MIDDLE AN...
2,35,ANDAMAN AND NICOBAR ISLANDS,602,SOUTH ANDAMANS,ANDAMAN AND NICOBAR ISLANDSSOUTH ANDAMANS
3,28,ANDHRA PRADESH ...,502,ANANTAPUR,ANDHRA PRADESHANANTAPUR
4,28,ANDHRA PRADESH ...,503,CHITTOOR,ANDHRA PRADESHCHITTOOR
...,...,...,...,...,...
729,19,WEST BENGAL ...,319,MURSHIDABAD,WEST BENGALMURSHIDABAD
730,19,WEST BENGAL ...,320,NADIA,WEST BENGALNADIA
731,19,WEST BENGAL ...,704,PASCHIM BARDHAMAN,WEST BENGALPASCHIM BARDHAMAN
732,19,WEST BENGAL ...,306,PURBA BARDHAMAN,WEST BENGALPURBA BARDHAMAN


In [8]:
# Load the non-SFAC records from the processed-data folder.
non_sfac = pd.read_csv(processed_data_path / "non_sfac.csv")

In [9]:
# List the column labels of non_sfac.
non_sfac.columns

Index(['state', 'fpo_name', 'legal_form', 'reg_no', 'address',
       'contact_details', 'major_crop_names', 'district', 'regn_date'],
      dtype='object')

In [10]:
# Upper-case both name columns before building the merge key.
for column in ("state", "district"):
    non_sfac[column] = non_sfac[column].str.upper()

In [11]:
# Composite merge key: state name immediately followed by district name.
# NOTE(review): no whitespace is stripped on this side — confirm the state
# values carry no trailing padding.
non_sfac = non_sfac.assign(state_dist=non_sfac["state"] + non_sfac["district"])

In [12]:
# Outer-join the records with the district table on the composite key.
# validate="m:1" makes pandas raise if state_dist is not unique in lgd, and
# indicator=True adds a "_merge" column telling which side each row came from.
df = non_sfac.merge(
    lgd,
    on="state_dist",
    how="outer",
    validate="m:1",
    indicator=True,
    suffixes=("_DATA", "_LGD"),
)

In [13]:
# Count rows per merge outcome recorded in the "_merge" indicator column.
df['_merge'].value_counts()

right_only    658
both          297
left_only      41
Name: _merge, dtype: int64

In [14]:
# List the merged frame's columns (overlapping names carry the merge suffixes).
df.columns

Index(['state_DATA', 'fpo_name', 'legal_form', 'reg_no', 'address',
       'contact_details', 'major_crop_names', 'district_DATA', 'regn_date',
       'state_dist', 'St_LGD_code', 'state_LGD', 'Dt_LGD_code', 'district_LGD',
       '_merge'],
      dtype='object')

In [15]:
# Rows present only on the left side of the merge, i.e. records whose key
# found no counterpart in lgd; keep just the identifying columns.
is_left_only = df["_merge"] == "left_only"
not_lgd_mapped = df.loc[
    is_left_only,
    ["state_DATA", "district_DATA", "state_dist"],
]

In [16]:
# One row per distinct unmatched key (the first occurrence is retained).
not_lgd_mapped = not_lgd_mapped.drop_duplicates(subset=["state_dist"], keep="first")

In [17]:
# Remove the row whose index label is 45 before fuzzy matching.
# NOTE(review): the label is hard-coded and depends on the merge's row order;
# this raises KeyError if label 45 is absent — confirm which record it is
# and consider filtering on its content instead.
not_lgd_mapped = not_lgd_mapped.drop(index=[45])

In [18]:
# For every unmatched key, ask fuzzywuzzy for the single closest key in lgd.
# Each hit is stored as three fields (match, score, id); the id field is
# discarded straight away.
best_matches = []
for unmatched_key in not_lgd_mapped["state_dist"]:
    best_matches.append(process.extractOne(unmatched_key, lgd["state_dist"]))
result = pd.DataFrame(best_matches, columns=["match", "score", "id"])
result = result.drop(columns="id")

In [19]:
# Show the best match and its score for each unmatched key.
result

Unnamed: 0,match,score
0,ASSAMNAGAON,91
1,ASSAMBAKSA,80
2,GUJARATKACHCHH,90
3,TAMIL NADUKANNIYAKUMARI,95
4,TELANGANAMAHABUBNAGAR,98
5,TELANGANAWARANGAL RURAL,95
6,UTTAR PRADESHHAMIRPUR,91
7,UTTAR PRADESHBANDA,100
8,UTTAR PRADESHAGRA,91
9,UTTAR PRADESHKasganj,100


In [20]:
# Single-column frame of the unmatched keys, renumbered from 0 so it lines up
# row-for-row with the fuzzy-match results.
not_lgd_proxy_df = not_lgd_mapped[["state_dist"]].reset_index(drop=True)

In [21]:
# Put each unmatched key next to its best fuzzy match and score.
# ignore_index=True along axis=1 relabels the columns positionally, so the
# result's columns are 0 (original key), 1 (matched key) and 2 (score).
# The earlier `names=[...]` argument was removed: in pd.concat it only names
# the levels of a hierarchical index built from `keys`, so it never labelled
# these columns and merely suggested that it did.
mapper_df = pd.concat(
    [not_lgd_proxy_df, result],
    axis=1,
    ignore_index=True,
)

In [22]:
# Keep only candidates whose value in column 2 (the match score) is at least 90.
mapper_df = mapper_df.loc[mapper_df[2] >= 90]

In [23]:
# Map each accepted original key (column 0) to its matched key (column 1) and
# rewrite the keys in non_sfac accordingly.
mapper_dict = {
    original: matched for original, matched in zip(mapper_df[0], mapper_df[1])
}
non_sfac["state_dist"] = non_sfac["state_dist"].replace(mapper_dict)

In [24]:
# Repeat the outer join now that some keys have been rewritten.
# validate="m:1" makes pandas raise if state_dist is not unique in lgd, and
# indicator=True adds a "_merge" column telling which side each row came from.
df = non_sfac.merge(
    lgd,
    on="state_dist",
    how="outer",
    validate="m:1",
    indicator=True,
    suffixes=("_SFAC", "_LGD"),
)


In [25]:
# Count rows per merge outcome recorded in the "_merge" indicator column.
df["_merge"].value_counts()

right_only    649
both          336
left_only       2
Name: _merge, dtype: int64

In [26]:
# Records still without a counterpart in lgd, reduced to the identifying
# columns and to one row per distinct key.
is_left_only = df["_merge"] == "left_only"
not_lgd_mapped = df.loc[
    is_left_only,
    ["state_SFAC", "district_SFAC", "state_dist"],
].drop_duplicates(subset="state_dist")

In [27]:
# Remove the row whose index label is 45 before fuzzy matching.
# NOTE(review): the label is hard-coded and depends on the merge's row order;
# this raises KeyError if label 45 is absent — confirm which record it is
# and consider filtering on its content instead.
not_lgd_mapped = not_lgd_mapped.drop(index=[45])

In [28]:
# For every remaining unmatched key, ask fuzzywuzzy for the single closest key
# in lgd. Each hit is stored as three fields (match, score, id); the id field
# is discarded straight away.
best_matches = []
for unmatched_key in not_lgd_mapped["state_dist"]:
    best_matches.append(process.extractOne(unmatched_key, lgd["state_dist"]))
result = pd.DataFrame(best_matches, columns=["match", "score", "id"])
result = result.drop(columns="id")

In [29]:
# Single-column frame of the unmatched keys, renumbered from 0 so it lines up
# row-for-row with the fuzzy-match results.
not_lgd_proxy_df = not_lgd_mapped[["state_dist"]].reset_index(drop=True)

In [30]:
# Put each unmatched key next to its best fuzzy match and score.
# ignore_index=True along axis=1 relabels the columns positionally, so the
# result's columns are 0 (original key), 1 (matched key) and 2 (score).
# The earlier `names=[...]` argument was removed: in pd.concat it only names
# the levels of a hierarchical index built from `keys`, so it never labelled
# these columns and merely suggested that it did.
mapper_df = pd.concat(
    [not_lgd_proxy_df, result],
    axis=1,
    ignore_index=True,
)

In [31]:
# Show the unmatched keys beside their candidate match and score (columns 0, 1, 2).
mapper_df

Unnamed: 0,0,1,2
0,ASSAMMANSA,ASSAMBAKSA,80


In [32]:
# Keep only candidates whose value in column 2 (the match score) is at least 90.
mapper_df = mapper_df.loc[mapper_df[2] >= 90]

In [33]:
# Map each accepted original key (column 0) to its matched key (column 1) and
# rewrite the keys in non_sfac accordingly.
mapper_dict = {
    original: matched for original, matched in zip(mapper_df[0], mapper_df[1])
}
non_sfac["state_dist"] = non_sfac["state_dist"].replace(mapper_dict)

In [34]:
# Final outer join of the records with the district table.
# validate="m:1" makes pandas raise if state_dist is not unique in lgd, and
# indicator=True adds a "_merge" column telling which side each row came from.
df = non_sfac.merge(
    lgd,
    on="state_dist",
    how="outer",
    validate="m:1",
    indicator=True,
    suffixes=("_SFAC", "_LGD"),
)


In [35]:
# Count rows per merge outcome recorded in the "_merge" indicator column.
df["_merge"].value_counts()

right_only    649
both          336
left_only       2
Name: _merge, dtype: int64

In [36]:
# Records still without a counterpart in lgd, reduced to the identifying
# columns and to one row per distinct key.
is_left_only = df["_merge"] == "left_only"
not_lgd_mapped = df.loc[
    is_left_only,
    ["state_SFAC", "district_SFAC", "state_dist"],
].drop_duplicates(subset="state_dist")

In [37]:
# Number of distinct state_dist keys that are still unmatched.
len(not_lgd_mapped)

2

In [38]:
# Save the unmatched keys for manual follow-up. The path is relative, so the
# file lands in the current working directory, and the frame's index is
# written as the first column.
not_lgd_mapped.to_csv(path_or_buf="non_sfac_unmapped.csv", index=True)

In [56]:
# Keep only the rows that matched on both sides of the merge.
df1 = df.loc[df["_merge"] == "both"]

In [57]:
# Discard the merge indicator and the lgd-side name columns.
df1 = df1.drop(columns=["_merge", "district_LGD", "state_LGD"])

In [58]:
# List df1's column labels as a plain Python list.
df1.columns.to_list()

['state_SFAC',
 'fpo_name',
 'legal_form',
 'reg_no',
 'address',
 'contact_details',
 'major_crop_names',
 'district_SFAC',
 'regn_date',
 'state_dist',
 'St_LGD_code',
 'Dt_LGD_code']

In [59]:
# Desired column order for the output: location and code columns first,
# followed by the remaining record fields.
l = [
    "state_SFAC",
    "district_SFAC",
    "state_dist",
    "St_LGD_code",
    "Dt_LGD_code",
    "fpo_name",
    "legal_form",
    "reg_no",
    "address",
    "contact_details",
    "major_crop_names",
    "regn_date",
]

In [60]:
# Reorder the columns to the sequence listed in `l`.
df1 = df1.loc[:, l]

In [61]:
# Drop the merge suffix from the two name columns.
final_column_names = {
    'state_SFAC': 'state',
    'district_SFAC': 'district',
}
df1 = df1.rename(columns=final_column_names)

In [64]:
# Write the matched, reordered records to the processed-data folder without
# the index column.
df1.to_csv(processed_data_path / "non_sfac_lgd.csv", index=False)

In [3]:
# Read the matched records back from the processed-data folder.
data = pd.read_csv(processed_data_path / "non_sfac_lgd.csv")

In [5]:
# Remove the helper merge key and give the code columns their final names.
data = data.drop(columns=['state_dist']).rename(
    columns={
        'St_LGD_code': 'state_lgd_code',
        'Dt_LGD_code': 'district_lgd_code',
    }
)

In [7]:
# Overwrite the processed file with the cleaned-up columns (no index column).
data.to_csv(processed_data_path / "non_sfac_lgd.csv", index=False)