In [1]:
from pathlib import Path
import pandas as pd
from fuzzywuzzy import process



In [2]:
dir_path = Path.cwd()
raw_data_path = Path.joinpath(dir_path, "data", "raw")
interim_data_path = Path.joinpath(dir_path, "data", "interim")
processed_data_path = Path.joinpath(dir_path, "data", "processed")
ext_data_path = Path.joinpath(dir_path, "data", "external")

In [3]:
lgd = pd.read_csv(Path.joinpath(ext_data_path, "lgd_district.csv"))

In [4]:
lgd.drop(["St_Cs2011_code","St_Cs2001_code","Dt_Cs2011_code", "Dt_Cs2001_code"], axis=1, inplace=True)


In [5]:
lgd = lgd.rename(
    columns={
        "State Name(In English)": "state",
        "District Name(In English)": "district",
    }
)

In [6]:
lgd["state_dist"] = ""
for i in range(0, 734):
    lgd["state_dist"][i] = lgd["state"][i].rstrip() + lgd["district"][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
lgd

Unnamed: 0,St_LGD_code,state,Dt_LGD_code,district,state_dist
0,35,ANDAMAN AND NICOBAR ISLANDS,603,NICOBARS,ANDAMAN AND NICOBAR ISLANDSNICOBARS
1,35,ANDAMAN AND NICOBAR ISLANDS,632,NORTH AND MIDDLE ANDAMAN,ANDAMAN AND NICOBAR ISLANDSNORTH AND MIDDLE AN...
2,35,ANDAMAN AND NICOBAR ISLANDS,602,SOUTH ANDAMANS,ANDAMAN AND NICOBAR ISLANDSSOUTH ANDAMANS
3,28,ANDHRA PRADESH ...,502,ANANTAPUR,ANDHRA PRADESHANANTAPUR
4,28,ANDHRA PRADESH ...,503,CHITTOOR,ANDHRA PRADESHCHITTOOR
...,...,...,...,...,...
729,19,WEST BENGAL ...,319,MURSHIDABAD,WEST BENGALMURSHIDABAD
730,19,WEST BENGAL ...,320,NADIA,WEST BENGALNADIA
731,19,WEST BENGAL ...,704,PASCHIM BARDHAMAN,WEST BENGALPASCHIM BARDHAMAN
732,19,WEST BENGAL ...,306,PURBA BARDHAMAN,WEST BENGALPURBA BARDHAMAN


In [8]:
cattle = pd.read_csv(Path.joinpath(processed_data_path, "cattle.csv"))
buffalo = pd.read_csv(Path.joinpath(processed_data_path, "buffalo.csv"))
goat = pd.read_csv(Path.joinpath(processed_data_path, "goat.csv"))
pig = pd.read_csv(Path.joinpath(processed_data_path, "pig.csv"))
sheep = pd.read_csv(Path.joinpath(processed_data_path, "sheep.csv"))

In [9]:
cattle["state_name"] = cattle["state_name"].str.upper()
cattle["district_name"] = cattle["district_name"].str.upper()
pig["state_name"] = pig["state_name"].str.upper()
pig["district_name"] = pig["district_name"].str.upper()
buffalo["state_name"] = buffalo["state_name"].str.upper()
buffalo["district_name"] = buffalo["district_name"].str.upper()
sheep["state_name"] = sheep["state_name"].str.upper()
sheep["district_name"] = sheep["district_name"].str.upper()
goat["state_name"] = goat["state_name"].str.upper()
goat["district_name"] = goat["district_name"].str.upper()

In [10]:
cattle['state_dist']  = cattle['state_name'] + cattle['district_name']
buffalo['state_dist']  = buffalo['state_name'] + buffalo['district_name']
pig['state_dist']  = pig['state_name'] + pig['district_name']
goat['state_dist']  = goat['state_name'] + goat['district_name']
sheep['state_dist']  = sheep['state_name'] + sheep['district_name']

In [11]:
c1 = pd.merge(
            cattle,
            lgd,
            how="outer",
            left_on="state_dist",
            right_on="state_dist",
            validate="m:1",
            indicator=True,
            suffixes=["_DATA", "_LGD"],
        )

In [12]:
c1

Unnamed: 0,state_name,district_name,female,breed_type_name,upto_one_and_half_years,used_for_breeding_only,used_for_agriculture_only,agriculture_and_breeding,bullock_cart_farm_operations,under_one_ year_female,...,others,total_male,total_female,total,state_dist,St_LGD_code,state,Dt_LGD_code,district,_merge
0,ANDAMAN AND NICOBAR ISLANDS,NICOBARS,0.0,Exotic,0.0,0.0,0.0,0.0,0.0,,...,0.0,0.0,,40.0,ANDAMAN AND NICOBAR ISLANDSNICOBARS,35.0,ANDAMAN AND NICOBAR ISLANDS,603.0,NICOBARS,both
1,ANDAMAN AND NICOBAR ISLANDS,NICOBARS,1.0,Exotic,,,,,,14.0,...,0.0,,40.0,40.0,ANDAMAN AND NICOBAR ISLANDSNICOBARS,35.0,ANDAMAN AND NICOBAR ISLANDS,603.0,NICOBARS,both
2,ANDAMAN AND NICOBAR ISLANDS,NICOBARS,1.0,Indigenous(Desi),,,,,,337.0,...,22.0,,1498.0,1964.0,ANDAMAN AND NICOBAR ISLANDSNICOBARS,35.0,ANDAMAN AND NICOBAR ISLANDS,603.0,NICOBARS,both
3,ANDAMAN AND NICOBAR ISLANDS,NICOBARS,0.0,Indigenous(Desi),169.0,109.0,45.0,75.0,10.0,,...,58.0,466.0,,1964.0,ANDAMAN AND NICOBAR ISLANDSNICOBARS,35.0,ANDAMAN AND NICOBAR ISLANDS,603.0,NICOBARS,both
4,ANDAMAN AND NICOBAR ISLANDS,SOUTH ANDAMANS,1.0,Indigenous(Desi),,,,,,1947.0,...,26.0,,7506.0,9808.0,ANDAMAN AND NICOBAR ISLANDSSOUTH ANDAMANS,35.0,ANDAMAN AND NICOBAR ISLANDS,602.0,SOUTH ANDAMANS,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2850,,,,,,,,,,,...,,,,,UTTARAKHANDRUDRA PRAYAG,5.0,UTTARAKHAND,54.0,RUDRA PRAYAG,right_only
2851,,,,,,,,,,,...,,,,,UTTARAKHANDUDAM SINGH NAGAR,5.0,UTTARAKHAND,56.0,UDAM SINGH NAGAR,right_only
2852,,,,,,,,,,,...,,,,,UTTARAKHANDUTTAR KASHI,5.0,UTTARAKHAND,57.0,UTTAR KASHI,right_only
2853,,,,,,,,,,,...,,,,,WEST BENGALAlipurduar,19.0,WEST BENGAL ...,664.0,Alipurduar,right_only


In [13]:
not_lgd_mapped = c1[(c1["_merge"] == "left_only")][
            [
                "state_name",
                "district_name",
                "state_dist",
            ]
        ]

In [14]:
not_lgd_mapped = not_lgd_mapped.drop_duplicates(subset="state_dist")

In [15]:
len(not_lgd_mapped)

45

In [16]:
result = [
            process.extractOne(i, lgd["state_dist"])
            for i in not_lgd_mapped["state_dist"]
        ]
result = pd.DataFrame(result, columns=["match", "score", "id"])
result.drop("id", axis=1, inplace=True)

In [17]:
result

Unnamed: 0,match,score
0,ASSAMKAMRUP,95
1,ASSAMSOUTH SALMARA MANCACHAR,90
2,ASSAMMARIGAON,92
3,CHHATTISGARHKABIRDHAM,95
4,THE DADRA AND NAGAR HAVELI AND DAMAN AND DIUDAMAN,90
5,THE DADRA AND NAGAR HAVELI AND DAMAN AND DIUDIU,90
6,GUJARATMahisagar,100
7,JAMMU AND KASHMIRBudgam,96
8,JAMMU AND KASHMIRRAJOURI,96
9,KARNATAKACHIKKAMAGALURU,93


In [18]:
not_lgd_proxy_df = (
                pd.DataFrame(not_lgd_mapped["state_dist"], index=None)
                .reset_index()
                .drop("index", axis=1)
        )

In [19]:
mapper_df = pd.concat(
            [not_lgd_proxy_df, result],
            axis=1,
            ignore_index=True,
            names=["original", "match", "score"],
        )

In [20]:
mapper_df = mapper_df[mapper_df[2] >= 90]

In [21]:
mapper_dict = dict(zip(mapper_df[0], mapper_df[1]))
cattle["state_dist"] = cattle["state_dist"].replace(mapper_dict)

In [22]:
c1 = pd.merge(
            cattle,
            lgd,
            how="outer",
            left_on="state_dist",
            right_on="state_dist",
            validate="m:1",
            indicator=True,
            suffixes=["_DATA", "_LGD"],
        )


In [23]:
c1["_merge"].value_counts()

both          2776
right_only      38
left_only        0
Name: _merge, dtype: int64

In [24]:
not_lgd_mapped = c1[(c1["_merge"] == "left_only")][
            [
                "state_name",
                "district_name",
                "state_dist",
            ]
        ]
not_lgd_mapped = not_lgd_mapped.drop_duplicates(subset="state_dist")

In [25]:
len(not_lgd_mapped)

0

In [26]:
result = [
            process.extractOne(i, lgd["state_dist"])
            for i in not_lgd_mapped["state_dist"]
        ]
result = pd.DataFrame(result, columns=["match", "score", "id"])
result.drop("id", axis=1, inplace=True)

In [27]:
not_lgd_proxy_df = (
                pd.DataFrame(not_lgd_mapped["state_dist"], index=None)
                .reset_index()
                .drop("index", axis=1)
        )

In [28]:
mapper_df = pd.concat(
            [not_lgd_proxy_df, result],
            axis=1,
            ignore_index=True,
            names=["original", "match", "score"],
        )

In [29]:
mapper_df = mapper_df[mapper_df[2] >= 90]

In [30]:
mapper_dict = dict(zip(mapper_df[0], mapper_df[1]))
cattle["state_dist"] = cattle["state_dist"].replace(mapper_dict)

In [31]:
c1 = pd.merge(
            cattle,
            lgd,
            how="outer",
            left_on="state_dist",
            right_on="state_dist",
            validate="m:1",
            indicator=True,
            suffixes=["_DATA", "_LGD"],
        )


In [32]:
c1["_merge"].value_counts()

both          2776
right_only      38
left_only        0
Name: _merge, dtype: int64

In [33]:
not_lgd_mapped = c1[(c1["_merge"] == "left_only")][
            [
                "state_name",
                "district_name",
                "state_dist",
            ]
        ]
not_lgd_mapped = not_lgd_mapped.drop_duplicates(subset="state_dist")

In [34]:
len(not_lgd_mapped)

0

In [35]:
not_lgd_mapped.to_csv("cattle_unmapped.csv")

In [36]:
for i in range(0, 2776):
    if (cattle['district_name'][i].upper() == 'SORAIDEU'):
        cattle['district_name'][i] = 'CHARAIDEO'
    if (cattle['district_name'][i].upper() == 'SIBSAGAR'):
        cattle['district_name'][i] = 'SIVASAGAR'
    if (cattle['district_name'][i].upper() == 'MEWAT'):
        cattle['district_name'][i] = 'NUH'
    if (cattle['district_name'][i].upper() == 'KARGIL'):
        cattle['state_name'][i] = 'LADAKH'
    if (cattle['district_name'][i].upper() == 'LEH LADAKH'):
        cattle['state_name'][i] = 'LADAKH'
    if (cattle['district_name'][i].upper() == 'DADRA AND NAGAR HAVELI'):
        cattle['state_name'][i] = 'THE DADRA AND NAGAR HAVELI AND DAMAN AND DIU'
    if (cattle['district_name'][i].upper() == 'GULBARGA'):
        cattle['district_name'][i] = 'KALABURAGI'
    if (cattle['district_name'][i].upper() == 'MYSORE'):
        cattle['district_name'][i] = 'MYSURU'
    if (cattle['district_name'][i].upper() == 'BANGALORE RURAL'):
        cattle['district_name'][i] = 'BENGALURU RURAL'
    if (cattle['district_name'][i].upper() == 'SORAIDEU'):
        cattle['district_name'][i] = 'CHARAIDEO'
    if (cattle['district_name'][i].upper() == 'BELGAUM'):
        cattle['district_name'][i] = 'BELAGAVI'
    if (cattle['district_name'][i].upper() == 'BELLARY'):
        cattle['district_name'][i] = 'BALLARI'
    if (cattle['district_name'][i].upper() in 'NAWANSHAHR (SBS NAGAR)'):
        cattle['district_name'][i] = 'SHAHID BHAGAT SINGH NAGAR'
    if (cattle['district_name'][i].upper() == 'MUKTSAR'):
        cattle['district_name'][i] = 'SRI MUKTSAR SAHIB'
    if (cattle['district_name'][i].upper() == 'ALLAHABAD'):
        cattle['district_name'][i] = 'PRAYAGRAJ'
    if (cattle['district_name'][i].upper() == 'SANT RAVIDAS NAGAR'):
        cattle['district_name'][i] = 'BHADOHI'
    if (cattle['district_name'][i].upper() == 'BARDHAMAN'):
        cattle['district_name'][i] = 'PURBA BARDHAMAN'
    if (cattle['district_name'][i].upper() == 'BIJAPUR') and (cattle['state_name'][i].upper() == 'KARNATAKA'):
        cattle['district_name'][i] = 'VIJAYAPURA' 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [37]:
cattle.to_csv(Path.joinpath(processed_data_path, "cattle.csv"), index=False)

In [38]:
c2 = c1[c1["_merge"] == "both"]

In [39]:
c2 = c2.drop(['_merge','state', 'district'], axis = 1)

In [40]:
c2.columns.to_list()

['state_name',
 'district_name',
 'female',
 'breed_type_name',
 'upto_one_and_half_years',
 'used_for_breeding_only',
 'used_for_agriculture_only',
 'agriculture_and_breeding',
 'bullock_cart_farm_operations',
 'under_one_ year_female',
 'one_to_two_and_half_years',
 'in_milk',
 'dry',
 'not_calved_once',
 'others',
 'total_male',
 'total_female',
 'total',
 'state_dist',
 'St_LGD_code',
 'Dt_LGD_code']

In [41]:
l = ['state_name',
 'district_name',
'state_dist',
 'St_LGD_code',
 'Dt_LGD_code',
 'female',
 'breed_type_name',
 'upto_one_and_half_years',
 'used_for_breeding_only',
 'used_for_agriculture_only',
 'agriculture_and_breeding',
 'bullock_cart_farm_operations',
 'under_one_ year_female',
 'one_to_two_and_half_years',
 'in_milk',
 'dry',
 'not_calved_once',
 'others',
 'total_male',
 'total_female',
 'total']

In [42]:
c2 = c2[l]

In [43]:
c2.to_csv(Path.joinpath(processed_data_path, "cattle_lgd.csv"), index=False)

In [44]:
c2.shape

(2776, 21)

In [45]:
cattle.shape

(2776, 19)

In [4]:
cattle = pd.read_csv(Path.joinpath(processed_data_path, "cattle_lgd.csv"))

In [5]:
cattle.drop(['state_dist'], axis = 1, inplace = True)
cattle.rename(columns={'St_LGD_code':'state_lgd_code', 'Dt_LGD_code':'district_lgd_code'}, inplace = True)

In [6]:
cattle.to_csv(Path.joinpath(processed_data_path, "cattle_lgd.csv"), index=False)

In [7]:
cattle

Unnamed: 0,state_name,district_name,state_lgd_code,district_lgd_code,female,breed_type_name,upto_one_and_half_years,used_for_breeding_only,used_for_agriculture_only,agriculture_and_breeding,bullock_cart_farm_operations,under_one_ year_female,one_to_two_and_half_years,in_milk,dry,not_calved_once,others,total_male,total_female,total
0,ANDAMAN AND NICOBAR ISLANDS,NICOBARS,35,603,0.0,Exotic,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,,40.0
1,ANDAMAN AND NICOBAR ISLANDS,NICOBARS,35,603,1.0,Exotic,,,,,,14.0,10.0,15.0,1.0,0.0,0.0,,40.0,40.0
2,ANDAMAN AND NICOBAR ISLANDS,NICOBARS,35,603,1.0,Indigenous(Desi),,,,,,337.0,362.0,458.0,255.0,64.0,22.0,,1498.0,1964.0
3,ANDAMAN AND NICOBAR ISLANDS,NICOBARS,35,603,0.0,Indigenous(Desi),169.0,109.0,45.0,75.0,10.0,,,,,,58.0,466.0,,1964.0
4,ANDAMAN AND NICOBAR ISLANDS,SOUTH ANDAMANS,35,602,1.0,Indigenous(Desi),,,,,,1947.0,1518.0,2214.0,1550.0,251.0,26.0,,7506.0,9808.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2771,WEST BENGAL,DINAJPUR UTTAR,19,311,0.0,Exotic,3285.0,300.0,800.0,109.0,96.0,,,,,,364.0,4954.0,,67854.0
2772,WEST BENGAL,PURULIA,19,321,0.0,Indigenous(Desi),76292.0,14435.0,366207.0,11247.0,23886.0,,,,,,9535.0,501602.0,,897199.0
2773,WEST BENGAL,PURULIA,19,321,0.0,Exotic,1372.0,275.0,728.0,224.0,335.0,,,,,,236.0,3170.0,,18075.0
2774,WEST BENGAL,PURULIA,19,321,1.0,Exotic,,,,,,3713.0,2597.0,6145.0,1461.0,647.0,342.0,,14905.0,18075.0
