In [55]:
from pathlib import Path
import pandas as pd

In [56]:
dir_path = Path.cwd()
raw_data_path = Path.joinpath(dir_path, "data", "raw")
interim_data_path = Path.joinpath(dir_path, "data", "interim")
ext_data_path = Path.joinpath(dir_path, "data", "external")

In [57]:
df = pd.read_stata(Path.joinpath(raw_data_path, "level6.dta"))
df.columns

Index(['common_id', 'State', 'Sector', 'District', 'Centre_code_Round',
       'FSU_Serial_No', 'Round', 'Schedule', 'Sample', 'NSS_Region', 'Stratum',
       'Sub_Stratum', 'Sub_Round', 'FOD_Sub_Region', 'Second_stage_stratum_no',
       'Sample_hhld_No', 'Visit_number', 'Level', 'Filler', 'Sl_No',
       'Crop_code', 'Unit_code', 'area_of_irri_land',
       'produced_from_irri_land', 'area_of_un_irri_land',
       'prod_from_un_irri_land', 'total_quant', 'land_under_pre_harvestsale',
       'major_disp_sold', 'satisfied_sale_outcome', 'major_disp_quant_sold',
       'major_disp_sale_value', 'other_disp_quant_sold', 'NSC', 'Multiplier',
       'w'],
      dtype='object')

In [58]:
df.drop(['Centre_code_Round',
       'FSU_Serial_No', 'Round', 'Schedule', 'Sample', 'NSS_Region', 'Stratum',
       'Sub_Stratum', 'Sub_Round', 'FOD_Sub_Region', 'Second_stage_stratum_no',
       'Sample_hhld_No', 'Visit_number', 'Level', 'Filler', 'NSC'], axis = 1, inplace = True)

In [59]:
df.columns

Index(['common_id', 'State', 'Sector', 'District', 'Sl_No', 'Crop_code',
       'Unit_code', 'area_of_irri_land', 'produced_from_irri_land',
       'area_of_un_irri_land', 'prod_from_un_irri_land', 'total_quant',
       'land_under_pre_harvestsale', 'major_disp_sold',
       'satisfied_sale_outcome', 'major_disp_quant_sold',
       'major_disp_sale_value', 'other_disp_quant_sold', 'Multiplier', 'w'],
      dtype='object')

In [60]:
var_names = [x.lower() for x in df.columns]
df.columns = var_names

In [61]:
if df["common_id"].is_unique:
    print("Common ID is unique for Level 6")
else:
    print("Common ID not unique for Level 6")

Common ID not unique for Level 6


In [62]:
len(pd.unique(df['common_id']))

41666

In [63]:
df['common_id']

0         72227201
1         72227201
2         72227202
3         72227202
4         72227301
            ...   
109915    70112501
109916    70112501
109917    70112502
109918    70112502
109919    70112502
Name: common_id, Length: 109920, dtype: int32

In [64]:
df

Unnamed: 0,common_id,state,sector,district,sl_no,crop_code,unit_code,area_of_irri_land,produced_from_irri_land,area_of_un_irri_land,prod_from_un_irri_land,total_quant,land_under_pre_harvestsale,major_disp_sold,satisfied_sale_outcome,major_disp_quant_sold,major_disp_sale_value,other_disp_quant_sold,multiplier,w
0,72227201,1,1,5,1,104.0,1.0,,,0.37,400.0,400.0,,1.0,1.0,50.0,900.0,,170500,1705.0
1,72227201,1,1,5,9,9999.0,,,,0.37,,,,,,,,,170500,1705.0
2,72227202,1,1,5,1,104.0,1.0,,,0.37,250.0,250.0,,1.0,2.0,100.0,1800.0,,170500,1705.0
3,72227202,1,1,5,9,9999.0,,,,0.37,,,,,,,,,170500,1705.0
4,72227301,1,1,5,1,104.0,1.0,,,0.50,200.0,200.0,,1.0,2.0,150.0,2700.0,,403000,4030.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109915,70112501,36,1,31,4,104.0,1.0,4.0,7500.0,,,7500.0,,1.0,1.0,7500.0,131250.0,,100500,1005.0
109916,70112501,36,1,31,9,9999.0,,20.5,,,,,5.0,,,,,,100500,1005.0
109917,70112502,36,1,31,1,1101.0,1.0,4.0,1640.0,,,1640.0,,1.0,1.0,1640.0,85280.0,,100500,1005.0
109918,70112502,36,1,31,2,101.0,1.0,1.5,2850.0,,,2850.0,,1.0,2.0,2150.0,26450.0,,100500,1005.0


In [65]:
labels = pd.read_csv(Path.joinpath(ext_data_path, "crop_code_labels.csv"))

In [66]:
crop_list = list(labels['crop'])
code_list = list(labels['code'])

In [67]:
crop_labels = {}

In [68]:
for i in range(0, len(code_list)):
  c = crop_list[i].split(".")
  cr = c[0].split(" ")
  crop_labels[code_list[i]] = cr[0]

In [69]:
unit_labels = {1: 'kg', 2:'number'}
maj_disp_labels = {1:'local market (incl. local traders)', 2: 'APMC market', 3: 'input dealers', 4: 'cooperative', 5: 'Government agencies' , 6: 'Farmer producer organisations (FPO)', 7: 'private processors', 8: 'contract farming sponsors/ companies', 9: 'others'}
satisfactory_labels = {1: 'satisfactory', 2: 'not satisfactory: lower than market price', 3: 'delayed payments', 4: 'deductions for loans borrowed', 5: 'faulty weighing and grading', 9: 'other cause of dissatisfaction'}

In [74]:
df.columns

Index(['common_id', 'state', 'sector', 'district', 'sl_no', 'crop_code',
       'unit_code', 'area_of_irri_land', 'produced_from_irri_land',
       'area_of_un_irri_land', 'prod_from_un_irri_land', 'total_quant',
       'land_under_pre_harvestsale', 'major_disp_sold',
       'satisfied_sale_outcome', 'major_disp_quant_sold',
       'major_disp_sale_value', 'other_disp_quant_sold', 'multiplier', 'w'],
      dtype='object')

In [75]:
print(len(pd.unique(df['unit_code'])))
print(len(pd.unique(df['major_disp_sold'])))
print(len(pd.unique(df['satisfied_sale_outcome'])))
print(len(pd.unique(df['crop_code'])))

3
10
7
118


In [76]:
df['crop_code'] = df['crop_code'].map(crop_labels)
df['unit_code'] = df['unit_code'].map(unit_labels)
df['major_disp_sold'] = df['major_disp_sold'].map(maj_disp_labels)
df['satisfied_sale_outcome'] = df['satisfied_sale_outcome'].map(satisfactory_labels)

In [77]:
df

Unnamed: 0,common_id,state,sector,district,sl_no,crop_code,unit_code,area_of_irri_land,produced_from_irri_land,area_of_un_irri_land,prod_from_un_irri_land,total_quant,land_under_pre_harvestsale,major_disp_sold,satisfied_sale_outcome,major_disp_quant_sold,major_disp_sale_value,other_disp_quant_sold,multiplier,w
0,72227201,1,1,5,1,,,,,0.37,400.0,400.0,,,,50.0,900.0,,170500,1705.0
1,72227201,1,1,5,9,,,,,0.37,,,,,,,,,170500,1705.0
2,72227202,1,1,5,1,,,,,0.37,250.0,250.0,,,,100.0,1800.0,,170500,1705.0
3,72227202,1,1,5,9,,,,,0.37,,,,,,,,,170500,1705.0
4,72227301,1,1,5,1,,,,,0.50,200.0,200.0,,,,150.0,2700.0,,403000,4030.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109915,70112501,36,1,31,4,,,4.0,7500.0,,,7500.0,,,,7500.0,131250.0,,100500,1005.0
109916,70112501,36,1,31,9,,,20.5,,,,,5.0,,,,,,100500,1005.0
109917,70112502,36,1,31,1,,,4.0,1640.0,,,1640.0,,,,1640.0,85280.0,,100500,1005.0
109918,70112502,36,1,31,2,,,1.5,2850.0,,,2850.0,,,,2150.0,26450.0,,100500,1005.0


In [78]:
csv_path = Path.joinpath(interim_data_path, "level6.csv")
df.to_csv(csv_path, index=False)