In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# All data folders are resolved relative to the directory the notebook is run from.
dir_path = Path.cwd()
data_root = dir_path / "data"
raw_data_path = data_root / "raw"            # untouched source files
interim_data_path = data_root / "interim"    # outputs written by this notebook
ext_data_path = data_root / "external"       # lookup tables such as crop labels

In [4]:
# Load the raw level-7 Stata file and list the columns it provides.
level7_dta_path = raw_data_path / "level7.dta"
df = pd.read_stata(level7_dta_path)
df.columns

Index(['common_id', 'State', 'Sector', 'District', 'Centre_code_Round',
       'FSU_Serial_No', 'Round', 'Schedule', 'Sample', 'NSS_Region', 'Stratum',
       'Sub_Stratum', 'Sub_Round', 'FOD_Sub_Region', 'Second_stage_stratum_no',
       'Sample_hhld_No', 'Visit_number', 'Level', 'Filler', 'Sl_no',
       'Crop_code', 'other_disp_sale_value', 'all_disp_quant_sold',
       'all_disp_sale_value', 'Rate_Rs', 'value_of_pre_harvest_sale',
       'value_of_harvested_produce', 'value_of_by_products', 'total_value',
       'Blank', 'NSC', 'Multiplier', 'w'],
      dtype='object')

In [5]:
# Columns that are not carried forward into the interim file.
unused_columns = [
    'Centre_code_Round', 'FSU_Serial_No', 'Round', 'Schedule', 'Sample',
    'NSS_Region', 'Stratum', 'Sub_Stratum', 'Sub_Round', 'FOD_Sub_Region',
    'Second_stage_stratum_no', 'Sample_hhld_No', 'Visit_number', 'Blank',
    'Level', 'Filler', 'NSC',
]
# Reassign instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError. Either way the
# postcondition is the same: none of these columns remain in df.
df = df.drop(columns=unused_columns, errors="ignore")

In [6]:
# Normalise every column name to lower case.
var_names = list(map(str.lower, df.columns))
df.columns = var_names

In [7]:
# Report whether common_id identifies exactly one row in this level.
uniqueness_msg = (
    "Common ID is unique for Level 7"
    if df["common_id"].is_unique
    else "Common ID not unique for Level 7"
)
print(uniqueness_msg)

Common ID not unique for Level 7


In [8]:
# Show the frame after the column drop and lower-casing; the notebook renders a truncated view.
df

Unnamed: 0,common_id,state,sector,district,sl_no,crop_code,other_disp_sale_value,all_disp_quant_sold,all_disp_sale_value,rate_rs,value_of_pre_harvest_sale,value_of_harvested_produce,value_of_by_products,total_value,multiplier,w
0,72227201,1,1,5,1,104.0,,50.0,900.0,18.0,,7200.0,3000.0,10200.0,170500,1705.0
1,72227201,1,1,5,9,9999.0,,,,,,7200.0,3000.0,10200.0,170500,1705.0
2,72227202,1,1,5,1,104.0,,100.0,1800.0,18.0,,4500.0,1800.0,6300.0,170500,1705.0
3,72227202,1,1,5,9,9999.0,,,,,,4500.0,1800.0,6300.0,170500,1705.0
4,72227301,1,1,5,1,104.0,,150.0,2700.0,18.0,,3600.0,2200.0,5800.0,403000,4030.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109915,70112501,36,1,31,4,104.0,,7500.0,131250.0,17.5,,131250.0,,131250.0,100500,1005.0
109916,70112501,36,1,31,9,9999.0,,,,,300000.0,427025.0,4500.0,731525.0,100500,1005.0
109917,70112502,36,1,31,1,1101.0,,1640.0,85280.0,52.0,,85280.0,,85280.0,100500,1005.0
109918,70112502,36,1,31,2,101.0,,2150.0,26450.0,12.3,,35055.0,2000.0,37055.0,100500,1005.0


In [9]:
# Lookup table of crop codes and crop names.
crop_labels_csv = ext_data_path / "crop_code_labels.csv"
labels = pd.read_csv(crop_labels_csv)

In [10]:
# Pull the two lookup columns out as plain Python lists (same row order in both).
crop_list = labels["crop"].tolist()
code_list = labels["code"].tolist()

In [11]:
# Mapping of crop code -> short crop label; filled in by the next cell.
crop_labels = dict()

In [12]:
# Reduce each crop name to the first word of the text before its first "."
# and register it under the matching code.
crop_labels.update(
    (code, crop_name.split(".")[0].split(" ")[0])
    for code, crop_name in zip(code_list, crop_list)
)

In [13]:
# Re-list the column names before crop_code is relabelled.
df.columns

Index(['common_id', 'state', 'sector', 'district', 'sl_no', 'crop_code',
       'other_disp_sale_value', 'all_disp_quant_sold', 'all_disp_sale_value',
       'rate_rs', 'value_of_pre_harvest_sale', 'value_of_harvested_produce',
       'value_of_by_products', 'total_value', 'multiplier', 'w'],
      dtype='object')

In [14]:
# Replace each crop code with its text label; Series.map yields NaN for any
# code that has no entry in crop_labels.
crop_code_labelled = df['crop_code'].map(crop_labels)
df['crop_code'] = crop_code_labelled

In [15]:
# Show the frame again, now with crop_code holding text labels (empty where no label matched).
df

Unnamed: 0,common_id,state,sector,district,sl_no,crop_code,other_disp_sale_value,all_disp_quant_sold,all_disp_sale_value,rate_rs,value_of_pre_harvest_sale,value_of_harvested_produce,value_of_by_products,total_value,multiplier,w
0,72227201,1,1,5,1,maize,,50.0,900.0,18.0,,7200.0,3000.0,10200.0,170500,1705.0
1,72227201,1,1,5,9,,,,,,,7200.0,3000.0,10200.0,170500,1705.0
2,72227202,1,1,5,1,maize,,100.0,1800.0,18.0,,4500.0,1800.0,6300.0,170500,1705.0
3,72227202,1,1,5,9,,,,,,,4500.0,1800.0,6300.0,170500,1705.0
4,72227301,1,1,5,1,maize,,150.0,2700.0,18.0,,3600.0,2200.0,5800.0,403000,4030.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109915,70112501,36,1,31,4,maize,,7500.0,131250.0,17.5,,131250.0,,131250.0,100500,1005.0
109916,70112501,36,1,31,9,,,,,,300000.0,427025.0,4500.0,731525.0,100500,1005.0
109917,70112502,36,1,31,1,cotton,,1640.0,85280.0,52.0,,85280.0,,85280.0,100500,1005.0
109918,70112502,36,1,31,2,paddy,,2150.0,26450.0,12.3,,35055.0,2000.0,37055.0,100500,1005.0


In [16]:
# Write the cleaned level-7 table to the interim data folder.
csv_path = interim_data_path / "level7.csv"
# Create data/interim if it is missing (e.g. on a fresh checkout); without this
# to_csv fails because the target directory does not exist.
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)