In [61]:
import pandas as pd
from pathlib import Path

In [62]:
dir_path = Path.cwd()
raw_data_path = Path.joinpath(dir_path, "data", "raw")
interim_data_path = Path.joinpath(dir_path, "data", "interim")
ext_data_path = Path.joinpath(dir_path, "data", "external")

In [63]:
df = pd.read_stata(Path.joinpath(raw_data_path, "level8.dta"))
df.columns

Index(['common_id', 'State', 'Sector', 'District', 'Centre_code_Round',
       'FSU_Serial_No', 'Round', 'Schedule', 'Sample', 'NSS_Region', 'Stratum',
       'Sub_Stratum', 'Sub_Round', 'FOD_Sub_Region', 'Second_stage_stratum_no',
       'Sample_hhld_No', 'Visit_number', 'Level', 'Filler', 'Sl_no',
       'Sl_no_crop', 'Crop_code', 'Inputs_from_where_procur',
       'Inputs_qual_adeq_code', 'Inputs_paid_out_exp', 'Inputs_imputed_exp',
       'Blank', 'NSC', 'Multiplier', 'w'],
      dtype='object')

In [64]:
df.drop(['Centre_code_Round',
       'FSU_Serial_No', 'Round', 'Schedule', 'Sample', 'NSS_Region', 'Stratum',
       'Sub_Stratum', 'Sub_Round', 'FOD_Sub_Region', 'Second_stage_stratum_no',
       'Sample_hhld_No', 'Visit_number', 'Blank', 'Level', 'Filler', 'NSC'], axis = 1, inplace = True)

In [65]:
var_names = [x.lower() for x in df.columns]
df.columns = var_names

In [66]:
if df["common_id"].is_unique:
    print("Common ID is unique for Level 8")
else:
    print("Common ID not unique for Level 8")

Common ID not unique for Level 8


In [67]:
df

Unnamed: 0,common_id,state,sector,district,sl_no,sl_no_crop,crop_code,inputs_from_where_procur,inputs_qual_adeq_code,inputs_paid_out_exp,inputs_imputed_exp,multiplier,w
0,72227201,1,1,5,1,1.0,104.0,1.0,1.0,245.0,,170500,1705.0
1,72227201,1,1,5,6,,,1.0,1.0,890.0,,170500,1705.0
2,72227201,1,1,5,8,,,10.0,1.0,,1000.0,170500,1705.0
3,72227201,1,1,5,14,,,,,1000.0,800.0,170500,1705.0
4,72227201,1,1,5,15,,,,,1200.0,,170500,1705.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
340769,70112502,36,1,31,17,,,,,12000.0,,100500,1005.0
340770,70112502,36,1,31,18,,,,,12990.0,,100500,1005.0
340771,70112502,36,1,31,20,,,,,38000.0,,100500,1005.0
340772,70112502,36,1,31,21,,,,,4500.0,400.0,100500,1005.0


In [68]:
labels = pd.read_csv(Path.joinpath(ext_data_path, "crop_code_labels.csv"))

In [86]:
crop_list = list(labels['crop'])
code_list = list(labels['code'])

In [89]:
crop_labels = {}
for i in range(0, len(code_list)):
  c = crop_list[i].split(".")
  cr = c[0].split(" ")
  crop_labels[float(code_list[i])] = cr[0]

In [90]:
print(crop_labels)

{1.0: 'Cereals', 101.0: 'paddy', 102.0: 'jowar', 103.0: 'bajra', 104.0: 'maize', 105.0: 'ragi', 106.0: 'wheat', 107.0: 'barley', 108.0: 'small', 188.0: 'other', 2.0: 'Pulses', 201.0: 'gram', 202.0: 'tur', 203.0: 'urad', 204.0: 'moong', 205.0: 'masur', 206.0: 'horse', 207.0: 'beans', 208.0: 'peas', 288.0: 'other', 4.0: 'Sugar', 401.0: 'sugarcane', 402.0: 'palmvriah', 488.0: 'other', 5.0: 'Condiments', 501.0: 'pepper', 502.0: 'chillies', 503.0: 'ginger', 504.0: 'turmeric', 505.0: 'cardamom', 506.0: 'cardamom', 507.0: 'betel', 508.0: 'garlic', 509.0: 'coriander', 510.0: 'tamarind', 511.0: 'cumin', 512.0: 'fennel', 513.0: 'nutmeg', 514.0: 'fenugreek', 515.0: 'cloves', 516.0: 'cinnamon', 517.0: 'cocoa', 518.0: 'kacholam', 519.0: 'betelvine', 588.0: 'Other', 6.0: 'Fruits', 601.0: 'mangoes', 602.0: 'orange', 603.0: 'mosambi', 604.0: 'lemon', 605.0: 'othercitrous', 606.0: 'banana', 607.0: 'table', 608.0: 'wine', 609.0: 'apple', 610.0: 'pear', 611.0: 'peaches', 612.0: 'plum', 613.0: 'kiwi', 614

In [91]:
where_procure_labels = {1:'local market (incl. local traders)',2: 'APMC market', 3: 'input dealers', 4: 'cooperative', 5: 'Government agencies', 6: 'Farmer producer organisations (FPO)', 7: 'private processors', 8: 'contract farming sponsors/ companies', 10: 'own farm', 9: 'others'}
quality_labels = {1: 'good', 2: 'satisfactory', 3: 'poor', 4: 'dont know'}

In [92]:
df['input'] = df['sl_no']

In [93]:
input_labels = {1: 'seeds', 2: 'seeds', 3: 'seeds', 4: 'seeds', 5: 'seeds', 6: 'chemical fertilizers', 7: 'bio-fertilizers', 8:'manures', 9:'plant protection materials chemical', 10: 'plant protection materials bio-pesticides', 11: 'diesel', 12:'electricity', 13: 'irrigation', 14: 'labour human', 15: 'labour animal', 16: 'minor repair and maintenance of machinery and equipment used in crop production', 17: 'interest on loans utilised for the purpose of crop production', 18: 'cost of hiring of machinery and equipment for crop production', 19: 'cost of crop insurance', 20: 'lease rent for land used for crop production', 21: 'other expenses for crop production', 22: 'total'}

In [94]:
#for i in range(0, len((df[df['crop_code'].isnull()==0]['crop_code']))):
#    df[df['crop_code'].isnull()==0]['crop_code'][i] = int(df[df['crop_code'].isnull()==0]['crop_code'][i])

In [95]:
print(df[df['crop_code'].isnull()==0])

        common_id  state  sector  district  sl_no  sl_no_crop  crop_code  \
0        72227201      1       1         5      1         1.0      104.0   
8        72227202      1       1         5      1         1.0      104.0   
16       72227301      1       1         5      1         1.0      104.0   
17       72227301      1       1         5      2         2.0     1488.0   
24       72227302      1       1         5      1         1.0      104.0   
...           ...    ...     ...       ...    ...         ...        ...   
340743   70112501     36       1        31      2         2.0      101.0   
340744   70112501     36       1        31      3         3.0     1804.0   
340745   70112501     36       1        31      4         4.0      104.0   
340758   70112502     36       1        31      1         1.0     1101.0   
340759   70112502     36       1        31      2         2.0      101.0   

                  inputs_from_where_procur inputs_qual_adeq_code  \
0       local marke

In [97]:
df['input'] = df['input'].map(input_labels)
df['inputs_from_where_procur'] = df['inputs_from_where_procur'].map(where_procure_labels)
df['inputs_qual_adeq_code'] = df['inputs_qual_adeq_code'].map(quality_labels)
df['crop_code'] = df['crop_code'].map(crop_labels)

In [98]:
df


Unnamed: 0,common_id,state,sector,district,sl_no,sl_no_crop,crop_code,inputs_from_where_procur,inputs_qual_adeq_code,inputs_paid_out_exp,inputs_imputed_exp,multiplier,w,input
0,72227201,1,1,5,1,1.0,maize,,,245.0,,170500,1705.0,
1,72227201,1,1,5,6,,,,,890.0,,170500,1705.0,
2,72227201,1,1,5,8,,,,,,1000.0,170500,1705.0,
3,72227201,1,1,5,14,,,,,1000.0,800.0,170500,1705.0,
4,72227201,1,1,5,15,,,,,1200.0,,170500,1705.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340769,70112502,36,1,31,17,,,,,12000.0,,100500,1005.0,
340770,70112502,36,1,31,18,,,,,12990.0,,100500,1005.0,
340771,70112502,36,1,31,20,,,,,38000.0,,100500,1005.0,
340772,70112502,36,1,31,21,,,,,4500.0,400.0,100500,1005.0,


In [99]:
csv_path = Path.joinpath(interim_data_path, "level8.csv")
df.to_csv(csv_path, index=False)