In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Locate the raw and interim data folders relative to the current working directory.
dir_path = Path.cwd()
raw_data_path = dir_path / "data" / "raw"
interim_data_path = dir_path / "data" / "interim"

In [3]:
# Read the Level 7 Stata file from the raw data folder, then list its columns.
df = pd.read_stata(raw_data_path / "level7.dta")
df.columns

Index(['common_id', 'State', 'Sector', 'District', 'Schedule_ID',
       'FSU_Serial_No', 'Round', 'Schedule', 'Sample', 'NSS_Region', 'Stratum',
       'Sub_Stratum', 'Sub_Round', 'FOD_Sub_Region', 'Second_stage_stratum_no',
       'Sample_hhld_No', 'Visit_number', 'Level', 'Filler', 'Serial_no',
       'Owned_Area_000sqm', 'Owned_Value_Rs', 'Blank', 'NSC', 'Multiplier',
       'w'],
      dtype='object')

In [4]:
# Drop the columns that are not carried forward into the interim file.
# `drop(columns=...)` returns a new frame that is rebound to `df`, rather than
# mutating the loaded frame through `inplace=True`.
df = df.drop(
    columns=[
        'FSU_Serial_No', 'Round', 'Schedule', 'Sample', 'NSS_Region', 'Stratum',
        'Sub_Stratum', 'Sub_Round', 'FOD_Sub_Region', 'Second_stage_stratum_no',
        'Sample_hhld_No', 'Visit_number', 'Level', 'Filler', 'Blank', 'NSC',
    ]
)

In [5]:
# Display the frame (pandas truncates the view) to check which columns remain after the drop.
df

Unnamed: 0,common_id,State,Sector,District,Schedule_ID,Serial_no,Owned_Area_000sqm,Owned_Value_Rs,Multiplier,w
0,53335201,1,1,5,771,1,62.14,415000,31000,310.0
1,53335201,1,1,5,771,4,16.55,24000,31000,310.0
2,53335201,1,1,5,771,10,,439000,31000,310.0
3,53335301,1,1,5,771,1,75.87,425000,241800,2418.0
4,53335301,1,1,5,771,4,18.15,18500,241800,2418.0
...,...,...,...,...,...,...,...,...,...,...
236868,76272501,36,2,31,771,9,,30000,93500,935.0
236869,76272501,36,2,31,771,10,,1385000,93500,935.0
236870,76272502,36,2,31,771,1,80.00,1950000,93500,935.0
236871,76272502,36,2,31,771,9,,45000,93500,935.0


In [6]:
# Normalise every column name to lower case; `var_names` holds the new names as a list.
var_names = list(map(str.lower, df.columns))
df.columns = var_names

In [7]:
# Report whether common_id alone identifies a row of this frame.
common_id_message = (
    "Common ID is unique for Level 7"
    if df["common_id"].is_unique
    else "Common ID not unique for Level 7"
)
print(common_id_message)

Common ID not unique for Level 7


In [8]:
# List the column names to confirm they are now lower case.
df.columns

Index(['common_id', 'state', 'sector', 'district', 'schedule_id', 'serial_no',
       'owned_area_000sqm', 'owned_value_rs', 'multiplier', 'w'],
      dtype='object')

In [9]:
# Start building_type as a copy of the serial_no codes; a later cell replaces them with descriptive labels.
df['building_type'] = df['serial_no']

In [10]:
# Lookup table from the integer codes 1-10 to the descriptive label of each building category.
# Code 10 is labelled 'total'.
building_labels = {
    1: 'residential building - used as dwelling by household members',
    2: 'residential building - other residential building within the village/town',
    3: 'residential building - other residential building outside the village/town',
    4: 'building used for farm business - animal shed',
    5: 'building used for farm business - others such as barn, warehouse (incl. cold storage), farm house, etc',
    6: 'building used for non-farm business(workplace, workshop, mfg. unit, shop, etc.)',
    7: 'building for other purposes (charitable, recreational like cinema hall, temple etc.)',
    8: 'work-in-progress (structure under construction)',
    9: 'other constructions (well, borewell, tubewell, field distribution system, etc.)',
    10: 'total',
}

In [11]:
# Translate each serial_no code into its descriptive label.
# Mapping from serial_no (rather than from building_type itself) keeps this cell
# idempotent: re-running it once the codes have already been replaced by labels
# would otherwise look the labels up in the dict and turn every value into NaN.
df['building_type'] = df['serial_no'].map(building_labels)

In [12]:
# Display the frame to check the new building_type column next to serial_no.
df

Unnamed: 0,common_id,state,sector,district,schedule_id,serial_no,owned_area_000sqm,owned_value_rs,multiplier,w,building_type
0,53335201,1,1,5,771,1,62.14,415000,31000,310.0,residential building - used as dwelling by hou...
1,53335201,1,1,5,771,4,16.55,24000,31000,310.0,building used for farm business - animal shed
2,53335201,1,1,5,771,10,,439000,31000,310.0,total
3,53335301,1,1,5,771,1,75.87,425000,241800,2418.0,residential building - used as dwelling by hou...
4,53335301,1,1,5,771,4,18.15,18500,241800,2418.0,building used for farm business - animal shed
...,...,...,...,...,...,...,...,...,...,...,...
236868,76272501,36,2,31,771,9,,30000,93500,935.0,"other constructions (well, borewell, tubewell,..."
236869,76272501,36,2,31,771,10,,1385000,93500,935.0,total
236870,76272502,36,2,31,771,1,80.00,1950000,93500,935.0,residential building - used as dwelling by hou...
236871,76272502,36,2,31,771,9,,45000,93500,935.0,"other constructions (well, borewell, tubewell,..."


In [13]:
# Write the frame to the interim data folder as CSV, without the row index.
csv_path = interim_data_path / "level7.csv"
# `to_csv` does not create missing folders, so make sure the target folder exists first.
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)