In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Data folders are resolved from the current working directory, so the
# notebook must be started from the directory that contains "data/".
dir_path = Path.cwd()
raw_data_path = dir_path / "data" / "raw"
interim_data_path = dir_path / "data" / "interim"

In [3]:
# Read the raw Stata file into a DataFrame, then show its column names.
df = pd.read_stata(raw_data_path / "level2.dta")
df.columns

Index(['common_id', 'State', 'Sector', 'District', 'Schedule_ID',
       'FSU_Serial_No', 'Round', 'Schedule', 'Sample', 'NSS_Region', 'Stratum',
       'Sub_Stratum', 'Sub_Round', 'FOD_Sub_Region', 'Second_stage_stratum_no',
       'Sample_hhld_No', 'Visit_number', 'Level', 'Filler', 'Person_serial_no',
       'Relation_to_head', 'Gender', 'Age', 'Highest_edu_level_attained',
       'Deposit_acc_in_CB_RRB_CoOp', 'Deposit_acc_in_Post_Office',
       'Deposit_acc_in_NBFC', 'Contributing_to_CoOp_SHG_JLG',
       'Deposit_in_non_instt_Agency', 'Owns_any_land',
       'Owns_any_agricultural_land', 'Holding_a_credit_debit_card',
       'CrDebitCard_during_last_365days', 'Having_an_ewallet',
       'Ewallet_during_last_365days', 'Blank', 'NSC', 'Multiplier', 'w'],
      dtype='object')

In [4]:
# Remove the columns that the rest of this notebook does not use.
# `df` is rebound to the result instead of using inplace=True, so the cell
# reads as an ordinary assignment and can be chained with other steps.
df = df.drop(
    columns=[
        'FSU_Serial_No', 'Round', 'Schedule', 'Sample', 'NSS_Region',
        'Stratum', 'Sub_Stratum', 'Sub_Round', 'FOD_Sub_Region',
        'Second_stage_stratum_no', 'Sample_hhld_No', 'Visit_number',
        'Level', 'Filler', 'Blank', 'NSC',
    ]
)

In [5]:
# Normalise every column name to lower case.
var_names = list(map(str.lower, df.columns))
df.columns = var_names

In [6]:
# Report whether common_id on its own identifies a row of this table.
print(
    "Common ID is unique for Level 2"
    if df["common_id"].is_unique
    else "Common ID not unique for Level 2"
)

Common ID not unique for Level 2


In [7]:
# Dimensions (rows, columns) of the frame after the column drop above.
df.shape

(495573, 23)

In [8]:
# Text labels for the relation_to_head codes; the codes are the consecutive
# integers 1-9, assigned in the order listed here.
head_rel_labels = dict(
    enumerate(
        [
            'self',
            'spouse of head',
            'married child',
            'spouse of married child',
            'unmarried child',
            'grand child',
            'father/mother/ father-in-law/ mother-in-law',
            'brother / sister/ brother-in-law/ sister-in-law/other relatives',
            'servant/employees/ other non-relatives',
        ],
        start=1,
    )
)

In [9]:
# Text labels for the gender codes. These strings are written into the
# exported data, so they must be spelled correctly.
gender_labels = {
    1: 'male',
    2: 'female',
    3: 'transgender',
}

In [10]:
# Text labels for the highest_edu_level_attained codes.
# Codes 1-8 are consecutive; this mapping has no entry for code 9, so a value
# of 9 would become NaN when the mapping is applied with Series.map.
education_labels = dict(
    enumerate(
        [
            'not literate',
            'literate: below primary',
            'primary',
            'upper primary/middle',
            'secondary',
            'higher secondary',
            'diploma /certificate course (upto secondary)',
            'diploma/certificate course (higher secondary)',
        ],
        start=1,
    )
)
education_labels.update(
    {
        10: 'diploma/certificate course(graduation & above)',
        11: 'graduate',
        12: 'post graduate and above',
    }
)

In [11]:
# Text labels for the deposit_acc_in_cb_rrb_coop codes (consecutive from 1).
bank_labels = dict(
    enumerate(
        [
            'yes with banking services taken only from bank branch',
            'yes with banking services taken only from bank mitra',
            'yes with banking services taken from bank branch & bank mitra',
            'no account',
        ],
        start=1,
    )
)

In [12]:
# Replace the numeric codes in four columns with their text labels.
# Series.map yields NaN for any value that is not a key of the dictionary,
# so running this cell a second time on already-labelled data blanks them out.
df = df.assign(
    relation_to_head=df['relation_to_head'].map(head_rel_labels),
    gender=df['gender'].map(gender_labels),
    highest_edu_level_attained=df['highest_edu_level_attained'].map(education_labels),
    deposit_acc_in_cb_rrb_coop=df['deposit_acc_in_cb_rrb_coop'].map(bank_labels),
)

In [13]:
# Display the labelled frame; pandas abbreviates large frames in the output.
df

Unnamed: 0,common_id,state,sector,district,schedule_id,person_serial_no,relation_to_head,gender,age,highest_edu_level_attained,...,contributing_to_coop_shg_jlg,deposit_in_non_instt_agency,owns_any_land,owns_any_agricultural_land,holding_a_credit_debit_card,crdebitcard_during_last_365days,having_an_ewallet,ewallet_during_last_365days,multiplier,w
0,53335201,1,1,5,771,1,self,male,65,not literate,...,2,2,1,1.0,2.0,,2.0,,31000,310.0
1,53335201,1,1,5,771,2,spouse of head,female,60,not literate,...,2,2,2,,2.0,,2.0,,31000,310.0
2,53335201,1,1,5,771,3,unmarried child,male,35,upper primary/middle,...,2,2,2,,2.0,,2.0,,31000,310.0
3,53335201,1,1,5,771,4,unmarried child,male,29,upper primary/middle,...,2,2,2,,2.0,,2.0,,31000,310.0
4,53335201,1,1,5,771,5,unmarried child,female,23,not literate,...,2,2,2,,2.0,,2.0,,31000,310.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495568,76272601,36,2,31,771,4,unmarried child,female,7,literate: below primary,...,2,2,2,,,,,,149600,1496.0
495569,76272602,36,2,31,771,1,self,male,49,graduate,...,2,2,2,,2.0,,2.0,,149600,1496.0
495570,76272602,36,2,31,771,2,spouse of head,female,44,higher secondary,...,2,2,2,,2.0,,2.0,,149600,1496.0
495571,76272602,36,2,31,771,3,unmarried child,female,14,upper primary/middle,...,2,2,2,,,,,,149600,1496.0


In [14]:
# Save the labelled data as level2.csv in the interim data folder, omitting
# the row index.
csv_path = interim_data_path / "level2.csv"
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no effect when it is already there).
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)