## TZ Survey Analysis

In [1]:
# import necessary libraries
import os
import pandas as pd
import numpy as np

In [2]:
# Step 1: locate the raw data.

# Path of the folder holding the CSV files, given relative to the current
# working directory; the loading cell below reads every .csv file found in it.
folder_path = "./TZA_2020_NPS-R5_v02_M_CSV"

In [3]:
# Load every CSV file in the data folder into one dictionary of DataFrames.

# Keep only the directory entries whose name ends in ".csv"
csv_names = [name for name in os.listdir(folder_path) if name.endswith(".csv")]

# Key = file name without its extension, value = the parsed DataFrame
data_dict = {
    os.path.splitext(name)[0]: pd.read_csv(os.path.join(folder_path, name))
    for name in csv_names
}

In [4]:
# NOTE(review): this rebinds `pd` to modin only after the CSV files have already
# been read with plain pandas in the loading cell, so the frames stored in
# data_dict remain pandas objects and the method calls made on them in the cells
# shown here do not go through modin. Confirm whether modin is actually wanted;
# if so, the import belongs in the first cell, before any data is loaded.
import modin.pandas as pd

In [5]:
# Show the names of all loaded datasets
# (one key per CSV file: the file name without its .csv extension)
data_dict.keys()

dict_keys(['ag_sec_4b', 'ag_filters', 'ag_sec_12b', 'npsy5.panel.key', 'hh_sec_q1', 'ag_sec_4a', 'cm_sec_f_id', 'ag_sec_12a', 'hh_sec_q2', 'hh_sec_e3', 'hh_sec_e2', 'ag_sec_5a', 'hh_sec_e1', 'hh_sec_i2', 'ag_sec_5b', 'cm_sec_g', 'hh_sec_a', 'hh_sec_v', 'cm_sec_f', 'cm_sec_d', 'lf_sec_08', 'hh_sec_c', 'hh_sec_b', 'cm_sec_e', 'hh_sec_ja1', 'cm_sec_a', 'hh_sec_f', 'hh_sec_p', 'hh_sec_g', 'hh_sec_o2', 'ag_sec_3b', 'cm_sec_b', 'hh_sec_r', 'hh_sec_s', 'hh_sec_d', 'hh_sec_o1', 'ag_sec_3a', 'cm_sec_c', 'hh_sec_u2', 'lf_sec_02', 'hh_sec_i', 'hh_sec_h', 'lf_sec_03', 'ag_sec_11', 'hh_sec_k', 'ag_sec_10', 'npsy5.child.anthro', 'lf_sec_04', 'consumption_real_y5', 'consumption_real_y4', 'hh_sec_n', 'lf_sec_05', 'ag_sec_01', 'lf_sec_07', 'hh_sec_l', 'hh_sec_m', 'lf_sec_06', 'ag_sec_02', 'hh_sec_j3', 'ag_sec_6b', 'lf_sec_04a', 'hh_sec_j1', 'ag_sec_6a', 'ag_sec_7a', 'hh_sec_j4', 'ag_sec_7b', 'cm_sec_d2'])

From a review of the data, the important datasets appear to be:
1. hh_sec_a {y5_hhid}
2. hh_sec_b {y5_hhid, indidy5}
3. hh_sec_e2 (occupation data) {y5_hhid, indidy5}
4. hh_sec_e3 (firewood collection; indicates the source of energy) {y5_hhid, indidy5}
5. hh_sec_g (financial status) {y5_hhid, indidy5}
6. hh_sec_j1 (food consumed) {y5_hhid, itemcode}
7. hh_sec_m (household assets) {y5_hhid, itemcode}

In [6]:
# Build one wide table: start from hh_sec_a and left-join each further section in turn.
# Each step is (dataset name, join key(s), suffixes applied to overlapping column names).
# NOTE(review): hh_sec_j1 and hh_sec_m are joined on y5_hhid only; if they hold
# several rows per household, each of these joins repeats the rows accumulated
# so far for every matching row — confirm this row multiplication is intended.
merge_steps = [
    ('hh_sec_b', 'y5_hhid', ('_sx_a', '_sx_b')),
    ('hh_sec_e2', ['y5_hhid', 'indidy5'], ('_sx_ab', '_sx_e2')),
    ('hh_sec_e3', ['y5_hhid', 'indidy5'], ('_sx_abe2', '_sx_e3')),
    ('hh_sec_g', ['y5_hhid', 'indidy5'], ('_sx_abe2e3', '_sx_g')),
    ('hh_sec_j1', 'y5_hhid', ('_sx_abe2e3g', '_sx_j1')),
    ('hh_sec_m', 'y5_hhid', ('_sx_abe2e3gj1', '_sx_m')),
]

hh_merged = data_dict['hh_sec_a']
for section_name, join_keys, suffix_pair in merge_steps:
    # Same key names on both sides, so a single `on=` expresses the join
    hh_merged = hh_merged.merge(
        data_dict[section_name],
        how='left',
        on=join_keys,
        suffixes=suffix_pair,
    )

In [None]:
# Cleaning: drop columns that carry no usable information.
#
# Missing values in these frames are real NaN objects (pd.read_csv turns empty
# and "NaN" fields into NaN by default, and the left merges insert NaN for
# unmatched rows). They therefore have to be detected with isna(); comparing
# against the string 'NaN' alone never matches them.

# 1. Columns that are missing in every row.
hh_merged_nona = hh_merged.dropna(axis=1, how='all')
# Also drop columns that hold only the literal string 'NaN', should any exist.
nan_columns = hh_merged_nona.columns[hh_merged_nona.eq('NaN').all()]
hh_merged_nona = hh_merged_nona.drop(columns=nan_columns)

# 2. Columns in which every row is masked as '**CONFIDENTIAL**'.
confidential_columns = hh_merged_nona.columns[hh_merged_nona.eq('**CONFIDENTIAL**').all()]
hh_merged_nona_nocon = hh_merged_nona.drop(columns=confidential_columns)

# 3. Columns whose values are all either missing or '**CONFIDENTIAL**'.
#    Steps 1 and 2 already removed the pure cases, so what matches here is the
#    mix of the two (e.g. a masked column with NaN on unmatched merge rows).
is_missing = hh_merged_nona_nocon.isna() | hh_merged_nona_nocon.eq('NaN')
is_confidential = hh_merged_nona_nocon.eq('**CONFIDENTIAL**')
columns_to_drop = hh_merged_nona_nocon.columns[(is_missing | is_confidential).all()].tolist()

df_cleaned = hh_merged_nona_nocon.drop(columns=columns_to_drop)

In [None]:
# Check whether column 'hh_a03_1' contains any missing value.
# `np.nan in <array>` relies on ==, and NaN never compares equal to itself, so
# that membership test is always False; isna() detects missing values correctly.
bool(hh_merged_nona_nocon['hh_a03_1'].isna().any())

In [None]:
# Display the cleaned frame as the cell's output
df_cleaned