# Accessing Demographic Data
- [Uganda Bureau of Statistics (UBOS) Website](https://www.ubos.org/)
- [Dataset Repository](https://www.ubos.org/explore-statistics/0/)

In [None]:
#importing relevant packages
%run /Users/thomasadler/Desktop/Capstone_Docs/packages.py

We have manually chosen the relevant columns from the Ugandan Statistics datasets for multiple reasons:


- each of the four regions has its own dataset, with inconsistent naming of columns and sheets
- there are multiple sheets within each Excel workbook, each with an inconsistent number of index columns
- manually choosing an initial set of relevant variables is easier when the full column names can be seen in an Excel workbook and compared with other columns
- the number of data points is manageable in Excel, so there is no risk of the data being truncated
- only minimal manual processing has been carried out in Excel, to avoid introducing mistakes

In [None]:
#defining the directory that holds the raw demographic workbooks
#NOTE(review): absolute, machine-specific path - adjust before running on another machine
filepath = '/Users/thomasadler/Desktop/Capstone_Docs/raw_demographic/'

In [None]:
#names of our four regions, used to build the workbook file names
regions_df = [
    'eastern',
    'western',
    'central',
    'northern',
]

In [None]:
#loading our four datasets, one workbook per region, in the order of the region names
#read_excel already returns a DataFrame, so the result is not wrapped in pd.DataFrame again
eastern_df, western_df, central_df, northern_df = [
    pd.read_excel(f'{filepath}{region}_demographic.xlsx')
    for region in regions_df
]

In [None]:
#collecting the four regional dataframes in one list
#from here on regions_df holds the dataframes themselves
regions_df = [
    eastern_df,
    western_df,
    central_df,
    northern_df,
]

In [None]:
#stacking the four regional datasets on top of each other, matching columns by name
#ignore_index=True gives the combined frame a fresh 0..n-1 row index; otherwise every
#region keeps its own row labels and the same label shows up once per region
demographic = pd.concat(regions_df, axis=0, ignore_index=True)

#check
demographic.head()

In [None]:
#work on an independent (deep) copy so the raw combined dataset stays untouched
total_demographic = demographic.copy(deep=True)

In [None]:
#check the first five rows of the working copy
total_demographic.head(5)

In [None]:
#check column names, non-null counts and datatypes
total_demographic.info()

In [None]:
#share of missing values in each column (mean of the True/False missing mask)
missing_mask = total_demographic.isnull()
missing_mask.mean()

We fill the missing values of a parish with the average of its subcounty. Because the number of missing values is low, we assume that this average is a good representation of the situation in the subcounty.

In [None]:
#parish-level columns whose gaps are filled with the mean of the parish's subcounty
fill_columns = [
    'perc_hh_temp_dwelling',
    'perc_pop_5km_dist_primary',
    'perc_hh_piped_water',
    'perc_hh_borehole',
    'perc_hh_toilet',
]

for col in fill_columns:
    #mean of the column within each subcounty, repeated on every row of that subcounty
    subcounty_mean = total_demographic.groupby('subcounty')[col].transform('mean')
    #only the missing entries are replaced; existing values are kept
    total_demographic[col] = total_demographic[col].fillna(subcounty_mean)

In [None]:
#check how many missing values are left in each column
total_demographic.isnull().sum()

Some parishes still have missing values because their whole subcounty has no values for that column. For these, we fill the missing values with the country's average.

In [None]:
#columns that still have gaps after the subcounty fill
fill_columns_2 = [
    'perc_hh_temp_dwelling',
    'perc_hh_toilet',
]

for col in fill_columns_2:
    #country average = mean of the column over all parish rows
    country_mean = total_demographic[col].mean()
    total_demographic[col] = total_demographic[col].fillna(country_mean)

In [None]:
#check no more null values: the total count over all columns must be zero
remaining_nulls = total_demographic.isnull().sum().sum()
remaining_nulls == 0

In [None]:
#check duplicated rows: True if at least one row repeats an earlier one
duplicated_rows = total_demographic.duplicated()
duplicated_rows.sum() > 0

In [None]:
#check duplicated columns: transpose so that columns become rows, then look for repeats
duplicated_columns = total_demographic.transpose().duplicated()
duplicated_columns.sum() > 0

We have information for each parish, but we need data for each subcounty. We use each parish's share of its subcounty's population as its weight, and then calculate a population-weighted average of the parishes to get the subcounty values.

In [None]:
#population of each subcounty = sum of the populations of its parishes
parish_population = total_demographic[['subcounty', 'tot_pop_parish']]
tot_pop_subcounty = parish_population.groupby('subcounty').sum()

#the summed column now holds subcounty totals, so name it accordingly
tot_pop_subcounty = tot_pop_subcounty.rename(columns={'tot_pop_parish': 'tot_pop_subcounty'})

#check
tot_pop_subcounty

In [None]:
#add the subcounty population to our main dataframe
#left join on 'subcounty' so every parish row is kept and receives its subcounty total
total_demographic = pd.merge(
    total_demographic,
    tot_pop_subcounty,
    how='left',
    on='subcounty',
)

#check
total_demographic.head(5)

In [None]:
#calculate weight of parish in each subcounty (its share of the subcounty's population)
total_demographic['weight'] = total_demographic['tot_pop_parish'] / total_demographic['tot_pop_subcounty']

#check that the weights of every single subcounty add up to one
#the sums are floats, so they are compared within a small tolerance instead of with ==;
#checking per subcounty also avoids errors in two subcounties cancelling each other out
subcounty_weight_sums = total_demographic.groupby('subcounty')['weight'].sum()
((subcounty_weight_sums - 1).abs() < 1e-9).all()

In [None]:
#multiply all values by the weight of the parish and write them back into the dataframe
#columns are selected by position: the first two and the last one are left out
#NOTE(review): presumably the two leading columns are identifiers and the last is 'weight' - confirm
weighted_values = total_demographic.iloc[:, 2:-1].mul(total_demographic['weight'], axis=0)
total_demographic.iloc[:, 2:-1] = weighted_values

#check
total_demographic.head(5)

In [None]:
#collapse by subcounty, adding up all the weighted values, giving us a weighted average
#numeric_only=True keeps any text column out of the sum; without it the result depends
#on the pandas version (recent versions concatenate the strings of such columns)
subcounty_demographic = total_demographic.groupby('subcounty').sum(numeric_only=True)

#check that no column is more than 100 (except for population)
subcounty_demographic.describe()

In [None]:
#turn the 'subcounty' index created by the groupby back into a regular column
subcounty_demographic = subcounty_demographic.reset_index()

#check
subcounty_demographic.head(5)

In [None]:
#drop the parish population and weight columns as they are now irrelevant
#drop() already returns a new DataFrame, so it is not wrapped in pd.DataFrame again
subcounty_demographic_clean = subcounty_demographic.drop(columns=['weight', 'tot_pop_parish'])

#check current columns
subcounty_demographic_clean.info()

In [None]:
#check column datatypes are in correct format (info() lists the dtype of every column)
subcounty_demographic_clean.info()

In [None]:
#rechecking null values, duplicated rows and duplicated columns (all three should print False)
has_nulls = subcounty_demographic_clean.isna().sum().sum() > 0
has_duplicated_rows = subcounty_demographic_clean.duplicated().sum() > 0
has_duplicated_columns = subcounty_demographic_clean.T.duplicated().sum() > 0
print(has_nulls, has_duplicated_rows, has_duplicated_columns)

In [None]:
#rename the subcounty column to 'clean_adm4' to match other datasets
subcounty_demographic_clean = subcounty_demographic_clean.rename(
    columns={'subcounty': 'clean_adm4'},
)

In [None]:
#export the cleaned dataset to csv
#NOTE(review): absolute, machine-specific path; to_csv's default also writes the row index as the first column
output_path = '/Users/thomasadler/Desktop/Capstone_Docs/subcounty_demographic_clean.csv'
subcounty_demographic_clean.to_csv(output_path)