# Master Dataframe

We will be combining our water, conflict and demographic data into one master dataframe to be readily used for analysis and modelling.

In [4]:
#importing relevant packages
%run /Users/thomasadler/Desktop/futuristic-platipus/notebooks/ta_01_packages.py

In [5]:
# import useful functions
%run /Users/thomasadler/Desktop/futuristic-platipus/notebooks/ta_02_functions.py

In [6]:
#defining working directory
filepath = '/Users/thomasadler/Desktop/capstone_docs/'

## Loading datasets

In [8]:
#water points
water_df=pd.read_csv(filepath + 'uganda_water_df_clean.csv')

#check
water_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108257 entries, 0 to 108256
Data columns (total 36 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 108257 non-null  int64  
 1   row_id                     108257 non-null  int64  
 2   source                     108257 non-null  object 
 3   lat_deg                    108257 non-null  float64
 4   lon_deg                    108257 non-null  float64
 5   report_date                108257 non-null  object 
 6   status_id                  108257 non-null  int64  
 7   facility_type              108257 non-null  object 
 8   clean_country_name         108257 non-null  object 
 9   clean_adm1                 108257 non-null  object 
 10  clean_adm2                 108257 non-null  object 
 11  clean_adm3                 108257 non-null  object 
 12  clean_adm4                 108257 non-null  object 
 13  distance_to_primary        10

In [10]:
#conflict events
conflict_df=pd.read_csv(filepath +'uganda_conflict_df_clean.csv')

#check
conflict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7793 entries, 0 to 7792
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      7793 non-null   int64  
 1   data_id         7793 non-null   int64  
 2   event_date      7793 non-null   object 
 3   event_type      7793 non-null   object 
 4   sub_event_type  7793 non-null   object 
 5   actor1          7793 non-null   object 
 6   assoc_actor_1   1920 non-null   object 
 7   inter1          7793 non-null   int64  
 8   actor2          6551 non-null   object 
 9   assoc_actor_2   1647 non-null   object 
 10  inter2          7793 non-null   int64  
 11  interaction     7793 non-null   int64  
 12  clean_adm1      7793 non-null   object 
 13  clean_adm2      7793 non-null   object 
 14  clean_adm3      7793 non-null   object 
 15  clean_adm4      7793 non-null   object 
 16  latitude        7793 non-null   float64
 17  longitude       7793 non-null   f

In [9]:
#demographic variables
demographic_df=pd.read_csv(filepath +'subcounty_demographic_clean.csv')

#check
demographic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1382 entries, 0 to 1381
Data columns (total 29 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   1382 non-null   int64  
 1   clean_adm4                   1382 non-null   object 
 2   perc_hh_head_male            1382 non-null   float64
 3   perc_pop612_primary          1382 non-null   float64
 4   perc_pop1318_secondary       1382 non-null   float64
 5   perc_pop18p_illiterate       1382 non-null   float64
 6   perc_pop017_certificate      1382 non-null   float64
 7   perc_pop017_both_parents     1382 non-null   float64
 8   perc_pop2p_disability        1382 non-null   float64
 9   perc_pop1017_married         1382 non-null   float64
 10  perc_pop1217_birth           1382 non-null   float64
 11  perc_pop1464_working         1382 non-null   float64
 12  perc_pop10p_mobile_phone     1382 non-null   float64
 13  perc_hh_temp_dwell

## Merge datasets

We want to get demographic information for each water point at its most local level.

In [17]:
#merge water dataset with demographic dataset
master_df=pd.merge(water_df, demographic_df, how='left',\
                               left_on=water_df['clean_adm4'],\
                               right_on=demographic_df['clean_adm4'])

#check
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108257 entries, 0 to 108256
Data columns (total 66 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   key_0                        108257 non-null  object 
 1   Unnamed: 0_x                 108257 non-null  int64  
 2   row_id                       108257 non-null  int64  
 3   source                       108257 non-null  object 
 4   lat_deg                      108257 non-null  float64
 5   lon_deg                      108257 non-null  float64
 6   report_date                  108257 non-null  object 
 7   status_id                    108257 non-null  int64  
 8   facility_type                108257 non-null  object 
 9   clean_country_name           108257 non-null  object 
 10  clean_adm1                   108257 non-null  object 
 11  clean_adm2                   108257 non-null  object 
 12  clean_adm3                   108257 non-null  object 
 13 

In [28]:
# convert to datetime
date_converter(master_df, 'report_date')

#create new columns for month and year
master_df['report_year']=master_df['report_date'].dt.year
master_df['report_month']=master_df['report_date'].dt.month
master_df['report_year_month']= master_df['report_year'].map(str) + '-' + master_df['report_month'].map(str)

# check
master_df.head()

Unnamed: 0,key_0,Unnamed: 0_x,row_id,source,lat_deg,lon_deg,report_date,status_id,facility_type,clean_country_name,...,perc_hh_own_house,perc_hh_own_tv,perc_hh_bank_acc,perc_hh_subs_farm,perc_hh_less2meals,perc_hh_electricity,tot_pop_subcounty,report_year,report_month,report_year_month
0,Kabambiro,0,651231,Water For People,0.158537,30.490643,2005-09-07,1,Improved,Uganda,...,87.942373,1.249238,10.259377,91.358551,4.212975,4.461754,15484.0,2005,9,2005-9
1,Nyabbani,1,652323,Water For People,0.070597,30.415651,2005-09-07,1,Improved,Uganda,...,88.482012,2.614604,14.573029,92.319897,4.458575,7.565426,21953.0,2005,9,2005-9
2,Kabambiro,2,654375,Water For People,0.158667,30.490551,2005-09-07,1,Improved,Uganda,...,87.942373,1.249238,10.259377,91.358551,4.212975,4.461754,15484.0,2005,9,2005-9
3,Kamwenge,3,654822,Water For People,0.208128,30.386393,2005-09-07,1,Improved,Uganda,...,87.243217,2.055043,12.522317,91.41194,3.893017,5.780841,22010.0,2005,9,2005-9
4,Kicheche,4,660256,Water For People,-0.13468,30.351593,2005-09-07,1,Improved,Uganda,...,89.721699,4.054211,20.201299,87.522511,4.27149,9.606041,26338.0,2005,9,2005-9


In [29]:
#for each adm4/year/month, get number of fatalities/events in last year, last 5/10/20

In [None]:
#merge master and conflict

## Clean data

In [None]:
#duplicates 2x, null, types, column names, drop columns

## EDA

In [None]:
#outcome is water functioning

In [None]:
#data dictionary