# Exploratory Data Analysis

In [None]:
#importing relevant packages
%run /Users/thomasadler/Desktop/futuristic-platipus/notebooks/ta_01_packages.py

In [None]:
# import useful functions
%run /Users/thomasadler/Desktop/futuristic-platipus/notebooks/ta_02_functions.py

In [None]:
#defining working directory
# The trailing slash matters: file names are appended to this string by plain
# concatenation when data is loaded (e.g. filepath + 'master_df.csv').
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs on the author's machine; consider a relative or configurable directory.
filepath = '/Users/thomasadler/Desktop/capstone_docs/'

In [None]:
#data dictionary part A
# Being the last expression of the cell, the image object is shown as the cell output.
# NOTE(review): `Image` is presumably IPython.display.Image, brought into scope by the
# packages script run at the top of the notebook — confirm. The path is absolute
# and machine-specific.
Image("/Users/thomasadler/Desktop/futuristic-platipus/data_dictionary/4A-Master-Dictionary.png")

In [None]:
#data dictionary part B
# Being the last expression of the cell, the image object is shown as the cell output.
# NOTE(review): `Image` is presumably IPython.display.Image, brought into scope by the
# packages script run at the top of the notebook — confirm. The path is absolute
# and machine-specific.
Image("/Users/thomasadler/Desktop/futuristic-platipus/data_dictionary/4B-Master-Dictionary.png")

We now have a clean dataset: no duplicate rows or columns, no missing values, and every column of interest in a format fit for analysis. Our outcome (dependent) variable is whether a water point is functioning or not. `is_functioning` is a binary column equal to 1 if the water point was functioning at the time it was checked, and 0 if not.

In [None]:
#read the water point reports from the working directory
master_df_raw = pd.read_csv(f"{filepath}master_df.csv")

#work on a separate (deep) copy so the raw frame stays untouched
master_df = master_df_raw.copy(deep=True)

#check columns, dtypes and non-null counts
master_df.info()

In [None]:
#summary statistics: one row per described column, rounded to whole numbers
master_df.describe().T.round()

We will go through every column to understand the information it contains and how we can use it in our models.

# 1. wpdx_id

In [None]:
#unique water points vs. total number of reports
#value_counts gives the number of reports (rows) for every wpdx_id
reports_per_point=master_df['wpdx_id'].value_counts(dropna=False)
unique_water=len(reports_per_point)
total_observations=len(master_df['wpdx_id'])

#a water point has "more than one report" when its id appears on several rows;
#total_observations-unique_water would count the extra reports instead, which
#overstates the figure as soon as one water point has three or more reports
repeat_water=int((reports_per_point>1).sum())

print(f"There are {unique_water} unique water points in the dataset.")
print(f"There are {total_observations} reports in the dataset.")
print(f"There are {repeat_water} water points with more than one report.")

In [None]:
#number of reports per water point: count() tallies the non-null clean_adm1
#entries within each wpdx_id group
reports_water_pt=(
    master_df
    .loc[:, ['wpdx_id', 'clean_adm1']]
    .groupby('wpdx_id')
    .count()
)

#histogram of how often each water point was reported
ax=sns.histplot(reports_water_pt, legend=False)
ax.set_title("Majority of water points have only been checked once")
ax.set_xlabel("Number of reports")
plt.show()

# 2-3. lat_deg & lon_deg

In [None]:
#one row per water point: every numeric column averaged over that point's reports
#numeric_only=True is required on pandas>=2.0, where averaging a non-numeric
#column raises a TypeError instead of silently dropping it
#NOTE(review): presumably the clean_adm* columns are text — confirm
unique_water_points=master_df.groupby('wpdx_id').mean(numeric_only=True)

In [None]:
# NOTE(review): this cell is disabled (every line is commented out). Before re-enabling:
#   - `water_points` is not defined anywhere in the visible notebook; the per-point
#     frame built above is called `unique_water_points`
#   - the list of alternative `size` columns below is missing its closing quote
#   - `px` is presumably plotly.express from the packages script — confirm
# #visualise water points, choose what variable represents the size of the points
# fig = px.scatter_geo(
#     water_points,
#     lon='lon_deg', lat='lat_deg', 
#     size='served_population', #'crucialness', 'pressure', 'total_fatalities_adm4', 'total_events_adm4 
#     height=600,
#     width=800,
# )

# fig.show()

# 4. is_functioning

In [None]:
#share of reports (in %) for each value of is_functioning
master_df['is_functioning'].value_counts(normalize=True).mul(100)

In [None]:
#list of regional level
regions=['clean_adm1', 'clean_adm2', 'clean_adm3', 'clean_adm4']

#set the font scale before anything is drawn so that all panels use it
sns.set(font_scale = 2)

#one panel per regional level; every panel is drawn on the axes created here
#(mixing plt.subplots(2,2) with plt.subplot(3,2,i) put panels on a different grid)
fig, axes=plt.subplots(2,2, figsize=(30,20))

for ax, adm in zip(axes.flat, regions):
    #share (%) of reports marked as functioning within each region of this level
    adm_functioning=master_df[[adm,'is_functioning']].groupby(adm).mean()*100
    sns.histplot(adm_functioning['is_functioning'], ax=ax)
    ax.set_xlabel(f"Proportion of water points functioning in {adm}", size=25)
    ax.axvline(adm_functioning['is_functioning'].median(), c='gold', label='median')
    #the label on the median line is only visible once a legend is drawn
    ax.legend()
plt.tight_layout()
plt.show()
    


# 5-8. clean_adm

In [None]:
#number of distinct regions at each administrative level
#the loop variable is not called `regions`, so it no longer overwrites the
#`regions` list used by the plotting cells
for adm_level in ['clean_adm1', 'clean_adm2', 'clean_adm3', 'clean_adm4']:
    #dropna=False keeps a missing value, if any, counted as its own category
    n_regions=master_df[adm_level].nunique(dropna=False)
    print(f"There are {n_regions} {adm_level} regions in our Uganda dataset")

In [None]:
#number of water point reports by region (rows counted per clean_adm1 value)
adm1_reports=master_df[['clean_adm1', 'wpdx_id']].groupby('clean_adm1').count()

#visualise: one horizontal bar per adm1 region, vertical line at the mean
ax=sns.barplot(data=adm1_reports, y=adm1_reports.index, x=adm1_reports['wpdx_id'], palette="flare")
ax.axvline(adm1_reports['wpdx_id'].mean(), c='royalblue', label='mean')
#the bars show counts of reports, not ids, so replace the default 'wpdx_id' label
ax.set_xlabel('Number of reports')
ax.set_title('Reports by adm1 region')
#without a legend the 'mean' label on the line is never displayed
ax.legend()
plt.show()


In [None]:
#list of regional level
regions=['clean_adm2', 'clean_adm3', 'clean_adm4']

#set the font scale before anything is drawn so that all panels use it
sns.set(font_scale = 2)

#one row per regional level; the grid is sized from the list so figure and
#panels always agree (a 2x2 figure was created but a 3x1 grid was drawn on)
fig, axes=plt.subplots(len(regions),1, figsize=(30,20))

for ax, adm in zip(axes, regions):
    #number of reports (rows) per region of this level
    adm_reports=master_df[[adm, 'wpdx_id']].groupby(adm).count()
    sns.histplot(adm_reports['wpdx_id'], ax=ax)
    ax.set_xlabel(f"Number of reports by {adm}")
    ax.axvline(adm_reports['wpdx_id'].median(), c='gold', label='median')
    ax.axvline(adm_reports['wpdx_id'].mean(), c='r', label='mean')
    #show which line is the median and which is the mean
    ax.legend()
plt.tight_layout()
plt.show()

# 9-13. distance_to...

In [None]:
#visualise distances for water points
distances=['distance_to_primary', 'distance_to_secondary', 'distance_to_tertiary', 'distance_to_city']

#set the font scale before anything is drawn so that all panels use it
sns.set(font_scale = 2)

#creating subplot: one panel per distance variable
fig, axes=plt.subplots(2,2, figsize=(30,20))

for ax, distance in zip(axes.flat, distances):
    #draw the distance histogram on its own panel; previously it was drawn before
    #the panel was selected (so it landed on the wrong axes) and the selected
    #panel was then filled with the unrelated adm_reports histogram
    sns.histplot(unique_water_points[distance], ax=ax)
    ax.set_xlabel(f"{distance} for a water point")
    ax.axvline(unique_water_points[distance].median(), c='gold', label='median')
    ax.axvline(unique_water_points[distance].mean(), c='r', label='mean')
    #show which line is the median and which is the mean
    ax.legend()
plt.tight_layout()
plt.show()

# 14. usage_cap

In [None]:
#visualise the distribution of usage_cap across water points
#sns.distplot is deprecated; histplot with a KDE overlay on a density scale
#approximates the same figure
sns.histplot(unique_water_points['usage_cap'], kde=True, stat="density")
plt.show()

#usage capacity: the five most frequent values, as a percentage of water points
(unique_water_points['usage_cap'].value_counts(normalize=True)*100).head()

# 15. staleness_score

In [None]:
#visualise the distribution of staleness_score across water points
#sns.distplot is deprecated; histplot with a KDE overlay on a density scale
#approximates the same figure
sns.histplot(unique_water_points['staleness_score'], kde=True, stat="density")
#only the left limit is fixed, the right limit stays automatic
plt.xlim(left=10)
plt.axvline(unique_water_points['staleness_score'].mean(), c='r', label='mean')
#without a legend the 'mean' label on the line is never displayed
plt.legend()
plt.show()

# 24-45. Demographics and Regional statistics

In [None]:
#average of every numeric variable within each adm1 region
#numeric_only=True is required on pandas>=2.0, where averaging a non-numeric
#column raises a TypeError instead of silently dropping it
#NOTE(review): presumably wpdx_id and the other clean_adm* columns are text — confirm
adm1_df=master_df.groupby("clean_adm1").mean(numeric_only=True)

In [None]:
#all variables by adm1 regional level
#reset the font scale before the loop: doing it after each plot left the first
#chart drawn at whatever scale earlier cells had set
sns.set(font_scale = 1)

for variable in adm1_df.columns:
    #one horizontal bar chart per variable, one bar per adm1 region
    ax=sns.barplot(data=adm1_df, y=adm1_df.index, x=adm1_df[variable], palette="Blues_d")
    ax.set_xlabel(f"average {variable}")
    #reference line: mean of the regional averages for this variable
    ax.axvline(adm1_df[variable].mean(), c='r', label='mean')
    #without a legend the 'mean' label on the line is never displayed
    ax.legend()
    plt.show()



**TODO**

- Extra: `is_functioning` in relation to the other variables
- Distribution of every variable
- Relationships between variables
- All feature selection steps