
# Data Exploration and Cleaning – Maji Ndogo Water Services

This notebook documents my process of exploring and analyzing the `md_water_services` database.  
The goal is to answer questions, identify patterns, and clean inconsistencies in the data.  
The work is divided into five sections, each addressing a specific aspect of the dataset.


In [1]:
%load_ext sql
%sql mysql+pymysql://root:Dsk264501@localhost:3306/md_water_servicesb
%config SqlMagic.style = '_DEPRECATED_DEFAULT'


### Section 1 – Getting to Know the Data

Before beginning analysis, it’s important to understand the structure of the database.  
We start by checking the available tables, then previewing each to get familiar with their contents and how they relate to each other.


In [2]:
%%sql
-- Lists every table in the database the notebook is currently connected to
-- (md_water_servicesb, per the connection string used by this notebook)
SHOW TABLES;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
8 rows affected.


Tables_in_md_water_servicesb
data_dictionary
employee
global_water_access
location
visits
water_quality
water_source
well_pollution


In [3]:
%%sql
-- Exploring the employee table
/* Information about employees, including contact details, location, and position */
-- LIMIT without ORDER BY may return any 10 rows, so the preview is ordered by
-- the employee id (assumed to be unique per employee) to keep it reproducible.
SELECT *
FROM employee
ORDER BY assigned_employee_id
LIMIT 10;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
10 rows affected.


assigned_employee_id,employee_name,phone_number,email,address,province_name,town_name,position
0,Amara Jengo,99637993287,,36 Pwani Mchangani Road,Sokoto,Ilanga,Field Surveyor
1,Bello Azibo,99643864786,,129 Ziwa La Kioo Road,Kilimani,Rural,Field Surveyor
2,Bakari Iniko,99222599041,,18 Mlima Tazama Avenue,Hawassa,Rural,Field Surveyor
3,Malachi Mavuso,99945849900,,100 Mogadishu Road,Akatsi,Lusaka,Field Surveyor
4,Cheche Buhle,99381679640,,1 Savanna Street,Akatsi,Rural,Field Surveyor
5,Zuriel Matembo,99034075111,,26 Bahari Ya Faraja Road,Kilimani,Rural,Field Surveyor
6,Deka Osumare,99379364631,,104 Kenyatta Street,Akatsi,Rural,Field Surveyor
7,Lalitha Kaburi,99681623240,,145 Sungura Amanpour Road,Kilimani,Rural,Field Surveyor
8,Enitan Zuri,99248509202,,117 Kampala Road,Hawassa,Zanzibar,Field Surveyor
10,Farai Nia,99570082739,,33 Angélique Kidjo Avenue,Amanzi,Dahabu,Field Surveyor


In [4]:
%%sql
-- Exploring the location table
/* Location-specific details: address, province, town, and urban/rural classification */
-- LIMIT without ORDER BY may return any 10 rows, so the preview is ordered by
-- location_id (assumed to be the table key) to keep it reproducible.
SELECT *
FROM location
ORDER BY location_id
LIMIT 10;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
10 rows affected.


location_id,address,province_name,town_name,location_type
AkHa00000,2 Addis Ababa Road,Akatsi,Harare,Urban
AkHa00001,10 Addis Ababa Road,Akatsi,Harare,Urban
AkHa00002,9 Addis Ababa Road,Akatsi,Harare,Urban
AkHa00003,139 Addis Ababa Road,Akatsi,Harare,Urban
AkHa00004,17 Addis Ababa Road,Akatsi,Harare,Urban
AkHa00005,125 Addis Ababa Road,Akatsi,Harare,Urban
AkHa00006,98 Addis Ababa Road,Akatsi,Harare,Urban
AkHa00007,21 Addis Ababa Road,Akatsi,Harare,Urban
AkHa00008,11 Addis Ababa Road,Akatsi,Harare,Urban
AkHa00009,6 Addis Ababa Road,Akatsi,Harare,Urban


In [5]:
%%sql
-- Exploring the visits table
/* Visit records: employee ID, source ID, date, visit count, and queue time */
-- LIMIT without ORDER BY may return any 10 rows, so the preview is ordered by
-- record_id (assumed to be the table key) to keep it reproducible.
SELECT *
FROM visits
ORDER BY record_id
LIMIT 10;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
10 rows affected.


record_id,location_id,source_id,time_of_record,visit_count,time_in_queue,assigned_employee_id
0,SoIl32582,SoIl32582224,2021-01-01 09:10:00,1,15,12
1,KiRu28935,KiRu28935224,2021-01-01 09:17:00,1,0,46
2,HaRu19752,HaRu19752224,2021-01-01 09:36:00,1,62,40
3,AkLu01628,AkLu01628224,2021-01-01 09:53:00,1,0,1
4,AkRu03357,AkRu03357224,2021-01-01 10:11:00,1,28,14
5,KiRu29315,KiRu29315224,2021-01-01 10:17:00,1,9,40
6,AkRu05234,AkRu05234224,2021-01-01 10:18:00,1,0,30
7,KiRu28520,KiRu28520224,2021-01-01 10:28:00,1,0,34
8,HaZa21742,HaZa21742224,2021-01-01 10:37:00,1,0,6
9,AmDa12214,AmDa12214224,2021-01-01 10:58:00,1,0,36


In [6]:
%%sql
-- Exploring the water_quality table
/* Subjective water quality scores and related visit counts */
-- LIMIT without ORDER BY may return any 10 rows, so the preview is ordered by
-- record_id (assumed to be the table key) to keep it reproducible.
SELECT *
FROM water_quality
ORDER BY record_id
LIMIT 10;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
10 rows affected.


record_id,subjective_quality_score,visit_count
0,0,1
1,1,1
2,5,1
3,10,1
4,4,1
5,0,1
6,9,1
7,10,1
8,2,1
9,10,1


In [7]:
%%sql
-- Exploring the water_source table
/* Details about water sources: type, location, and population served */
-- LIMIT without ORDER BY may return any 10 rows, so the preview is ordered by
-- source_id (assumed to be the table key) to keep it reproducible.
SELECT *
FROM water_source
ORDER BY source_id
LIMIT 10;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
10 rows affected.


source_id,type_of_water_source,number_of_people_served
AkHa00000224,tap_in_home,956
AkHa00001224,tap_in_home_broken,930
AkHa00002224,tap_in_home_broken,486
AkHa00003224,well,364
AkHa00004224,tap_in_home_broken,942
AkHa00005224,tap_in_home,736
AkHa00006224,tap_in_home,882
AkHa00007224,tap_in_home,554
AkHa00008224,well,398
AkHa00009224,well,346


In [8]:
%%sql
-- Exploring the well_pollution table
/* Pollution test results, including biological contamination */
-- LIMIT without ORDER BY may return any 10 rows. No single unique column is
-- visible in this table, so the preview is ordered by test date with
-- source_id as a tie-breaker to keep it reproducible.
SELECT *
FROM well_pollution
ORDER BY `date`, source_id
LIMIT 10;


 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
10 rows affected.


source_id,date,description,pollutant_ppm,biological,results
KiRu28935224,2021-01-04 09:17:00,Bacteria: Giardia Lamblia,0.0,495.898,Contaminated: Biological
AkLu01628224,2021-01-04 09:53:00,Bacteria: E. coli,0.0,6.09608,Contaminated: Biological
HaZa21742224,2021-01-04 10:37:00,"Inorganic contaminants: Zinc, Zinc, Lead, Cadmium",2.715,0.0,Contaminated: Chemical
HaRu19725224,2021-01-04 11:04:00,Clean,0.0288593,9.56996e-05,Clean
SoRu35703224,2021-01-04 11:29:00,Bacteria: E. coli,0.0,22.5009,Contaminated: Biological
AkHa00070224,2021-01-04 11:42:00,Inorganic contaminants: Cadmium,5.46739,0.0,Contaminated: Chemical
HaSe21346224,2021-01-04 11:52:00,Clean,0.0140376,8.98989e-05,Clean
HaYa21468224,2021-01-04 12:03:00,"Inorganic contaminants: Chromium, Barium, Chromium, Lead",6.05137,0.0,Contaminated: Chemical
SoRu36278224,2021-01-04 12:24:00,Parasite: Cryptosporidium,0.0,485.162,Contaminated: Biological
AkLu02155224,2021-01-04 12:29:00,"Inorganic contaminants: Selenium, Arsenic",7.64106,0.0,Contaminated: Chemical



### Section 2 – Diving into Water Sources

In this section, we focus on understanding the different types of water sources in the database.  
This helps identify the variety of water access points in the region.


In [9]:
%%sql
-- One row per distinct kind of water source recorded in water_source
SELECT DISTINCT
    ws.type_of_water_source
FROM water_source AS ws;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
5 rows affected.


type_of_water_source
tap_in_home
tap_in_home_broken
well
shared_tap
river



### Section 3 – Unpacking the Visits to Water Sources

Here we explore the queue times recorded during visits, identifying cases where people wait unusually long (more than 500 minutes) to fetch water.


In [10]:
%%sql
/* Visits where the estimated queue time is over 500 minutes (~8 hours) */
-- LIMIT without ORDER BY may return any 10 matching rows, so the sample is
-- ordered by record_id to keep it reproducible.
SELECT *
FROM visits
WHERE time_in_queue > 500
ORDER BY record_id
LIMIT 10;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
10 rows affected.


record_id,location_id,source_id,time_of_record,visit_count,time_in_queue,assigned_employee_id
899,SoRu35083,SoRu35083224,2021-01-16 10:14:00,6,515,28
2304,SoKo33124,SoKo33124224,2021-02-06 07:53:00,5,512,16
2315,KiRu26095,KiRu26095224,2021-02-06 14:32:00,3,529,8
3206,SoRu38776,SoRu38776224,2021-02-20 15:03:00,5,509,46
3701,HaRu19601,HaRu19601224,2021-02-27 12:53:00,3,504,0
4154,SoRu38869,SoRu38869224,2021-03-06 10:44:00,2,533,24
5483,AmRu14089,AmRu14089224,2021-03-27 18:15:00,4,509,12
9177,SoRu37635,SoRu37635224,2021-05-22 18:48:00,2,515,1
9648,SoRu36096,SoRu36096224,2021-05-29 11:24:00,2,533,3
11631,AkKi00881,AkKi00881224,2021-06-26 06:15:00,6,502,32


In [15]:
%%sql
/* Water source details for visits whose queue time exceeded 500 minutes (roughly 8 hours) */
-- Every column is qualified with its table alias so the query stays
-- unambiguous if either table later gains a same-named column.
-- Ordering by the visit record id makes the 20-row sample reproducible.
SELECT
    ws.source_id,
    ws.type_of_water_source,
    ws.number_of_people_served,
    vs.time_in_queue
FROM water_source AS ws
INNER JOIN visits AS vs
    ON ws.source_id = vs.source_id
WHERE vs.time_in_queue > 500
ORDER BY vs.record_id
LIMIT 20;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
20 rows affected.


source_id,type_of_water_source,number_of_people_served,time_in_queue
SoRu35083224,shared_tap,3058,515
SoKo33124224,shared_tap,3052,512
KiRu26095224,shared_tap,3734,529
SoRu38776224,shared_tap,3180,509
HaRu19601224,shared_tap,3322,504
SoRu38869224,shared_tap,3984,533
AmRu14089224,shared_tap,3458,509
SoRu37635224,shared_tap,3920,515
SoRu36096224,shared_tap,3786,533
AkKi00881224,shared_tap,3398,502


In [12]:
%%sql
/* Source type and population served for a hand-picked set of source ids */
SELECT
    ws.source_id,
    ws.type_of_water_source,
    ws.number_of_people_served
FROM water_source AS ws
WHERE ws.source_id IN (
    'AkKi00881224',
    'AkLu01628224',
    'AkRu05234224',
    'HaRu19601224',
    'HaZa21742224',
    'SoRu36096224',
    'SoRu37635224',
    'SoRu38776224'
);

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
8 rows affected.


source_id,type_of_water_source,number_of_people_served
AkKi00881224,shared_tap,3398
AkLu01628224,well,210
AkRu05234224,tap_in_home_broken,496
HaRu19601224,shared_tap,3322
HaZa21742224,well,308
SoRu36096224,shared_tap,3786
SoRu37635224,shared_tap,3920
SoRu38776224,shared_tap,3180



### Section 4 – Assessing the Quality of Water Sources

In this section, we identify high-quality water sources: records with the maximum subjective quality score (10) that were taken on a second or later visit (visit count of at least 2).


In [16]:
%%sql
-- Quality records with the top subjective score (10) taken on a second or
-- later visit (visit_count of 2 or more).
-- LIMIT without ORDER BY may return any 20 matching rows, so the sample is
-- ordered by record_id to keep it reproducible.
SELECT *
FROM water_quality
WHERE visit_count >= 2 AND subjective_quality_score = 10
ORDER BY record_id
LIMIT 20;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
20 rows affected.


record_id,subjective_quality_score,visit_count
59,10,2
67,10,3
85,10,4
128,10,5
137,10,2
232,10,3
263,10,6
269,10,2
271,10,3
317,10,4



### Section 5 – Investigating Pollution Sources and Cleaning Data

Finally, we investigate pollution levels, detect inconsistencies in descriptions,  
and correct errors in the dataset to ensure data quality for further analysis.


In [17]:
%%sql
-- Wells recorded as Clean even though the biological reading is above 0.01.
-- The string literal uses single quotes, matching the other cells, because
-- double-quoted text is read as an identifier when ANSI_QUOTES is enabled.
SELECT *
FROM well_pollution
WHERE results = 'Clean' AND biological > 0.01
LIMIT 10;


 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
10 rows affected.


source_id,date,description,pollutant_ppm,biological,results
AkRu08936224,2021-01-08 09:22:00,Bacteria: E. coli,0.0406458,35.0068,Clean
AkRu06489224,2021-01-10 09:44:00,Clean Bacteria: Giardia Lamblia,0.0897904,38.467,Clean
SoRu38011224,2021-01-14 15:35:00,Bacteria: E. coli,0.0425095,19.2897,Clean
AkKi00955224,2021-01-22 12:47:00,Bacteria: E. coli,0.0812092,40.2273,Clean
KiHa22929224,2021-02-06 13:54:00,Bacteria: E. coli,0.0722537,18.4482,Clean
KiRu25473224,2021-02-07 15:51:00,Clean Bacteria: Giardia Lamblia,0.0630094,24.4536,Clean
HaRu17401224,2021-03-01 13:44:00,Clean Bacteria: Giardia Lamblia,0.0649209,25.8129,Clean
AkRu07137224,2021-03-04 13:41:00,Clean Bacteria: Giardia Lamblia,0.0656843,18.2978,Clean
KiRu27205224,2021-03-13 14:17:00,Clean Bacteria: Giardia Lamblia,0.0418018,49.4281,Clean
AkLu02307224,2021-03-13 15:41:00,Bacteria: E. coli,0.0709682,35.203,Clean


In [18]:
%%sql
-- Checking inconsistent descriptions
-- In LIKE, the underscore matches exactly one character and the percent sign
-- matches any run of characters, so this pattern finds descriptions that
-- start with Clean and have at least one more character after it.
SELECT description
FROM well_pollution
WHERE description LIKE 'Clean_%';

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
38 rows affected.


description
Clean Bacteria: Giardia Lamblia
Clean Bacteria: Giardia Lamblia
Clean Bacteria: Giardia Lamblia
Clean Bacteria: Giardia Lamblia
Clean Bacteria: Giardia Lamblia
Clean Bacteria: Giardia Lamblia
Clean Bacteria: Giardia Lamblia
Clean Bacteria: Giardia Lamblia
Clean Bacteria: Giardia Lamblia
Clean Bacteria: Giardia Lamblia


In [21]:
%%sql
/* Remove the stray Clean prefix from E. coli descriptions */
UPDATE well_pollution
SET description = 'Bacteria: E. coli'
WHERE description = 'Clean Bacteria: E. coli';

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
0 rows affected.


[]

In [22]:
%%sql
/* Remove the stray Clean prefix from Giardia Lamblia descriptions */
UPDATE well_pollution
SET description = 'Bacteria: Giardia Lamblia'
WHERE description = 'Clean Bacteria: Giardia Lamblia';

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
0 rows affected.


[]

In [23]:
%%sql
/* Relabel rows marked Clean whose biological reading is above 0.01 */
UPDATE well_pollution
SET results = 'Contaminated: Biological'
WHERE results = 'Clean'
  AND biological > 0.01;

 * mysql+pymysql://root:***@localhost:3306/md_water_servicesb
64 rows affected.


[]