Data Science with the MalariaAfricaDataset

In [1]:
# Importing all the Necessary Libraries that are to be used in the Data Cleaning and Processing

import numpy as np  # numpy for numerical computing in python
import pandas as pd  # pandas for data manipulation and analysis
import matplotlib.pyplot as plt  # matplotlib for popular plotting(creatinng static,iiteractive and animated visualisations
import seaborn as sns  # seaborn provides the interface for creating attractive statistical graphics
import sklearn
import scipy
import statsmodels.api as sm
import datetime as dt
import plotly.express as px

In [2]:
# Control display of the data
pd. options.display.max_columns=50
sns.set(style="darkgrid", rc={"axes.facecolor":"CAF1DE"})

In [3]:
# Displaying the dataset
df=pd.read_csv('MalariaAfricaDataset.csv')
print(df)

     Country Name  Year Country Code  \
0         Algeria  2007          DZA   
1          Angola  2007          AGO   
2           Benin  2007          BEN   
3        Botswana  2007          BWA   
4    Burkina Faso  2007          BFA   
..            ...   ...          ...   
589          Togo  2017          TGO   
590       Tunisia  2017          TUN   
591        Uganda  2017          UGA   
592        Zambia  2017          ZMB   
593      Zimbabwe  2017          ZWE   

     Incidence of malaria (per 1,000 population at risk)  \
0                                                 0.01     
1                                               286.72     
2                                               480.24     
3                                                 1.03     
4                                               503.80     
..                                                 ...     
589                                             278.20     
590                                    

In [5]:
 # Understanding the data structure by checking the shape of the data
df.shape

(594, 27)

In [6]:
# Chhecking for the Column names of the dataset
df.columns

Index(['Country Name', 'Year', 'Country Code',
       'Incidence of malaria (per 1,000 population at risk)',
       'Malaria cases reported',
       'Use of insecticide-treated bed nets (% of under-5 population)',
       'Children with fever receiving antimalarial drugs (% of children under age 5 with fever)',
       'Intermittent preventive treatment (IPT) of malaria in pregnancy (% of pregnant women)',
       'People using safely managed drinking water services (% of population)',
       'People using safely managed drinking water services, rural (% of rural population)',
       'People using safely managed drinking water services, urban (% of urban population)',
       'People using safely managed sanitation services (% of population)',
       'People using safely managed sanitation services, rural (% of rural population)',
       'People using safely managed sanitation services, urban  (% of urban population)',
       'Rural population (% of total population)',
       'Rural popula

In [8]:
# Removing the whitespaces in the column names if any
df.columns = df.columns.to_series().apply(lambda x: x.strip())

In [9]:
# To check if the whitespaces have been removed
df.columns

Index(['Country Name', 'Year', 'Country Code',
       'Incidence of malaria (per 1,000 population at risk)',
       'Malaria cases reported',
       'Use of insecticide-treated bed nets (% of under-5 population)',
       'Children with fever receiving antimalarial drugs (% of children under age 5 with fever)',
       'Intermittent preventive treatment (IPT) of malaria in pregnancy (% of pregnant women)',
       'People using safely managed drinking water services (% of population)',
       'People using safely managed drinking water services, rural (% of rural population)',
       'People using safely managed drinking water services, urban (% of urban population)',
       'People using safely managed sanitation services (% of population)',
       'People using safely managed sanitation services, rural (% of rural population)',
       'People using safely managed sanitation services, urban  (% of urban population)',
       'Rural population (% of total population)',
       'Rural popula

In [10]:
# Checking the basic infomation of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594 entries, 0 to 593
Data columns (total 27 columns):
 #   Column                                                                                   Non-Null Count  Dtype  
---  ------                                                                                   --------------  -----  
 0   Country Name                                                                             594 non-null    object 
 1   Year                                                                                     594 non-null    int64  
 2   Country Code                                                                             594 non-null    object 
 3   Incidence of malaria (per 1,000 population at risk)                                      550 non-null    float64
 4   Malaria cases reported                                                                   550 non-null    float64
 5   Use of insecticide-treated bed nets (% of under-5 population)   

In [11]:
# Starting the data cleaning by checking and removing of duplicates if any
for isduplicate in df.duplicated().array:
    
    if isduplicate:
        print("Duplicates found and dropped")
        df.drop_duplicates(Inplace = True)
        break
    else:
        pass

print("No Duplicates Found")     
df[df.duplicated()==True]
        

No Duplicates Found


Unnamed: 0,Country Name,Year,Country Code,"Incidence of malaria (per 1,000 population at risk)",Malaria cases reported,Use of insecticide-treated bed nets (% of under-5 population),Children with fever receiving antimalarial drugs (% of children under age 5 with fever),Intermittent preventive treatment (IPT) of malaria in pregnancy (% of pregnant women),People using safely managed drinking water services (% of population),"People using safely managed drinking water services, rural (% of rural population)","People using safely managed drinking water services, urban (% of urban population)",People using safely managed sanitation services (% of population),"People using safely managed sanitation services, rural (% of rural population)","People using safely managed sanitation services, urban (% of urban population)",Rural population (% of total population),Rural population growth (annual %),Urban population (% of total population),Urban population growth (annual %),People using at least basic drinking water services (% of population),"People using at least basic drinking water services, rural (% of rural population)","People using at least basic drinking water services, urban (% of urban population)",People using at least basic sanitation services (% of population),"People using at least basic sanitation services, rural (% of rural population)","People using at least basic sanitation services, urban (% of urban population)",latitude,longitude,geometry


In [12]:
# Checking the Null or NaN values
df.isnull().sum().sort_values(ascending=False)

People using safely managed drinking water services, rural (% of rural population)         506
People using safely managed drinking water services (% of population)                      495
Intermittent preventive treatment (IPT) of malaria in pregnancy (% of pregnant women)      488
People using safely managed sanitation services, rural (% of rural population)             484
Children with fever receiving antimalarial drugs (% of children under age 5 with fever)    472
People using safely managed sanitation services, urban  (% of urban population)            462
People using safely managed sanitation services (% of population)                          462
Use of insecticide-treated bed nets (% of under-5 population)                              462
People using safely managed drinking water services, urban (% of urban population)         418
Malaria cases reported                                                                      44
Incidence of malaria (per 1,000 population at risk

In [13]:
# this is Optional (checking for null values in a percentage form)
for i in df.columns:
    if df[i].isnull().any():
        print(i, "......", df[i].isnull().sum()*100/df.shape[0],"%")

Incidence of malaria (per 1,000 population at risk) ...... 7.407407407407407 %
Malaria cases reported ...... 7.407407407407407 %
Use of insecticide-treated bed nets (% of under-5 population) ...... 77.77777777777777 %
Children with fever receiving antimalarial drugs (% of children under age 5 with fever) ...... 79.46127946127946 %
Intermittent preventive treatment (IPT) of malaria in pregnancy (% of pregnant women) ...... 82.15488215488216 %
People using safely managed drinking water services (% of population) ...... 83.33333333333333 %
People using safely managed drinking water services, rural (% of rural population) ...... 85.18518518518519 %
People using safely managed drinking water services, urban (% of urban population) ...... 70.37037037037037 %
People using safely managed sanitation services (% of population) ...... 77.77777777777777 %
People using safely managed sanitation services, rural (% of rural population) ...... 81.48148148148148 %
People using safely managed sanitation

In [19]:
# ploting a graph for the missing data
import missingno as msno
msno.bar(df, color = "gray", fontsize=26);

ModuleNotFoundError: No module named 'missingno'

In [18]:
# checking of rows that have null columns
withnullrows = df[df.isna().any(axis=1)]
beforeComputation = len(withnullrows.axes[0])
print("Number of rows with null values: "+str(beforeComputation))

Number of rows with null values: 585


In [None]:
# number of Malaria Cases
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color:pink' if cell else '' for cell in is_max] 
            
def highlight_min(s):
    is_min = s == s.min()
    return ['background-color:green' if cell else '' for cell in is_max] 

number_of_cases = 