# Part 1: Data Management & Programming
### This project presents visualisations of malaria data, using the datasets from the GitHub repo: https://github.com/rfordatascience/tidytuesday/tree/master/data/2018/2018-11-13.

In [179]:
# Import libraries
import random
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [180]:
# Random seed (seeding is currently disabled)
# random.seed(2024)

# Folder holding the CSV files; it is used as a plain string prefix, so it
# has to end with a path separator.
path = "..."

# Load the three malaria tables
malaria_deaths, malaria_inc = (
    pd.read_csv(path + csv_name)
    for csv_name in ("malaria_deaths.csv", "malaria_inc.csv")
)
# The first CSV column of the age-group table is read as the index
malaria_deaths_age = pd.read_csv(path + "malaria_deaths_age.csv", index_col=0)

### Data Preparation

In [181]:
def plot_box_and_hist(data, col, bins):
    """
    Draw a boxplot and a histogram of one column side by side.

    Used during data preparation to inspect the distribution of a column.

    Parameters:
    - data: DataFrame
    - col: The column to create the plots
    - bins: Number of bins for the histogram

    Returns:
    - None; the figure is rendered with plt.show()
    """
    # Create a figure with side by side plots (boxplot left, histogram right)
    fig, (ax_box, ax_hist) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

    # Create histogram
    sns.histplot(data[col], bins=bins, ax=ax_hist)
    ax_hist.set_title(f'Histogram for Number of {col}')

    # Create boxplot
    sns.boxplot(data[col], ax=ax_box)
    ax_box.set_title(f'Boxplot for Number of {col}')

    # Display plots
    plt.tight_layout()
    plt.show()


##### Malaria Deaths

In [None]:
# Rename the long source column to 'Deaths'
malaria_deaths = malaria_deaths.rename(columns={"Deaths - Malaria - Sex: Both - Age: Age-standardized (Rate) (per 100,000 people)":"Deaths"})

# Remove duplicates
# drop_duplicates() returns a new DataFrame, so the result must be assigned
# back; calling it without assignment leaves the duplicates in place.
malaria_deaths = malaria_deaths.drop_duplicates()

# Summary
## Show information of the dataset
malaria_deaths.info()

## Show the statistics of the dataset
malaria_deaths['Deaths'].describe()

In [None]:
# Inspect the distribution of the 'Deaths' column (boxplot + 20-bin histogram)
plot_box_and_hist(data=malaria_deaths, col='Deaths', bins=20)

##### Malaria Incidence

In [None]:
# Rename the long source column to 'Incidences'
malaria_inc = malaria_inc.rename(columns={"Incidence of malaria (per 1,000 population at risk) (per 1,000 population at risk)":"Incidences"})

# Remove duplicates
# drop_duplicates() returns a new DataFrame, so the result must be assigned
# back; calling it without assignment leaves the duplicates in place.
malaria_inc = malaria_inc.drop_duplicates()

# Summary
## Show information of the dataset
malaria_inc.info()

## Show the statistics of the dataset
malaria_inc['Incidences'].describe()


In [None]:
# Inspect the distribution of the 'Incidences' column (boxplot + 10-bin histogram)
plot_box_and_hist(data=malaria_inc, col='Incidences', bins=10)

In [186]:
# Replace outliers with median values

## Identify outliers
# Calculate the IQR of the 'Incidences' column
Q1 = malaria_inc['Incidences'].quantile(0.25)
Q3 = malaria_inc['Incidences'].quantile(0.75)
IQR = Q3 - Q1

## Calculate upper bound (Q3 + 1.5 * IQR)
upper_bound = Q3 + 1.5 * IQR

## Extract outliers: rows above the upper bound
outliers = malaria_inc[malaria_inc['Incidences'] > upper_bound]

## Get the entities of the outliers
entity = outliers['Entity'].reset_index(drop=True)

## Get median values of these entities
# describe() is computed per entity; its '50%' column is the median. The
# statistics are taken once, before any value is overwritten below.
stats = malaria_inc['Incidences'].groupby(malaria_inc['Entity']).describe().reset_index()
median_values = stats[stats['Entity'].isin(entity)]

# Replace outliers with median values
# NOTE(review): the rows to overwrite are listed by hand rather than taken
# from `outliers` -- confirm the list still matches `outliers` if the data changes.
for outlier_entity, outlier_year in [('Burkina Faso', 2000), ('Ethiopia', 2000), ('Turkey', 2000)]:
    entity_median = median_values.loc[(median_values['Entity'] == outlier_entity), '50%'].values[0]
    target_rows = (malaria_inc['Entity'] == outlier_entity) & (malaria_inc['Year'] == outlier_year)
    malaria_inc.loc[target_rows, 'Incidences'] = entity_median

##### Malaria Deaths by Age groups

In [None]:
# Remove duplicates
# drop_duplicates() returns a new DataFrame, so the result must be assigned
# back; calling it without assignment leaves the duplicates in place.
malaria_deaths_age = malaria_deaths_age.drop_duplicates()

# Summary
## Show information of the dataset
malaria_deaths_age.info()

## Show the statistics of the dataset
malaria_deaths_age['deaths'].describe()

##### Extract data by region for further exploratory data analysis

In [None]:
# Keep the rows with a missing 'code' (the notebook treats these as regions)
has_no_code = malaria_deaths_age['code'].isna()
filtered_malaria_deaths_age_regions = malaria_deaths_age[has_no_code].reset_index(drop=True)

# Drop entities whose name mentions 'income' or 'SDI' (case-insensitive);
# rows with a missing entity name are kept (na=False)
is_income_or_sdi = filtered_malaria_deaths_age_regions['entity'].str.contains('income|SDI', case=False, na=False)
filtered_malaria_deaths_age_regions = filtered_malaria_deaths_age_regions[~is_income_or_sdi]

# Summary statistics of 'deaths' for each remaining entity
region_names = filtered_malaria_deaths_age_regions['entity']
filtered_malaria_deaths_age_regions['deaths'].groupby(region_names).describe()


In [None]:
# Group regions by continents 
# Create a function that assigns the region by continent
def assign_continent(region):
    """
    Map a region name to the continent mentioned in it.

    Parameters:
    - region: a single value from the entity column (a string)

    Returns:
    - 'Africa', 'Asia', 'America' or 'Europe': the first of these, checked in
      that order, that occurs in region; otherwise region itself, unchanged.
    """
    # Order matters: a name that mentions several continents gets the earliest
    # match in this tuple.
    # NOTE(review): confirm that is the intended label for multi-continent names.
    for continent in ('Africa', 'Asia', 'America', 'Europe'):
        if continent in region:
            return continent
    return region

# Label every row with its continent and store it in a new 'continent' column
continent_labels = filtered_malaria_deaths_age_regions['entity'].apply(assign_continent)
filtered_malaria_deaths_age_regions['continent'] = continent_labels

# Continents to keep; rows whose label is anything else are dropped below
selected_cont = ['Africa', 'Asia', 'America', 'Europe', 'Oceania']

# Keep only the rows labelled with one of the selected continents
is_selected = filtered_malaria_deaths_age_regions['continent'].isin(selected_cont)
malaria_deaths_age_continents = filtered_malaria_deaths_age_regions[is_selected].reset_index(drop=True)
malaria_deaths_age_continents


# Part 2: Exploratory Data Analysis
### In this part of the exploratory data analysis, I will only be analyzing the data of the entire world and by continent.

In [None]:
# Display line charts of the worldwide death rate and incidence rate across the years
## Create a figure with side by side plots
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

## Create a line plot to show the worldwide death rate
# `markers=True` only takes effect together with a `style` variable, so it drew
# no markers here; `marker='o'` is passed through to matplotlib and marks each
# data point.
sns.lineplot(x='Year', y='Deaths',
            data=malaria_deaths[malaria_deaths['Entity'] == 'World'], # Get 'World' data only from dataset
            ax=axs[0],
            color='red', # Set line plot to red
            errorbar=None,
            marker='o')

## Set title
axs[0].set_title('World Death rate from Malaria')
## Set y axis label
axs[0].set_ylabel('Deaths (per 100,000 people)')
## Set the y axis limit starting at 0
axs[0].set_ylim(bottom=0)

## Create a line plot to show the worldwide incidence rate
sns.lineplot(x='Year', y='Incidences',
            data=malaria_inc[malaria_inc['Entity'] == 'World'], # Get 'World' data only from dataset
            ax=axs[1],
            color='blue', # Set line plot to blue
            errorbar=None,
            marker='o')

## Set title
axs[1].set_title('Incidence rate from Malaria')
## Set y axis label
axs[1].set_ylabel('Incidences (per 1,000 population at risk)')
## Set the y axis limit starting at 0
axs[1].set_ylim(bottom=0)

## Display plots
plt.tight_layout()
plt.show()

In [None]:
# Display a bar chart of the death rate by continent
# (the variable keeps its original name `line_plot` although the chart is a bar chart)
line_plot = sns.catplot(data=malaria_deaths_age_continents,
            x='continent', y='deaths',
            kind='bar',
            errorbar=None)
## Title and y axis label are set on the single axes of the grid
line_plot.ax.set_title('World Death Rate from Malaria by Continent')
line_plot.ax.set_ylabel('Deaths (per 100,000 people)')
## Display plot
plt.show()

In [None]:
# Display line charts of Africa's death rate across the years, one line per age group
## Keep only the rows labelled 'Africa'
africa_deaths_age = malaria_deaths_age_continents[malaria_deaths_age_continents['continent'] == 'Africa']

## Create the plot: colour and line style both follow the age group, solid lines only
death_age_plot = sns.relplot(data=africa_deaths_age,
            x='year', y='deaths',
            kind='line',
            hue='age_group',
            style='age_group',
            dashes=False,
            errorbar=None)

## Title, y axis label and y axis lower limit (0) are set on the single axes of the grid
africa_ax = death_age_plot.ax
africa_ax.set_title("Africa's Death rate from Malaria by Age Groups")
africa_ax.set_ylabel('Deaths (per 100,000 people)')
africa_ax.set_ylim(bottom=0)
## Display plot
plt.show()