# Conduct EDA on linked dataset and baseline model results

In [3]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
import os

## Load data directory

In [25]:
# linked data
DATA_DIR = Path("../../data/linked_data")

TAGGED_OUTBREAKS = DATA_DIR / "dengue_tagged_outbreaks"

In [26]:
CITY_DENGUE_FILES = os.listdir(TAGGED_OUTBREAKS)

## Load tagged cases

In [34]:
outbreak_dfs = []

for file in CITY_DENGUE_FILES:
    df = pd.read_csv(TAGGED_OUTBREAKS / file)
    df = df.drop(columns=["Unnamed: 0"])
    df["date"] = pd.to_datetime(df["date"])
    df = df[df["date"].dt.year >= 2008]
    outbreak_dfs.append(df)

In [35]:
print(CITY_DENGUE_FILES)

['davao_dengue_w_outbreak_tag.csv', 'zamboanga_dengue_w_outbreak_tag.csv', 'dagupan_dengue_city_merged_w_outbreak_tag.csv', 'legazpi_dengue_w_outbreak_tag.csv', 'mandaue_dengue_w_outbreak_tag.csv', 'palayan_dengue_w_outbreak_tag.csv', 'navotas_dengue_w_outbreak_tag.csv', 'iloilo_dengue_w_outbreak_tag.csv', 'muntinlupa_dengue_w_outbreak_tag.csv', 'mandaluyong_dengue_w_outbreak_tag.csv', 'tacloban_dengue_w_outbreak_tag.csv', 'cdo_dengue_w_outbreak_tag.csv']


In [36]:
outbreak_dfs[1]

Unnamed: 0,adm3_en,date,case_total_dengue,outbreak
262,Zamboanga City,2008-01-07,0.0,0
263,Zamboanga City,2008-01-14,8.0,0
264,Zamboanga City,2008-01-21,11.0,0
265,Zamboanga City,2008-01-28,18.0,0
266,Zamboanga City,2008-02-04,15.0,0
...,...,...,...,...
1039,Zamboanga City,2022-11-28,34.0,0
1040,Zamboanga City,2022-12-05,37.0,0
1041,Zamboanga City,2022-12-12,40.0,0
1042,Zamboanga City,2022-12-19,39.0,0


## 3. When are the outbreaks?

Plot the outbreak periods for certain cities. 

**TO DO:**
- Do this for all the cities now.

In [40]:
def plot_outbreak_cases_bar(
    df,
    city_name,
    disease_name,
    case_col="case_total_dengue",
    color_col="predicted_class",
):
    fig, ax = plt.subplots(figsize=(8, 5))

    # Define colors for each class
    class_colors = {0: "#53bed0", 1: "#ee472f"}

    # Plot the bar plot
    ax.bar(
        df["date"],
        df[case_col],
        alpha=0.5,
        label="Bar Plot",
        width=8,
        color=[class_colors[c] for c in df[color_col]],
    )

    # Rotate x-axis labels
    plt.xticks(rotation=30)

    # Create legend handles and labels for bar plot
    legend_handles = [
        plt.Rectangle((0, 0), 1, 1, color=color) for color in class_colors.values()
    ]
    legend_labels = list(class_colors.keys())

    # Add legend for bar plot
    ax.legend(
        legend_handles, legend_labels, fontsize="large", title="Predicted Outbreak"
    )
    # Add title and axis labels
    ax.set_title(f"{disease_name} Outbreak Periods for {city_name}")
    ax.set_xlabel("date")
    ax.set_ylabel("Number of Cases")
    plt.show()

In [None]:
for city_df in outbreak_dfs:
    try:
        city_name = str(city_df["adm3_en"].iloc[0])
        plot_outbreak_cases_bar(
            city_df,
            city_name=city_name,
            disease_name="Dengue",
            case_col="case_total_dengue",
            color_col="outbreak",
        )
    except:
        plot_outbreak_cases_bar(
            city_df,
            city_name="Muntinlupa",
            disease_name="Dengue",
            case_col="case_total_dengue",
            color_col="outbreak",
        )

## What are the specific start and end dates of each outbreak period?

In [42]:
# Define function to detect outbreak periods
def detect_outbreak_periods(tagged_df, target_class):
    """
    Args:
     tagged_df: dataframe that contains the outbreak tag
     target_class: class to summarize
    """
    # Create a boolean mask for rows with outbreak_tag = 1
    # tagged_df.sort_values(by=["ADM4_PCODE", "start_of_week"], inplace=True)
    tagged_df.sort_values(by=["date"], inplace=True)
    outbreak_mask = tagged_df[target_class] == 1

    # Calculate a group ID for each consecutive outbreak period within the same barangay
    outbreak_groups = (outbreak_mask != outbreak_mask.shift(fill_value=False)).cumsum()
    # Add the 'outbreak_groups' column to the DataFrame
    tagged_df["outbreak_group"] = outbreak_groups
    # Filter rows with outbreak_tag = 1
    outbreak_df = tagged_df[outbreak_mask]
    # Group by 'barangay' and 'outbreak_group' and calculate start date, end date, and length
    outbreak_summary = (
        outbreak_df.groupby(["outbreak_group"])  # adm4_pcode
        .agg(
            start_date=("date", "min"),
            end_date=("date", "max"),
            actual_length_weeks=(target_class, "count"),
        )
        .reset_index()
    )
    return outbreak_summary

In [52]:
from IPython.display import display

for city_df in outbreak_dfs:
    outbreak_summary = detect_outbreak_periods(city_df, "outbreak")
    try:
        print(city_df["adm3_en"].iloc[0])
    except:
        print("Muntinlupa")
    print(outbreak_summary["actual_length_weeks"].mean())
    display(outbreak_summary)

Davao City
12.666666666666666


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2010-06-21,2010-09-13,13
1,3,2012-03-05,2012-10-22,34
2,5,2013-07-22,2013-08-05,3
3,7,2014-06-23,2014-09-01,11
4,9,2016-07-04,2016-08-29,9
5,11,2019-07-15,2019-08-19,6


Zamboanga City
16.666666666666668


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2010-06-28,2010-08-16,8
1,3,2012-04-02,2012-12-24,39
2,5,2014-06-23,2014-08-18,9
3,7,2017-06-19,2017-07-10,4
4,9,2019-05-06,2019-10-14,24
5,11,2022-03-14,2022-06-27,16


Muntinlupa
7.166666666666667


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2014-08-11,2014-08-25,3
1,3,2015-08-03,2015-08-17,3
2,5,2015-08-31,2015-11-30,14
3,7,2016-05-30,2016-06-13,3
4,9,2016-07-25,2016-11-28,19
5,11,2017-07-24,2017-08-07,3
6,13,2017-08-21,2017-09-04,3
7,15,2017-10-16,2017-11-06,4
8,17,2018-01-08,2018-01-22,3
9,19,2018-08-06,2018-11-05,14


Legazpi City
9.0


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2009-11-16,2009-11-30,3
1,3,2010-06-21,2010-11-29,24
2,5,2013-07-22,2013-10-14,13
3,7,2013-12-02,2013-12-16,3
4,9,2014-09-08,2014-12-22,16
5,11,2015-01-05,2015-01-19,3
6,13,2015-12-14,2016-01-18,6
7,15,2016-02-08,2016-02-22,3
8,17,2016-09-19,2016-10-31,7
9,19,2016-11-28,2016-12-12,3


Mandaue City
10.88888888888889


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2010-08-30,2010-10-11,7
1,3,2012-08-27,2012-09-10,3
2,5,2015-10-05,2015-11-02,5
3,7,2016-07-11,2016-07-25,3
4,9,2016-08-08,2017-02-13,28
5,11,2018-08-20,2019-02-18,27
6,13,2019-09-23,2019-10-21,5
7,15,2019-11-11,2020-02-03,13
8,17,2022-05-30,2022-07-11,7


Palayan City
6.92


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2008-08-04,2008-08-18,3
1,3,2010-08-09,2010-08-23,3
2,5,2011-01-03,2011-01-17,3
3,7,2011-08-22,2011-10-03,7
4,9,2012-01-16,2012-02-06,4
5,11,2012-06-18,2012-09-03,12
6,13,2012-09-24,2012-10-08,3
7,15,2013-07-15,2013-09-16,10
8,17,2014-01-13,2014-01-27,3
9,19,2015-08-03,2015-09-28,9


City of Navotas
7.894736842105263


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2008-07-07,2008-07-21,3
1,3,2008-08-04,2008-10-06,10
2,5,2008-12-01,2008-12-15,3
3,7,2009-01-05,2009-01-19,3
4,9,2009-06-01,2009-07-27,9
5,11,2009-09-21,2009-10-05,3
6,13,2010-07-19,2010-12-06,21
7,15,2011-01-03,2011-01-31,5
8,17,2011-02-28,2011-03-14,3
9,19,2011-08-08,2011-10-03,9


Iloilo City
15.0


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2010-07-05,2010-10-04,14
1,3,2012-06-04,2012-09-10,15
2,5,2013-06-03,2013-09-02,14
3,7,2016-06-27,2016-10-10,16
4,9,2019-05-20,2019-10-28,24
5,11,2022-07-04,2022-08-15,7


City of Muntinlupa
10.272727272727273


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2014-08-18,2014-09-08,4
1,3,2014-10-06,2014-11-10,6
2,5,2014-12-01,2014-12-29,5
3,7,2015-01-26,2015-03-02,6
4,9,2015-07-27,2015-12-21,22
5,11,2016-01-18,2016-02-01,3
6,13,2016-09-19,2016-10-03,3
7,15,2017-09-11,2017-10-23,7
8,17,2017-11-13,2017-12-04,4
9,19,2018-09-03,2019-01-28,22


City of Mandaluyong
8.0


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2010-07-26,2010-10-25,14
1,3,2011-08-15,2011-09-12,5
2,5,2011-10-17,2011-10-31,3
3,7,2012-01-09,2012-02-13,6
4,9,2012-07-02,2012-12-24,26
5,11,2013-08-19,2013-09-16,5
6,13,2015-08-31,2015-11-02,10
7,15,2016-08-22,2016-09-26,6
8,17,2016-10-10,2016-10-24,3
9,19,2018-10-15,2018-11-12,5


Tacloban City
8.8


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2010-01-11,2010-03-29,12
1,3,2010-05-10,2010-09-27,21
2,5,2013-02-11,2013-02-25,3
3,7,2013-07-08,2013-08-26,8
4,9,2014-01-06,2014-03-24,12
5,11,2017-01-02,2017-01-16,3
6,13,2017-07-31,2017-08-14,3
7,15,2018-12-17,2019-01-14,5
8,17,2019-07-01,2019-10-28,18
9,19,2022-08-01,2022-08-15,3


Cagayan de Oro City
10.0


Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2010-08-02,2010-08-16,3
1,3,2012-06-04,2012-10-08,19
2,5,2014-07-14,2014-11-03,17
3,7,2016-08-01,2016-10-10,11
4,9,2018-08-13,2018-10-22,11
5,11,2018-12-10,2018-12-24,3
6,13,2019-07-15,2019-10-07,13
7,15,2019-11-11,2019-11-25,3


In [47]:
dagupan_dengue_outbreaks_sum = detect_outbreak_periods(dagupan_dengue, "outbreak")
dagupan_dengue_outbreaks_sum

Unnamed: 0,outbreak_group,start_date,end_date,actual_length_weeks
0,1,2014-08-11,2014-08-25,3
1,3,2015-08-03,2015-08-17,3
2,5,2015-08-31,2015-11-30,14
3,7,2016-05-30,2016-06-13,3
4,9,2016-07-25,2016-11-28,19
5,11,2017-07-24,2017-08-07,3
6,13,2017-08-21,2017-09-04,3
7,15,2017-10-16,2017-11-06,4
8,17,2018-01-08,2018-01-22,3
9,19,2018-08-06,2018-11-05,14
