In [14]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import warnings
warnings.filterwarnings("ignore")

## ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder, PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

from tasks import Understand

# 1. Data Understanding
> Inspecting the data

In [3]:
# load data
chickenpox_df = pd.read_csv('data/hungary_chickenpox.csv')
county_edge_df = pd.read_csv('data/hungary_county_edges.csv')

# Preview chickenpox_df - 1st 5 rows
chickenpox_df.head()

Unnamed: 0,Date,BUDAPEST,BARANYA,BACS,BEKES,BORSOD,CSONGRAD,FEJER,GYOR,HAJDU,...,JASZ,KOMAROM,NOGRAD,PEST,SOMOGY,SZABOLCS,TOLNA,VAS,VESZPREM,ZALA
0,03/01/2005,168,79,30,173,169,42,136,120,162,...,130,57,2,178,66,64,11,29,87,68
1,10/01/2005,157,60,30,92,200,53,51,70,84,...,80,50,29,141,48,29,58,53,68,26
2,17/01/2005,96,44,31,86,93,30,93,84,191,...,64,46,4,157,33,33,24,18,62,44
3,24/01/2005,163,49,43,126,46,39,52,114,107,...,63,54,14,107,66,50,25,21,43,31
4,31/01/2005,122,78,53,87,103,34,95,131,172,...,61,49,11,124,63,56,7,47,85,60


In [4]:
# Preview chickenpox_df - last 5 rows
chickenpox_df.tail()

Unnamed: 0,Date,BUDAPEST,BARANYA,BACS,BEKES,BORSOD,CSONGRAD,FEJER,GYOR,HAJDU,...,JASZ,KOMAROM,NOGRAD,PEST,SOMOGY,SZABOLCS,TOLNA,VAS,VESZPREM,ZALA
517,01/12/2014,95,12,41,6,39,0,16,15,14,...,56,7,13,122,4,23,4,11,110,10
518,08/12/2014,43,39,31,10,34,3,2,30,25,...,34,20,18,70,36,5,23,22,63,9
519,15/12/2014,35,7,15,0,0,0,7,7,4,...,30,36,4,72,5,21,14,0,17,10
520,22/12/2014,30,23,8,0,11,4,1,9,10,...,27,17,21,12,5,17,1,1,83,2
521,29/12/2014,259,42,49,32,38,15,11,98,61,...,112,61,53,256,45,39,27,11,103,25


In [15]:
# An object of class understand
understand_pox = Understand(chickenpox_df)

# Inspecting the chickenpox_df
understand_pox.inspect_df()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      522 non-null    object
 1   BUDAPEST  522 non-null    int64 
 2   BARANYA   522 non-null    int64 
 3   BACS      522 non-null    int64 
 4   BEKES     522 non-null    int64 
 5   BORSOD    522 non-null    int64 
 6   CSONGRAD  522 non-null    int64 
 7   FEJER     522 non-null    int64 
 8   GYOR      522 non-null    int64 
 9   HAJDU     522 non-null    int64 
 10  HEVES     522 non-null    int64 
 11  JASZ      522 non-null    int64 
 12  KOMAROM   522 non-null    int64 
 13  NOGRAD    522 non-null    int64 
 14  PEST      522 non-null    int64 
 15  SOMOGY    522 non-null    int64 
 16  SZABOLCS  522 non-null    int64 
 17  TOLNA     522 non-null    int64 
 18  VAS       522 non-null    int64 
 19  VESZPREM  522 non-null    int64 
 20  ZALA      522 non-null    int64 
dtypes: int64(20), ob

{'Dimensions': 'This data set has 522 rows and 21 columns',
 'Duplicates': 'The data has 0 duplicated entries and 522 non duplicated entries',
 'Missing values (%)': '0.0 % of the data has missing values',
 'Summary statistics':           count        mean        std  min    25%   50%     75%    max
 BUDAPEST  522.0  101.245211  76.354872  0.0  34.25  93.0  149.00  479.0
 BARANYA   522.0   34.204981  32.567222  0.0   8.00  25.0   51.00  194.0
 BACS      522.0   37.166667  36.843095  0.0   8.00  29.5   53.00  274.0
 BEKES     522.0   28.911877  37.618092  0.0   4.00  14.0   38.75  271.0
 BORSOD    522.0   57.082375  50.725437  0.0  14.25  46.5   83.75  355.0
 CSONGRAD  522.0   31.488506  33.790208  0.0   6.00  20.5   47.00  199.0
 FEJER     522.0   33.272031  31.397989  0.0   7.00  24.0   51.75  164.0
 GYOR      522.0   41.436782  36.014297  0.0   9.00  35.0   63.00  181.0
 HAJDU     522.0   47.097701  44.610836  0.0  11.00  37.0   68.00  262.0
 HEVES     522.0   29.691571  31.857750  0

In [8]:
# Preview county edges
county_edge_df.head()

Unnamed: 0,name_1,name_2,id_1,id_2
0,BACS,JASZ,0,10
1,BACS,BACS,0,0
2,BACS,BARANYA,0,1
3,BACS,CSONGRAD,0,5
4,BACS,PEST,0,13


In [9]:
# Last 5 rows
county_edge_df.tail()

Unnamed: 0,name_1,name_2,id_1,id_2
97,VESZPREM,SOMOGY,18,14
98,ZALA,VAS,19,17
99,ZALA,ZALA,19,19
100,ZALA,SOMOGY,19,14
101,ZALA,VESZPREM,19,18


In [16]:
# object of class Understand
understand_county = Understand(county_edge_df)

# Inspect the data frame
understand_county.inspect_df()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name_1  102 non-null    object
 1   name_2  102 non-null    object
 2   id_1    102 non-null    int64 
 3   id_2    102 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 3.3+ KB


{'Dimensions': 'This data set has 102 rows and 4 columns',
 'Duplicates': 'The data has 0 duplicated entries and 102 non duplicated entries',
 'Missing values (%)': '0.0 % of the data has missing values',
 'Summary statistics':       count      mean       std  min  25%   50%   75%   max
 id_1  102.0  9.598039  5.728615  0.0  5.0  10.0  14.0  19.0
 id_2  102.0  9.598039  5.728615  0.0  5.0  10.0  14.0  19.0,
 'Info (printed above)': None}

#### Observations:
* There are no missing values in this data
* **chickenpox_df** contains continuous data while **county_edge_df** contains discrete data.
* The data is a time series arranged weekly.
* The minimum value of case counts for each county is 0 and the maximum value varies
* The counties with the highest mean count during the duration between 2005 and 2015 is Budapest and Pest - this may be attributed to high population.
* All the columns in **chickenpox_df** with the exception of the `date` column represent counties in Hungary.
* County codes are consistent with the official Hungarian statistics (https://www.ksh.hu/?lang=en).


# Data Preparation
> Since most of the aspects of the data such as validity, uniformity, consistency and completeness look okay, we shall mostly be doing feature engineering.

In [17]:
# Looping through each column in chickenpox_df to show the distribution of values in each of the columns
for col in chickenpox_df.columns:
    print(f"Column: {col}")
    print(f"Data type: {chickenpox_df[col].dtype}")
    print(f"Number of unique values: {chickenpox_df[col].nunique()}")
    print("--------")

Column: Date
Data type: object
Number of unique values: 522
--------
Column: BUDAPEST
Data type: int64
Number of unique values: 217
--------
Column: BARANYA
Data type: int64
Number of unique values: 114
--------
Column: BACS
Data type: int64
Number of unique values: 120
--------
Column: BEKES
Data type: int64
Number of unique values: 110
--------
Column: BORSOD
Data type: int64
Number of unique values: 160
--------
Column: CSONGRAD
Data type: int64
Number of unique values: 113
--------
Column: FEJER
Data type: int64
Number of unique values: 110
--------
Column: GYOR
Data type: int64
Number of unique values: 123
--------
Column: HAJDU
Data type: int64
Number of unique values: 140
--------
Column: HEVES
Data type: int64
Number of unique values: 105
--------
Column: JASZ
Data type: int64
Number of unique values: 126
--------
Column: KOMAROM
Data type: int64
Number of unique values: 86
--------
Column: NOGRAD
Data type: int64
Number of unique values: 82
--------
Column: PEST
Data type: int

In [18]:
# Looping through each column in county_edge_df to show the distribution of values in each of the columns
for col in county_edge_df.columns:
    print(f"Column: {col}")
    print(f"Data type: {county_edge_df[col].dtype}")
    print(f"Number of unique values: {county_edge_df[col].nunique()}")
    print("--------")

Column: name_1
Data type: object
Number of unique values: 20
--------
Column: name_2
Data type: object
Number of unique values: 20
--------
Column: id_1
Data type: int64
Number of unique values: 20
--------
Column: id_2
Data type: int64
Number of unique values: 20
--------


In [19]:
county_edge_df

Unnamed: 0,name_1,name_2,id_1,id_2
0,BACS,JASZ,0,10
1,BACS,BACS,0,0
2,BACS,BARANYA,0,1
3,BACS,CSONGRAD,0,5
4,BACS,PEST,0,13
...,...,...,...,...
97,VESZPREM,SOMOGY,18,14
98,ZALA,VAS,19,17
99,ZALA,ZALA,19,19
100,ZALA,SOMOGY,19,14
