# Granularity
Standardise the datasets so that they all share the same granularity:  

 * Weekly from Week #17 2012
 * Locale : Singapore

Import modules:-

In [None]:
import pandas as pd
import numpy as np
import os
import glob
import re
from datetime import date

### Set directory paths:-

In [None]:
# Locations of the raw inputs and of the two output stages.
raw_path = "../../data/1_raw/"
out_path = "../../data/2_interim/"
clean_path = "../../data/3_clean/"

# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of testing os.path.exists() first.
for output_dir in (out_path, clean_path):
    os.makedirs(output_dir, exist_ok=True)

### Compile weather data into Weekly, Singapore average format

In [None]:
# Collect the raw weather CSV paths. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make every run reproducible.
weather_files = sorted(glob.glob(raw_path+'weather/*.csv'))
len(weather_files)

#### Create DataFrame from file:

In [None]:
# Read every weather CSV and stack the rows into a single frame.
# DataFrame.append no longer exists in pandas 2.0, and popping from
# weather_files shrank the list on each run, so re-running this cell
# silently dropped a file. pd.concat over the untouched list avoids both.
if not weather_files:
    raise FileNotFoundError("No weather CSV files found under " + raw_path + "weather/")
# Rows follow the order of weather_files; the per-file row index is kept.
dfWeather = pd.concat([pd.read_csv(f, encoding='latin_1') for f in weather_files])

In [None]:
# Summary statistics of the freshly loaded weather frame.
dfWeather.describe()

In [None]:
# Preview the first rows of the loaded weather frame.
dfWeather.head()

#### Remove unnecessary columns:

In [None]:
# List the raw column labels before dropping any of them.
dfWeather.columns

In [None]:
# Discard the 'Station' column and the three highest-N-minute rainfall columns.
unused_columns = [
    'Station',
    'Highest 30 Min Rainfall (mm)',
    'Highest 60 Min Rainfall (mm)',
    'Highest 120 Min Rainfall (mm)',
]
dfWeather = dfWeather.drop(columns=unused_columns)

#### Rename column headers to usable labels, i.e. plain ASCII characters only.

In [None]:
# Column labels that remain after the drop above.
dfWeather.columns

In [None]:
# Replace the headers positionally with plain-ASCII labels; the order of the
# nine names must match the order of the columns currently in the frame.
weather_labels = [
    'Year',
    'Month',
    'Day',
    'Daily Rainfall Total',
    'Mean Temperature',
    'Maximum Temperature',
    'Minimum Temperature',
    'Mean Wind Speed',
    'Max Wind Speed',
]
dfWeather.columns = weather_labels

In [None]:
# Confirm the new column labels.
dfWeather.columns

#### Deal with missing / invalid data

In [None]:
# NOTE(review): filling missing values with NaN looks like a no-op —
# confirm whether this cell is still needed.
dfWeather = dfWeather.fillna(np.nan)

In [None]:
# Convert every column to a numeric dtype; entries that cannot be parsed
# become NaN (errors='coerce').
dfWeather = dfWeather.apply(pd.to_numeric, errors='coerce')

In [None]:
# Summary statistics after the numeric conversion.
dfWeather.describe()

#### Deal with improbable data:

In [None]:
# A minimum temperature of exactly 0 is treated as a missing reading.
min_temperature = dfWeather['Minimum Temperature']
dfWeather['Minimum Temperature'] = min_temperature.replace(0, np.nan)

In [None]:
# Summary statistics after blanking out the zero minimum temperatures.
dfWeather.describe()

#### Get week number:

In [None]:
def get_weeknum(row):
    """Return the ISO year/week label (e.g. ``2012-W17``) for one row.

    ``row`` must provide ``'Year'``, ``'Month'`` and ``'Day'`` entries that
    ``int()`` accepts. The year in the label is the ISO calendar year, which
    can differ from the calendar year for dates close to 1 January.
    """
    day = date(int(row['Year']), int(row['Month']), int(row['Day']))
    iso_year, iso_week, _ = day.isocalendar()
    return '{}-W{:02d}'.format(iso_year, iso_week)

In [None]:
# Label every daily record with its ISO year-week string.
dfWeather['Y-Week'] = dfWeather.apply(get_weeknum, axis=1)

In [None]:
# Preview the frame with the new 'Y-Week' column.
dfWeather.head()

#### Recalculate data by week number:

In [None]:
# Weekly aggregate to compute for each weather column. Each aggregate is
# wrapped in a one-element list, and the pair order sets the dict key order.
group_dict = {
    column: [how]
    for column, how in [
        ('Minimum Temperature', 'min'),
        ('Daily Rainfall Total', 'mean'),
        ('Mean Temperature', 'mean'),
        ('Mean Wind Speed', 'mean'),
        ('Maximum Temperature', 'max'),
        ('Max Wind Speed', 'max'),
    ]
}

In [None]:
# Collapse the daily records into one row per ISO week.
weekly_groups = dfWeather.groupby('Y-Week')
groupedWeather = weekly_groups.agg(group_dict)


In [None]:
# Preview the weekly aggregates.
groupedWeather.head()

In [None]:
# Inspect the column labels produced by the aggregation.
groupedWeather.columns

#### Rename columns:

In [None]:
# Flatten the (column, aggregate) labels down to the column name alone.
# Only do so while the columns are still a MultiIndex: on a re-run the labels
# are already plain strings and col[0] would cut each one to its first letter.
if isinstance(groupedWeather.columns, pd.MultiIndex):
    groupedWeather.columns = [col[0] for col in groupedWeather.columns]

In [None]:
# Confirm the flattened column labels.
groupedWeather.columns

In [None]:
# Preview the weekly frame with flattened column labels.
groupedWeather.head()

In [None]:
# rename() returns a new frame rather than modifying groupedWeather, so the
# result has to be assigned back; otherwise the weekly mean keeps the
# 'Daily Rainfall Total' label in every file written from this frame.
groupedWeather = groupedWeather.rename(index=str, columns={'Daily Rainfall Total':'Mean Daily Rainfall Total'})

#### Save to file:

In [None]:
# Write the weekly weather table to the interim directory.
weather_csv = out_path + 'weather.csv'
groupedWeather.to_csv(weather_csv)

# Dengue and Malaria data

In [None]:
# Load the weekly disease bulletin, using the first CSV column as the index.
disease_csv = raw_path + "weekly-infectious-disease-bulletin-cases.csv"
dfDisease = pd.read_csv(disease_csv, index_col=0)

In [None]:
# Keep only the malaria and the two dengue disease categories.
diseases_of_interest = ['Malaria', 'Dengue Fever', 'Dengue Haemorrhagic Fever']
dfDisease = dfDisease[dfDisease['disease'].isin(diseases_of_interest)]

In [None]:
# Preview the filtered disease records.
dfDisease.head()

### Pivoting

In [None]:
# Reshape from long to wide: one column per disease holding its case count,
# with the existing index kept as the row labels.
dfDisease = dfDisease.pivot(values='no._of_cases', columns='disease')

In [None]:
# Total dengue cases = dengue fever + dengue haemorrhagic fever.
dengue_fever = dfDisease['Dengue Fever']
dengue_haemorrhagic = dfDisease['Dengue Haemorrhagic Fever']
dfDisease['Dengue'] = dengue_fever.add(dengue_haemorrhagic)

In [None]:
# The two component columns are no longer needed once 'Dengue' holds their sum.
dengue_parts = ['Dengue Fever', 'Dengue Haemorrhagic Fever']
dfDisease = dfDisease.drop(columns=dengue_parts)

#### Save to file

In [None]:
# Write the per-week disease table to the interim directory.
disease_out_csv = out_path + 'disease.csv'
dfDisease.to_csv(disease_out_csv)

# Combine Data

In [None]:
# Place the weather and disease tables side by side, aligned on their index.
dfData = pd.concat((groupedWeather, dfDisease), axis='columns')

In [None]:
# Preview the first rows of the combined table.
dfData.head()

In [None]:
# Show the last five rows of the combined table.
dfData.tail(5)

#### Remove rows that have no disease data (neither Malaria nor Dengue):

In [None]:
# Keep only the rows where at least one of the disease counts is present.
has_disease_data = dfData[['Malaria', 'Dengue']].notna().any(axis=1)
dfData = dfData[has_disease_data]

In [None]:
# Summary statistics of the combined table after dropping empty rows.
dfData.describe()

In [None]:
# Preview the combined table after dropping empty rows.
dfData.head()

### Reformat indexing

In [None]:
# Split each index label around '-W' into a pair of integers.
YearWeek = [list(map(int, label.split('-W'))) for label in dfData.index]


In [None]:
# Two-column frame of the parsed year and week, indexed like dfData.
dfYearWeek = pd.DataFrame(YearWeek, index=dfData.index, columns=['Year', 'Week'])

In [None]:
# Append the 'Year' and 'Week' columns to the combined table.
dfDataYW = pd.concat((dfData, dfYearWeek), axis='columns')

In [None]:
# Replace the string index with a two-level (Year, Week) index.
year_week_keys = ['Year', 'Week']
dfDataYW = dfDataYW.set_index(year_week_keys)

In [None]:
# Preview the table with its (Year, Week) index.
dfDataYW.head()

#### Save to file

In [None]:
# Write the final combined table to the clean-data directory.
clean_csv = clean_path + 'data.csv'
dfDataYW.to_csv(clean_csv)