# 1. Data Cleaning

### Webscraping

In [1]:
# import modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import io
import os
import time
import datetime
import warnings
warnings.filterwarnings("ignore")

The weather station locations and IDs from weather.gov.sg were retrieved manually in the interest of efficiency, however latitudes/longitudes were scraped from data.gov.sg API for sanity check to ensure station ID exists and the station locations from both websites are approximately similar. For instance, Admiralty in weather.gov.sg corresponds to Woodlands Avenue 9 in data.gov.sg, with both weather stations using the ID S104.

In [2]:
# manual extraction of station location and ID from weather.gov.sg

weather_gov_sg_stations={'Admiralty':'S104','Ang Mo Kio':'S109','Changi':'S24','Choa Chu Kang (South)':'S121','Clementi':'S50','East Coast Parkway':'S107',
                         'Jurong (West)':'S44','Jurong Island':'S117','Khatib':'S122','Marina Barrage':'S108','Newton':'S111','Pasir Panjang':'S116',
                         'Sentosa Island':'S60','Tai Seng':'S43','Tuas South':'S115','Admiralty West':'S105','Boon Lay (East)':'S86','Boon Lay (West)':'S63',
                         'Botanic Garden':'S120','Buangkok':'S55','Bukit Panjang':'S64','Bukit Timah':'S90','Buona Vista':'S92','Chai Chee':'S61',
                         'Choa Chu Kang (Central)':'S114','Choa Chu Kang (West)':'S11','Dhoby Ghaut':'S118','Jurong (East)':'S39','Jurong (North)':'S101',
                         'Jurong Pier':'S33','Kampong Bahru':'S31','Kent Ridge':'S71','Kranji Reservoir':'S66', 'Lim Chu Kang':'S112','Lower Peirce Reservoir':'S08',
                         'Macritchie Reservoir':'S07','Mandai':'S40','Marine Parade':'S113','Nicoll Highway':'S119','Pasir Ris (Central)':'S94',
                         'Pasir Ris (West)':'S29','Paya Lebar':'S06','Pulau Ubin':'S106','Punggol':'S81','Queenstown':'S77','Seletar':'S25','Semakau Island':'S102',
                         'Sembawang':'S80','Serangoon':'S36','Serangoon North':'S110','Simei':'S84','Somerset (Road)':'S79','Tanjong Katong':'S78',
                         'Tanjong Pagar':'S72','Tengah':'S23','Toa Payoh':'S88','Tuas':'S89','Tuas West':'S82','Ulu Pandan':'S35','Upper Peirce Reservoir':'S69',
                         'Upper Thomson':'S46','Whampoa':'S123','Yishun':'S91'}

In [3]:
# webscrape climate data from weather.gov.sg

year=[str(x) for x in range(2014,2019)]
month=[str(x) if x>=10 else '0'+str(x) for x in range(1,13)]
weather_gov_sg_url=[]
for i in weather_gov_sg_stations.values():
        for j in year:
            for k in month:
                weather_gov_sg_url.append('http://www.weather.gov.sg/files/dailydata/DAILYDATA_'+i+'_'+j+k+'.csv')
                
weather_gov_sg_url[0:5]+weather_gov_sg_url[-5:]

['http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201401.csv',
 'http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201402.csv',
 'http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201403.csv',
 'http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201404.csv',
 'http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201405.csv',
 'http://www.weather.gov.sg/files/dailydata/DAILYDATA_S91_201808.csv',
 'http://www.weather.gov.sg/files/dailydata/DAILYDATA_S91_201809.csv',
 'http://www.weather.gov.sg/files/dailydata/DAILYDATA_S91_201810.csv',
 'http://www.weather.gov.sg/files/dailydata/DAILYDATA_S91_201811.csv',
 'http://www.weather.gov.sg/files/dailydata/DAILYDATA_S91_201812.csv']

In [4]:
# webscraping of latitude/longitude from data.gov.sg

res=requests.get('https://api.data.gov.sg/v1/environment/rainfall')
api_data=res.json()
api_data['metadata']['stations'][:5]

[{'id': 'S77',
  'device_id': 'S77',
  'name': 'Alexandra Road',
  'location': {'latitude': 1.2937, 'longitude': 103.8125}},
 {'id': 'S109',
  'device_id': 'S109',
  'name': 'Ang Mo Kio Avenue 5',
  'location': {'latitude': 1.3764, 'longitude': 103.8492}},
 {'id': 'S117',
  'device_id': 'S117',
  'name': 'Banyan Road',
  'location': {'latitude': 1.256, 'longitude': 103.679}},
 {'id': 'S64',
  'device_id': 'S64',
  'name': 'Bukit Panjang Road',
  'location': {'latitude': 1.3824, 'longitude': 103.7603}},
 {'id': 'S90',
  'device_id': 'S90',
  'name': 'Bukit Timah Road',
  'location': {'latitude': 1.3191, 'longitude': 103.8191}}]

In [5]:
# converting csv data to dataframe

weather_data=[]
for i in weather_gov_sg_url:
    res=requests.get(i)
    if res.status_code==200:
        result=pd.read_csv(io.StringIO(res.text))
        weather_data.append(result)
        
weather_df=pd.concat([x for x in weather_data],axis=0).reset_index(drop=True)
weather_df

Unnamed: 0,Station,Year,Month,Day,Daily Rainfall Total (mm),Highest 30 Min Rainfall (mm),Highest 60 Min Rainfall (mm),Highest 120 Min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h)
0,Admiralty,2014,1,1,0.0,0.0,0.0,0.0,26.3,28.8,24.3,10.8,34.2
1,Admiralty,2014,1,2,0.0,0.0,0.0,0.0,26.9,30.7,24.6,11.5,38.2
2,Admiralty,2014,1,3,4.6,4.4,4.4,4.6,26.5,29.5,24.9,9.7,34.2
3,Admiralty,2014,1,4,3.8,3.8,3.8,3.8,26.4,31.2,24.7,8.1,32.8
4,Admiralty,2014,1,5,3.6,1.2,1.6,2.0,24.6,25.3,23.6,8.4,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
107571,Yishun,2018,12,27,—,—,—,—,—,—,—,—,—
107572,Yishun,2018,12,28,—,—,—,—,—,—,—,—,—
107573,Yishun,2018,12,29,—,—,—,—,—,—,—,—,—
107574,Yishun,2018,12,30,—,—,—,—,—,—,—,—,—


In [6]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107576 entries, 0 to 107575
Data columns (total 13 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   Station                        107576 non-null  object
 1   Year                           107576 non-null  int64 
 2   Month                          107576 non-null  int64 
 3   Day                            107576 non-null  int64 
 4   Daily Rainfall Total (mm)      107576 non-null  object
 5   Highest 30 Min Rainfall (mm)   107576 non-null  object
 6   Highest 60 Min Rainfall (mm)   107576 non-null  object
 7   Highest 120 Min Rainfall (mm)  107576 non-null  object
 8   Mean Temperature (°C)          107576 non-null  object
 9   Maximum Temperature (°C)       107447 non-null  object
 10  Minimum Temperature (°C)       107435 non-null  object
 11  Mean Wind Speed (km/h)         107576 non-null  object
 12  Max Wind Speed (km/h)          107409 non-nu

In [7]:
# create new date column with datetime format

weather_df['date'] = pd.to_datetime(dict(year=weather_df.Year, month=weather_df.Month, day=weather_df.Day))

In [8]:
# create reference week column, using Ang Mo Kio as example since there is no missing daily weather data from this location

reference_date = weather_df[weather_df['Station']=='Ang Mo Kio'][['date','Year','Month','Day']]
reference_date['Week'] = None
reference_date=reference_date.reset_index(drop=True)

# manual week classification done from 1 Jan 2014 to 31 Dec 2018, where each week starts on Sunday and ends on Saturday

#year = 2014
x=4
y=1
while x+7<365:
    reference_date.loc[x:x+6,'Week']=y
    x+=7
    y+=1
reference_date.loc[reference_date[(reference_date['Year']==2014) & (reference_date['Week'].isna())].index,'Week']=52  


#year=2015
reference_date.loc[365:367,'Week']=52
x=368
y=1
while x+7<730:
    reference_date.loc[x:x+6,'Week']=y
    x+=7
    y+=1
reference_date.loc[reference_date[(reference_date['Year']==2015) & (reference_date['Week'].isna())].index,'Week']=52


#year=2016
reference_date.loc[730:731,'Week']=52
x=732
y=1
while x+7<1096:
    reference_date.loc[x:x+6,'Week']=y
    x+=7
    y+=1
reference_date.loc[reference_date[(reference_date['Year']==2016) & (reference_date['Week'].isna())].index,'Week']=52


#year=2017
x=1096
y=1
while x+7<1461:
    reference_date.loc[x:x+6,'Week']=y
    x+=7
    y+=1
reference_date.loc[reference_date[(reference_date['Year']==2017) & (reference_date['Week'].isna())].index,'Week']=1


#year=2018
reference_date.loc[1461:1466,'Week']=1
x=1467
y=2
while x+7<1826:
    reference_date.loc[x:x+6,'Week']=y
    x+=7
    y+=1


# filter out selected dates from subsequent analysis as they do not fall within the 1st week of 2014 and last week of 2018 
reference_date.loc[reference_date[reference_date['date']=='2014-01-01'].index,'Week']=None
reference_date.loc[reference_date[reference_date['date']=='2014-01-02'].index,'Week']=None                  
reference_date.loc[reference_date[reference_date['date']=='2014-01-03'].index,'Week']=None
reference_date.loc[reference_date[reference_date['date']=='2014-01-04'].index,'Week']=None
reference_date.loc[reference_date[reference_date['date']=='2018-12-30'].index,'Week']=None
reference_date.loc[reference_date[reference_date['date']=='2018-12-31'].index,'Week']=None

reference_date

Unnamed: 0,date,Year,Month,Day,Week
0,2014-01-01,2014,1,1,
1,2014-01-02,2014,1,2,
2,2014-01-03,2014,1,3,
3,2014-01-04,2014,1,4,
4,2014-01-05,2014,1,5,1
...,...,...,...,...,...
1821,2018-12-27,2018,12,27,52
1822,2018-12-28,2018,12,28,52
1823,2018-12-29,2018,12,29,52
1824,2018-12-30,2018,12,30,


In [9]:
# add week column to weather dataset

weather_df=weather_df.merge(reference_date,how='inner',on='date')

# drop extra columns and rename existing columns

weather_df=weather_df.drop(columns=['Year_y','Month_y','Day_y']).rename(columns={'Year_x':'Year','Month_x':'Month','Week_x':'Week','Day_x':'Day',
                                                                                 'Daily Rainfall Total (mm)':'Weekly Rainfall (mm)',
                                                                                 'Mean Temperature (°C)':'Weekly Mean Temperature (°C)',
                                                                                 'Maximum Temperature (°C)':'Weekly Maximum Temperature (°C)',
                                                                                 'Minimum Temperature (°C)':'Weekly Minimum Temperature (°C)',
                                                                                 'Mean Wind Speed (km/h)':'Weekly Mean Wind Speed (km/h)',
                                                                                 'Max Wind Speed (km/h)':'Weekly Maximum Wind Speed (km/h)'})
# replace '—' in weather dataset with nulls

weather_df=weather_df.replace('—',np.nan)

# convert numeric data to float

for i in ['Weekly Rainfall (mm)','Weekly Mean Temperature (°C)','Weekly Maximum Temperature (°C)',
          'Weekly Minimum Temperature (°C)','Weekly Mean Wind Speed (km/h)','Weekly Maximum Wind Speed (km/h)','Week']:
    weather_df[i]=weather_df[i].astype(float)

# filter out selected dates from subsequent analysis as they do not fall within the 1st week of 2014 and last week of 2018 

weather_df=weather_df.drop(index=weather_df[(weather_df['date']=='2018-12-30') | (weather_df['date']=='2018-12-31') | (weather_df['date']=='2014-01-01') |
                                            (weather_df['date']=='2014-01-02') | (weather_df['date']=='2014-01-03') | (weather_df['date']=='2014-01-04')].index)


# aggregation on weekly weather dataset

df_1=weather_df.groupby(['Station','Year','Week'])[['Weekly Rainfall (mm)']].sum().reset_index().drop(columns=['Station', 'Year', 'Week'])
df_2=weather_df.groupby(['Station','Year','Week'])[['Weekly Mean Temperature (°C)']].mean().reset_index().drop(columns=['Station', 'Year', 'Week'])
df_3=weather_df.groupby(['Station','Year','Week'])[['Weekly Maximum Temperature (°C)']].max().reset_index().drop(columns=['Station', 'Year', 'Week'])
df_4=weather_df.groupby(['Station','Year','Week'])[['Weekly Minimum Temperature (°C)']].min().reset_index().drop(columns=['Station', 'Year', 'Week'])
df_5=weather_df.groupby(['Station','Year','Week'])[['Weekly Mean Wind Speed (km/h)']].mean().reset_index().drop(columns=['Station', 'Year', 'Week'])
df_6=weather_df.groupby(['Station','Year','Week'])[['Weekly Maximum Wind Speed (km/h)']].max().reset_index().drop(columns=['Station', 'Year', 'Week'])

In [10]:
# merge all aggregated weather metrics together, result is in weekly format

df0=weather_df[['Station','Year','Week']].sort_values(by=['Station','Year','Week'],ascending=True).drop_duplicates().reset_index(drop=True)
df0['Year']=df0['Year'].astype(int)
df0['Week']=df0['Week'].astype(int)
weather_df=pd.concat([df0,df_1,df_2,df_3,df_4,df_5,df_6],axis=1)
weather_df

Unnamed: 0,Station,Year,Week,Weekly Rainfall (mm),Weekly Mean Temperature (°C),Weekly Maximum Temperature (°C),Weekly Minimum Temperature (°C),Weekly Mean Wind Speed (km/h),Weekly Maximum Wind Speed (km/h)
0,Admiralty,2014,1,90.0,25.971429,31.9,23.1,8.442857,37.4
1,Admiralty,2014,2,0.4,25.800000,29.6,23.5,13.542857,44.3
2,Admiralty,2014,3,0.0,25.328571,29.2,23.0,13.271429,41.8
3,Admiralty,2014,4,0.0,25.642857,32.4,21.8,13.028571,53.6
4,Admiralty,2014,5,15.8,26.028571,32.0,22.4,10.457143,35.6
...,...,...,...,...,...,...,...,...,...
15349,Yishun,2018,48,49.4,,,,,
15350,Yishun,2018,49,32.6,,,,,
15351,Yishun,2018,50,0.8,,,,,
15352,Yishun,2018,51,0.6,,,,,


We will be creating 2 weather datasets to model dengue cases in Singapore from 2014-2018. Both datasets will contain historical weather data, however the former will include latitudes and longitudes to examine the significance of location on dengue cases in Singapore, whereas the second dataset will use weather data and google trendwords to predict the number of cases.

---

### Dengue cases dataset for geospatial analysis

#### Weather dataset

In [11]:
# create latitude and longitude columns

weather_df['lat']=None
weather_df['long']=None

# input latitude and longitude and convert to float

for i in range(len(api_data['metadata']['stations'])):
    if api_data['metadata']['stations'][i]['device_id'] in weather_gov_sg_stations.values():
        key = [k for k, v in weather_gov_sg_stations.items() if v == api_data['metadata']['stations'][i]['device_id']]
        weather_df.loc[weather_df[weather_df['Station']==key[0]].index,'lat']=api_data['metadata']['stations'][i]['location']['latitude']
        weather_df.loc[weather_df[weather_df['Station']==key[0]].index,'long']=api_data['metadata']['stations'][i]['location']['longitude']
    else:
        pass

weather_df['lat']=weather_df['lat'].astype(float)
weather_df['long']=weather_df['long'].astype(float)

weather_df

Unnamed: 0,Station,Year,Week,Weekly Rainfall (mm),Weekly Mean Temperature (°C),Weekly Maximum Temperature (°C),Weekly Minimum Temperature (°C),Weekly Mean Wind Speed (km/h),Weekly Maximum Wind Speed (km/h),lat,long
0,Admiralty,2014,1,90.0,25.971429,31.9,23.1,8.442857,37.4,1.44387,103.78538
1,Admiralty,2014,2,0.4,25.800000,29.6,23.5,13.542857,44.3,1.44387,103.78538
2,Admiralty,2014,3,0.0,25.328571,29.2,23.0,13.271429,41.8,1.44387,103.78538
3,Admiralty,2014,4,0.0,25.642857,32.4,21.8,13.028571,53.6,1.44387,103.78538
4,Admiralty,2014,5,15.8,26.028571,32.0,22.4,10.457143,35.6,1.44387,103.78538
...,...,...,...,...,...,...,...,...,...,...,...
15349,Yishun,2018,48,49.4,,,,,,,
15350,Yishun,2018,49,32.6,,,,,,,
15351,Yishun,2018,50,0.8,,,,,,,
15352,Yishun,2018,51,0.6,,,,,,,


In [12]:
cases_df=pd.read_csv('../dataset/2014-2018 dengue cases.csv')
cases_df

Unnamed: 0,year,eweek,type_dengue,number
0,2014,1,Dengue,436.0
1,2014,1,DHF,1.0
2,2014,2,Dengue,479.0
3,2014,2,DHF,0.0
4,2014,3,Dengue,401.0
...,...,...,...,...
525,2018,51,DHF,1.0
526,2018,52,Dengue,160.0
527,2018,52,DHF,0.0
528,2018,53,Dengue,


In [13]:
cases_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530 entries, 0 to 529
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   year         530 non-null    int64  
 1   eweek        530 non-null    int64  
 2   type_dengue  530 non-null    object 
 3   number       522 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 16.7+ KB


In [14]:
# drop nulls
cases_df=cases_df.drop(index=cases_df[cases_df.isnull().any(axis=1)].index).reset_index(drop=True)

#rename column
cases_df.rename(columns={'year':'Year','eweek':'Week'},inplace=True)

# dengue cases
cases_df=cases_df.groupby(['Year','Week']).sum('number')
cases_df.reset_index(inplace=True)

# year and week
df=cases_df[['Year','Week']].drop_duplicates().reset_index(drop=True)

# merge
cases_df=pd.concat([df,cases_df['number']],axis=1)
cases_df=cases_df.astype(int)
cases_df.rename(columns={'number':'cases'},inplace=True)

# drop 2014-53 data
cases_df.drop(index=cases_df[(cases_df['Year']==2014) & (cases_df['Week']==53)].index,inplace=True)
cases_df.reset_index(drop=True).sort_values(by=['Year','Week'],ascending=True)

Unnamed: 0,Year,Week,cases
0,2014,1,437
1,2014,2,479
2,2014,3,401
3,2014,4,336
4,2014,5,234
...,...,...,...
255,2018,48,109
256,2018,49,114
257,2018,50,108
258,2018,51,128


In [15]:
# merge weather dataset with dengue cases dataset

weather_df=weather_df.merge(cases_df, on=['Year','Week'], how='inner')
weather_df

Unnamed: 0,Station,Year,Week,Weekly Rainfall (mm),Weekly Mean Temperature (°C),Weekly Maximum Temperature (°C),Weekly Minimum Temperature (°C),Weekly Mean Wind Speed (km/h),Weekly Maximum Wind Speed (km/h),lat,long,cases
0,Admiralty,2014,1,90.0,25.971429,31.9,23.1,8.442857,37.4,1.44387,103.78538,437
1,Admiralty West,2014,1,87.4,,,,,,,,437
2,Ang Mo Kio,2014,1,55.6,25.971429,31.8,23.3,5.257143,31.0,1.37640,103.84920,437
3,Boon Lay (East),2014,1,76.4,26.714286,32.3,23.4,7.200000,41.8,,,437
4,Boon Lay (West),2014,1,74.7,,,,,,,,437
...,...,...,...,...,...,...,...,...,...,...,...,...
15349,Ulu Pandan,2014,38,0.0,,,,,,1.33290,103.75560,397
15350,Upper Peirce Reservoir,2014,38,55.6,,,,,,1.37000,103.80500,397
15351,Upper Thomson,2014,38,25.4,,,,,,,,397
15352,Whampoa,2014,38,17.4,,,,,,1.32140,103.85770,397


In [16]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15354 entries, 0 to 15353
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Station                           15354 non-null  object 
 1   Year                              15354 non-null  int64  
 2   Week                              15354 non-null  int64  
 3   Weekly Rainfall (mm)              15354 non-null  float64
 4   Weekly Mean Temperature (°C)      4957 non-null   float64
 5   Weekly Maximum Temperature (°C)   5290 non-null   float64
 6   Weekly Minimum Temperature (°C)   5290 non-null   float64
 7   Weekly Mean Wind Speed (km/h)     5392 non-null   float64
 8   Weekly Maximum Wind Speed (km/h)  5368 non-null   float64
 9   lat                               10289 non-null  float64
 10  long                              10289 non-null  float64
 11  cases                             15354 non-null  int64  
dtypes: f

In [17]:
weather_df.duplicated().sum()

0

In [18]:
weather_df.to_csv('../dataset/all_stations_weather_daily_formatted.csv', index=False)

---
#### Cluster cases dataset

In [19]:
path='../dataset/clusters_csv/'
items=os.listdir('../dataset/clusters_csv/')

In [20]:
all_filenames = [path+i for i in items]

In [21]:
all_filenames[0:5]

['../dataset/clusters_csv/170320-clusters.csv',
 '../dataset/clusters_csv/150925-clusters.csv',
 '../dataset/clusters_csv/150922-clusters.csv',
 '../dataset/clusters_csv/170403-clusters.csv',
 '../dataset/clusters_csv/160704-clusters.csv']

In [22]:
#Column names to be added
column_names=["Number Of Cases","Street Address","Latitude","Longitude",
              "Cluster Number","Recent Cases In Cluster","Total Cases In Cluster","Date","Month Number"]

In [23]:
#Read one csv file and check the data
file_1 = pd.read_csv("../dataset/clusters_csv/150703-clusters.csv",header=None,names=column_names)
file_1

Unnamed: 0,Number Of Cases,Street Address,Latitude,Longitude,Cluster Number,Recent Cases In Cluster,Total Cases In Cluster,Date,Month Number
0,4,bishan street 22 (block 232),1.358286,103.845226,1,24,83,150703,7
1,7,bishan street 22 (block 233),1.358639,103.845259,1,24,83,150703,7
2,3,bishan street 22 (block 234),1.358390,103.845955,1,24,83,150703,7
3,1,bishan street 22 (block 235),1.358719,103.846477,1,24,83,150703,7
4,4,bishan street 22 (block 236),1.359041,103.846849,1,24,83,150703,7
...,...,...,...,...,...,...,...,...,...
130,1,woodlands drive 14 (block 515),1.434931,103.790309,40,2,2,150703,7
131,1,yishun avenue 2 (block 791),1.420556,103.833469,41,2,2,150703,7
132,1,yishun ring road (block 796),1.419719,103.833325,41,2,2,150703,7
133,1,yishun avenue 4 (block 653),1.423180,103.839630,42,2,2,150703,7


In [24]:
#Combine all csv files
combined_csv = pd.concat([pd.read_csv(f,header=None,names=column_names) for f in all_filenames],ignore_index=True)

In [25]:
#Output the combined csv file
combined_csv.to_csv( "../dataset/all_clusters.csv", index=False, encoding='utf-8-sig')

In [26]:
combined_csv

Unnamed: 0,Number Of Cases,Street Address,Latitude,Longitude,Cluster Number,Recent Cases In Cluster,Total Cases In Cluster,Date,Month Number
0,1,pasir ris street 71 (block 747),1.379194,103.934693,1,1,2,170320,3
1,1,pasir ris street 72 (whitewater),1.380042,103.935474,1,1,2,170320,3
2,2,bunga rampai place,1.338931,103.883537,1,1,32,150925,9
3,9,joo seng road (block 8),1.335219,103.878805,1,1,32,150925,9
4,3,mount vernon road,1.340377,103.879490,1,1,32,150925,9
...,...,...,...,...,...,...,...,...,...
20253,1,chai chee street (block 43),1.328380,103.925500,3,2,3,170515,5
20254,2,chai chee street (block 45),1.328806,103.924377,3,2,3,170515,5
20255,2,kang ching road (block 339d),1.338952,103.722240,4,2,2,170515,5
20256,1,lorong 4 toa payoh (block 60),1.336470,103.850664,5,2,2,170515,5


---

### Dengue cases dataset for regression modelling

In [27]:
# import trendwords

dengue_fever=pd.read_csv('../dataset/dengue_fever_2014-2018.csv')
headache=pd.read_csv('../dataset/headache_2014-2018.csv')
nausea=pd.read_csv('../dataset/nausea_2014-2018.csv')
vomitting=pd.read_csv('../dataset/vomitting_2014-2018.csv')
muscle_ache=pd.read_csv('../dataset/muscle_ache_2014-2018.csv')
rashes=pd.read_csv('../dataset/rashes_2014-2018.csv')
mosquito_bite=pd.read_csv('../dataset/mosquito_bite_2014-2018.csv')

In [28]:
# function to transform trendwords data into dataframe with year/week format

def transform_trend(x):
    week_num=pd.DataFrame(range(1,53))
    x=x.rename(columns=x.iloc[1]).drop(index=[0,1]).reset_index(drop=True)
    x.loc[:,'Week']=pd.concat([week_num,week_num,week_num,week_num,week_num],axis=0).reset_index(drop=True)
    x[x.columns[1]]=x[x.columns[1]].astype(int)
    return x

# an example of the function output

transform_trend(dengue_fever).head()

Unnamed: 0,Week,Dengue fever: (Singapore)
0,1,53
1,2,55
2,3,42
3,4,32
4,5,37


In [29]:
# one hot encoding for station

stations=list(weather_gov_sg_stations.keys())
weather_df2=weather_df[['Year','Week','cases']].drop_duplicates().reset_index(drop=True)
for i in range(len(stations)):
    df=weather_df[weather_df['Station']==stations[i]]
    for j in range(3,9):
        df=df.rename(columns={df.columns[j]:stations[i]+'_'+df.columns[j]})
    df=df.drop(columns=['Station','lat','long','cases'],axis=1).reset_index(drop=True)
    weather_df2=weather_df2.sort_values(by=['Year','Week'],ascending=True).merge(df,on=['Year','Week'], how='left')
weather_df2

Unnamed: 0,Year,Week,cases,Admiralty_Weekly Rainfall (mm),Admiralty_Weekly Mean Temperature (°C),Admiralty_Weekly Maximum Temperature (°C),Admiralty_Weekly Minimum Temperature (°C),Admiralty_Weekly Mean Wind Speed (km/h),Admiralty_Weekly Maximum Wind Speed (km/h),Ang Mo Kio_Weekly Rainfall (mm),...,Whampoa_Weekly Maximum Temperature (°C),Whampoa_Weekly Minimum Temperature (°C),Whampoa_Weekly Mean Wind Speed (km/h),Whampoa_Weekly Maximum Wind Speed (km/h),Yishun_Weekly Rainfall (mm),Yishun_Weekly Mean Temperature (°C),Yishun_Weekly Maximum Temperature (°C),Yishun_Weekly Minimum Temperature (°C),Yishun_Weekly Mean Wind Speed (km/h),Yishun_Weekly Maximum Wind Speed (km/h)
0,2014,1,437,90.0,25.971429,31.9,23.1,8.442857,37.4,55.6,...,,,,,,,,,,
1,2014,2,479,0.4,25.800000,29.6,23.5,13.542857,44.3,0.8,...,,,,,,,,,,
2,2014,3,401,0.0,25.328571,29.2,23.0,13.271429,41.8,0.0,...,,,,,,,,,,
3,2014,4,336,0.0,25.642857,32.4,21.8,13.028571,53.6,0.0,...,,,,,0.0,,,,,
4,2014,5,234,15.8,26.028571,32.0,22.4,10.457143,35.6,8.4,...,,,,,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,2018,48,109,71.6,27.371429,32.1,23.0,8.271429,47.2,70.2,...,,,,,49.4,,,,,
256,2018,49,114,65.8,26.785714,33.4,23.7,7.157143,50.8,71.8,...,,,,,32.6,,,,,
257,2018,50,108,72.0,26.628571,32.8,24.0,7.714286,43.6,42.8,...,,,,,0.8,,,,,
258,2018,51,128,21.4,27.471429,34.2,24.0,9.714286,52.2,65.8,...,,,,,0.6,,,,,


In [30]:
df2=weather_df[['Year','Week']].drop_duplicates().reset_index(drop=True)
for i in [dengue_fever,headache,nausea,vomitting,muscle_ache,rashes,mosquito_bite]:
    i=transform_trend(i)
    df2=pd.concat([df2,i],axis=1)
df2=pd.concat([df2.iloc[:,0:2],df2.iloc[:,3:16:2]],axis=1)
df2

Unnamed: 0,Year,Week,Dengue fever: (Singapore),Headache: (Singapore),Nausea: (Singapore),vomitting: (Singapore),muscle ache: (Singapore),rashes: (Singapore),Mosquito bite: (Singapore)
0,2014,1,53,69,43,0,29,87,0
1,2014,2,55,70,79,30,34,39,0
2,2014,3,42,73,35,0,0,72,17
3,2014,4,32,80,63,0,0,57,15
4,2014,5,37,52,44,0,0,52,37
...,...,...,...,...,...,...,...,...,...
255,2018,51,30,88,70,44,0,37,37
256,2018,52,30,75,48,0,27,54,28
257,2014,36,21,66,75,43,21,46,25
258,2014,37,27,73,68,32,17,60,20


In [31]:
weather_df2=weather_df2.merge(df2, on=['Year','Week'], how='left')
weather_df2

Unnamed: 0,Year,Week,cases,Admiralty_Weekly Rainfall (mm),Admiralty_Weekly Mean Temperature (°C),Admiralty_Weekly Maximum Temperature (°C),Admiralty_Weekly Minimum Temperature (°C),Admiralty_Weekly Mean Wind Speed (km/h),Admiralty_Weekly Maximum Wind Speed (km/h),Ang Mo Kio_Weekly Rainfall (mm),...,Yishun_Weekly Minimum Temperature (°C),Yishun_Weekly Mean Wind Speed (km/h),Yishun_Weekly Maximum Wind Speed (km/h),Dengue fever: (Singapore),Headache: (Singapore),Nausea: (Singapore),vomitting: (Singapore),muscle ache: (Singapore),rashes: (Singapore),Mosquito bite: (Singapore)
0,2014,1,437,90.0,25.971429,31.9,23.1,8.442857,37.4,55.6,...,,,,53,69,43,0,29,87,0
1,2014,2,479,0.4,25.800000,29.6,23.5,13.542857,44.3,0.8,...,,,,55,70,79,30,34,39,0
2,2014,3,401,0.0,25.328571,29.2,23.0,13.271429,41.8,0.0,...,,,,42,73,35,0,0,72,17
3,2014,4,336,0.0,25.642857,32.4,21.8,13.028571,53.6,0.0,...,,,,32,80,63,0,0,57,15
4,2014,5,234,15.8,26.028571,32.0,22.4,10.457143,35.6,8.4,...,,,,37,52,44,0,0,52,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,2018,48,109,71.6,27.371429,32.1,23.0,8.271429,47.2,70.2,...,,,,21,74,50,0,71,58,44
256,2018,49,114,65.8,26.785714,33.4,23.7,7.157143,50.8,71.8,...,,,,18,84,45,20,37,75,19
257,2018,50,108,72.0,26.628571,32.8,24.0,7.714286,43.6,42.8,...,,,,19,84,65,38,14,56,45
258,2018,51,128,21.4,27.471429,34.2,24.0,9.714286,52.2,65.8,...,,,,30,88,70,44,0,37,37


In [32]:
weather_df2=weather_df2.drop(columns=weather_df2.columns[weather_df2.isnull().sum()>0])

In [33]:
weather_df2.isnull().sum().sum()

0

In [34]:
weather_df2.duplicated().sum()

0

In [35]:
weather_df2.to_csv('../dataset/weather_data_formatted_weekly_rename_station_column_with_trend_words.csv',index=False)