# Hong Kong Weather: Data Collection & Processing

## 1. Load Required Python Packages

In [1]:
# load python packages from environment
import os
import requests
from bs4 import BeautifulSoup
import lxml
import html5lib
import seaborn as sns
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

<b>Locate and print current working directory.</b>

In [2]:
path = os.getcwd()
print(path)

/Users/tiffanyflor/Dropbox/MyProjects/HongKongPollution/HongKongPollution/notebooks


<b>Get parent directory and append path to raw data.</b>

In [3]:
# Create raw data path by getting parent directory of current working directory and appending data/processed
parent_path = os.path.dirname(path)
rawdata_path = os.path.join(parent_path, 'data', 'raw')
print(rawdata_path)

/Users/tiffanyflor/Dropbox/MyProjects/HongKongPollution/HongKongPollution/data/raw


## 2. Retrieve and Save Datasets (Webscraping)
Webscrape data from http://www.weather.gov.hk/en/cis/dailyExtract.htm

In [170]:
url = 'https://www.timeanddate.com/weather'

year = 2018
month = 12
day = 5

payload = {
    'n': 'china/kowloon',
    'mode': 'historic',
    'hd': '%d%02d%02d' %(year, month, day),
    'month': '%02d' %(month),
    'year': '%d' %(year)}

In [171]:
# Start loop here for each page/day, month, year from Jan 1, 2014 - Sept 30, 2020
month_list = np.arange(1,13)
year_list = list(map(str, np.arange(2014,2020)))
#month = '7'
#year = '2020'

#page=requests.get(url.format(month, year), )
pagge = requests.get(url, params=payload)

# Check status code to ensure it worked
page.status_code

200

In [172]:
soup=BeautifulSoup(page.text, 'html.parser')

In [173]:
print(soup.prettify())

<!DOCTYPE html>
<!--
scripts and programs that download content transparent to the user are not allowed without permission
-->
<html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   Weather in July 2020 in Kowloon, Hong Kong
  </title>
  <meta content="Weather reports from July 2020 in Kowloon, Hong Kong with highs and lows" name="description"/>
  <meta content="past, historic, recent, wether, weather, temp, temps, temperature, celcius, celsius, fahrenheit, sunny, sun, clouds, cloudes, cloudy, windy, forecasts, forecast, met, meterology, metrology, Kowloon, local, Hong Kong, VHHH, July, 2020" name="keywords"/>
  <meta content="https://www.timeanddate.com/scripts/cityog.php?title=Past%20Weather%20in&amp;tint=0x007b7a&amp;city=Kowloon&amp;country=Hong%20Kong&amp;image=kowloon1" property="og:image"/>
  <meta content="1366" property="og:image:width"/>
  <meta content="738" property="og:image:height"/>
  <meta content="website" property=

In [174]:
Data=[]
table=soup.find('table', attrs={'id':'wt-his'})
for tr in table.find('tbody').find_all('tr'):
   dict = {}
   try:
       dict['date'] = tr.find('th').text.split(',')[1] + ', ' + year
   except:
       pass
   dict['time'] = tr.find('th').text.split('m')[0]
   all_td = tr.find_all('td')
   dict['temp (°F)'] = all_td[1].text.split('°')[0]
   dict['weather'] = all_td[2].text
   dict['wind (mph)'] = all_td[3].text.split(' ')[0]
   dict['direction (°)'] = all_td[4].span.attrs['title'].split('from ')[1].split('°')[0]


   dict['humidity (%)'] = all_td[5].text.split('%')[0]
   dict['barometer ("Hg)'] = all_td[6].text.split(' ')[0]

   Data.append(dict)

In [175]:
df = pd.DataFrame(Data)
#df.date = df.date.fillna(method='ffill')
df.head()

Unnamed: 0,time,temp (°F),weather,wind (mph),direction (°),humidity (%),"barometer (""Hg)"
0,12:00 a,86,Passing clouds.,12,150,79,29.68
1,12:30 a,86,Passing clouds.,7,150,75,29.68
2,1:00 a,86,Passing clouds.,7,140,75,29.65
3,1:30 a,86,Passing clouds.,3,0,79,29.65
4,2:00 a,86,Passing clouds.,6,100,79,29.65


In [176]:
df.weather.nunique()

5

In [189]:
url = 'https://www.timeanddate.com/scripts/cityajax.php'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

year = 2018
month = 12
day = 1
data = requests.get(url, headers=headers, params=payload).text
table = pd.read_html('<table>' + data + '</table>')[0][:-1]
table = table.dropna(axis=1)

In [190]:
table.head(20)

Unnamed: 0_level_0,Unnamed: 0_level_0,Conditions,Conditions,Comfort,Comfort,Comfort,Unnamed: 7_level_0
Unnamed: 0_level_1,Time,Temp,Weather,Wind,Unnamed: 5_level_1,Humidity,Barometer
0,"12:00 amWed, Dec 5",77 °F,Passing clouds.,3 mph,↑,74%,"30.01 ""Hg"
1,12:30 am,77 °F,Passing clouds.,6 mph,↑,79%,"30.01 ""Hg"
2,1:00 am,77 °F,Passing clouds.,3 mph,↑,79%,"29.98 ""Hg"
3,1:30 am,77 °F,Passing clouds.,3 mph,↑,74%,"29.98 ""Hg"
4,2:00 am,75 °F,Passing clouds.,3 mph,↑,83%,"29.98 ""Hg"
5,2:30 am,77 °F,Passing clouds.,2 mph,↑,74%,"29.98 ""Hg"
6,3:00 am,75 °F,Passing clouds.,3 mph,↑,78%,"29.98 ""Hg"
7,3:30 am,75 °F,Passing clouds.,3 mph,↑,78%,"29.98 ""Hg"
8,4:00 am,75 °F,Passing clouds.,3 mph,↑,78%,"29.98 ""Hg"
9,4:30 am,75 °F,Passing clouds.,5 mph,↑,78%,"29.98 ""Hg"
