# Scraping weather data 

From https://www.wunderground.com/history/daily/

Reference: 
- http://stanford.edu/~mgorkove/cgi-bin/rpython_tutorials/Scraping_a_Webpage_Rendered_by_Javascript_Using_Python.php
- https://automatetheboringstuff.com/chapter11/

In [198]:
import sys
sys.path.append(r'C:\Users\Benny\fastai\old')
from pathlib import Path
from fastai.imports import *
import requests
import wget
from bs4 import BeautifulSoup
from selenium import webdriver 
from datetime import datetime, time, date

In [230]:
browser = webdriver.Firefox() 

In [312]:
# pick a date range
start_date = datetime(2019, 1, 1)
stop_date = datetime(2019, 2, 13)
date_range = pd.date_range(start_date, stop_date).strftime('%Y-%m-%d')

In [313]:
len(date_range)

44

In [314]:
weather_df_2019 = pd.DataFrame()

for date in date_range:
    soup = get_soup(date)
    if len(soup.find_all('table'))==0:
        print(f'Cannot obtain data on {date}')
    else: 
        daily_df = get_data_soup(soup)
        #add date column
        daily_df['date'] = pd.to_datetime(date + ' ' +daily_df['Time'], format="%Y-%m-%d %I:%M %p")
        print(f'Finish obtaining data on {date}')
    
    weather_df_2019 = pd.concat([weather_df_2019,daily_df], axis=0, join='outer')
    print(len(weather_df_2019))

weather_df_2019.to_csv('data/weather_2019.csv')

Finish obtaining data on 2019-01-01
48
Finish obtaining data on 2019-01-02
96
Finish obtaining data on 2019-01-03
144
Finish obtaining data on 2019-01-04
192
Finish obtaining data on 2019-01-05
240
Finish obtaining data on 2019-01-06
288
Finish obtaining data on 2019-01-07
336
Finish obtaining data on 2019-01-08
384
Finish obtaining data on 2019-01-09
432
Finish obtaining data on 2019-01-10
480
Finish obtaining data on 2019-01-11
528
Finish obtaining data on 2019-01-12
576
Finish obtaining data on 2019-01-13
624
Finish obtaining data on 2019-01-14
672
Finish obtaining data on 2019-01-15
720
Finish obtaining data on 2019-01-16
768
Finish obtaining data on 2019-01-17
816
Finish obtaining data on 2019-01-18
864
Finish obtaining data on 2019-01-19
912
Finish obtaining data on 2019-01-20
960
Finish obtaining data on 2019-01-21
1008
Finish obtaining data on 2019-01-22
1056
Finish obtaining data on 2019-01-23
1104
Finish obtaining data on 2019-01-24
1152
Finish obtaining data on 2019-01-25
12

In [315]:
print(len(weather_df_2019)/48)
weather_df_2019.head()

44.0


Unnamed: 0,Time,Temperature(F),Dew Point(F),Humidity(%),Wind,Wind Speed(mph),Wind Gust(mph),Pressure(in),Precip.(in),Precip Accum(in),Condition,date
0,12:00 AM,79,64,61,NE,6,0,30.0,0.0,0.0,Fair,2019-01-01 00:00:00
0,12:30 AM,79,64,61,ENE,3,0,30.0,0.0,0.0,Fair,2019-01-01 00:30:00
0,1:00 AM,79,64,61,CALM,0,0,29.9,0.0,0.0,Fair,2019-01-01 01:00:00
0,1:30 AM,79,63,57,CALM,0,0,29.9,0.0,0.0,Fair,2019-01-01 01:30:00
0,2:00 AM,77,63,61,S,3,0,29.9,0.0,0.0,Fair,2019-01-01 02:00:00


In [260]:
# function for parsing html for a single date
def get_soup(date_str):
    url=f'https://www.wunderground.com/history/daily/th/bang-phut/VTBD/date/{date_str}'
    browser.get(url)
    innerhtml= browser.execute_script("return document.body.innerHTML")
    soup = BeautifulSoup(innerhtml)
    return soup

In [244]:
# extract daily_df from html table return daily_df 
def get_data_soup(soup):
    div_table=soup.find_all(attrs={"id": "history-observation-table"})[0]
    head=[ tag.string for tag in div_table.thead.find_all('button')]  
    daily_df=pd.DataFrame()
    #stripping unit
    f_row = trow[0]
    units=[]
    for unit_cell in f_row.find_all(class_="wu-label"):
        unit = ''.join(string for string in unit_cell.stripped_strings)
        units.append(r'('+unit+r')')
    
    units.insert(0, '')
    units.insert(4, '')
    units.append('')
    # stripping data 
    for row in trow:
        row_data = get_row_data(row)
        daily_df=daily_df.append(pd.DataFrame(row_data).T) 
        
    head_unit = [a+b for a, b in zip(head,units)]
    daily_df.columns = head_unit
    return daily_df

In [125]:
# extract row data 
def get_row_data(row):
    cells = row.find_all('ng-saw-cell-parser')
    row_data=[]
    for cell in cells:
        cell_data = [string for string in cell.stripped_strings]
        row_data.append(cell_data[0])
    return row_data