In [1]:
# Gets historical weather data from WunderGround
# Using BeautifulSoup and Selenium to get the monthly weather stats

import requests, sys, re

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.keys import Keys


In [2]:
def get_page(html, attempt=0, threshold=10):
    """Fetch a URL with ``requests.get``, retrying while the status is not 2xx.

    Args:
        html: URL to request (parameter name kept for backward compatibility).
        attempt: Number of retries already consumed; callers normally leave
            this at 0.
        threshold: Maximum number of retries allowed after the first request,
            so at most ``threshold - attempt + 1`` requests are made.

    Returns:
        The first response whose ``status_code`` is in the 200-299 range.

    Raises:
        Exception: With message 'Exceeded maximum attempts' when every
            request returned a non-2xx status. Exceptions raised by
            ``requests.get`` itself propagate unchanged.
    """
    while True:
        page = requests.get(html)
        # Explicit comparison rather than ``assert``: assertions are stripped
        # when Python runs with -O, which would treat every status as success.
        if 199 < page.status_code < 300:
            return page
        if attempt >= threshold:
            raise Exception('Exceeded maximum attempts')
        # Loop instead of recursing so a large threshold cannot exhaust the
        # interpreter's recursion limit.
        attempt += 1

In [28]:
# Monthly history page to scrape (station KLGA, 2020-3).
url = 'https://www.wunderground.com/history/monthly/KLGA/date/2020-3'

# Machine-specific Firefox binary that Selenium drives.
bi = FirefoxBinary(r'C:\Program Files (x86)\Mozilla Firefox\\firefox.exe')
br = webdriver.Firefox(firefox_binary=bi)

try:
    br.get(url)
    # Parse the page source as reported by the live browser session.
    soup = bs(br.page_source, 'lxml')
finally:
    # Always close the browser, even if loading or parsing raised, so a
    # failed run does not leave a Firefox process behind.
    br.quit()

# Every table carrying the monthly "days" classes; later cells iterate these.
tables = soup.find_all('table', {'class': 'days ng-star-inserted'})

In [52]:
# parse monthly weather data
def parse_table(table):
    """Group the rows of a monthly-history table into named categories.

    The first two ``<tr>`` rows are skipped. From the third row on, a row
    whose text contains any letter is treated as a section label: the first
    such label (at index 2) leaves the category unchanged, and each later
    one advances to the next entry of ``headers``. Every other row is
    appended to the current category with each run of whitespace replaced
    by a single comma.

    Args:
        table: Object exposing ``find_all('tr')`` whose items have a ``text``
            attribute (e.g. a BeautifulSoup ``<table>`` tag).

    Returns:
        dict mapping each header name to the list of comma-joined row strings
        collected for it, in document order.

    Raises:
        IndexError: If more label rows appear than there are headers.
    """
    # The section order is hardcoded; it has to match the order in which the
    # page lays out its sections.
    # NOTE(review): 'Tempertaure (F)' is misspelled, but it is a key of the
    # returned dict, so it is kept verbatim for existing consumers.
    headers = ['Time', 'Tempertaure (F)', 'Dew Point (F)', 'Humidity (%)', 'Wind Speed (mph)', 'Pressure (Hg)', 'Precipitation']
    # Build the result from the header list so the two cannot drift apart.
    row_data = {header: [] for header in headers}

    category = 0
    rows = table.find_all('tr')
    for index, tr in enumerate(rows[2:], start=2):
        row = tr.text.strip()
        # lower().islower() is True only when the text has at least one cased
        # character, i.e. this is a label row rather than a numeric row.
        if row.lower().islower():
            if index > 2:
                category += 1
        else:
            # Raw string: '\s' in a plain literal is an invalid escape sequence.
            row_data[headers[category]].append(re.sub(r'\s+', ',', row))
    return row_data



def fetch_location_data(url):
    """Open ``url`` in Firefox and print the parsed first monthly table.

    Args:
        url: Address of the page to load in the Selenium-driven browser.

    Returns:
        None. The dict produced by ``parse_table`` is printed.

    Raises:
        IndexError: If the page contains no table with the class
            'days ng-star-inserted'.
    """
    # Machine-specific Firefox binary that Selenium drives.
    bi = FirefoxBinary(r'C:\Program Files (x86)\Mozilla Firefox\\firefox.exe')
    br = webdriver.Firefox(firefox_binary=bi)

    try:
        br.get(url)
        soup = bs(br.page_source, 'lxml')
    finally:
        # Close the browser even when loading or parsing fails so no
        # Firefox process is left running.
        br.quit()

    table = soup.find_all('table', {'class': 'days ng-star-inserted'})
    if not table:
        # Same exception type a bare table[0] would raise, but with context.
        raise IndexError('no "days ng-star-inserted" table found at {}'.format(url))
    print(parse_table(table[0]))

# Builds a wunderground monthly-history URL from a US location and a month.
def location_time_to_url(location, time):
    """Return the monthly-history URL for a US location and month.

    Args:
        location: Two-item sequence ``(state, city)`` used as URL path
            segments, e.g. ``('ny', 'new-york-city')``.
        time: Two-item sequence ``(year, month)``; items may be strings or
            integers.

    Returns:
        str of the form
        ``https://www.wunderground.com/history/monthly/us/<state>/<city>/date/<year>-<month>``.
        The ``/date/`` segment follows the same ``.../date/2020-3`` layout as
        the monthly URL scraped elsewhere in this notebook.
    """
    state, city = location
    year, month = time
    # format() accepts ints as well as strings, unlike '+' concatenation.
    return 'https://www.wunderground.com/history/monthly/us/{}/{}/date/{}-{}'.format(
        state, city, year, month
    )

In [32]:
# Write all the tables into csv files:
for i, table in enumerate(tables):
    # One file per table; numbering starts at 4 (wunderground4.csv, ...).
    # ``with`` guarantees the file is closed even if writing a row raises.
    with open('wunderground' + str(i + 4) + '.csv', 'w') as out_file:
        # Only <td> cells are exported; header (<th>) cells are not written.
        # A row without <td> cells therefore produces an empty line.
        for table_row in table.find_all('tr'):
            output_row = [column.text.strip() for column in table_row.find_all('td')]

            # Join the cells with commas, then remove every remaining
            # whitespace character (including whitespace inside a cell).
            fila = re.sub(r'\s', '', ','.join(output_row)) + '\n'
            out_file.write(fila)

In [53]:
# Dump the stripped text of every <tr> in every scraped table for inspection.
for table in tables:
    # Iterate rows directly so the two loop levels never share an index name.
    for row in table.find_all('tr'):
        print(row.text.strip())

TimeTemperature (° F)Dew Point (° F)Humidity (%)Wind Speed (mph)Pressure (Hg)Precipitation (in)
Mar  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  Max  Avg  Min  44  35.5  26  56  48.2  38  58  52.4  48  57  50.5  46  52  44.8  40  45  41.6  37  47  41.0  37  60  47.8  37  72  59.3  47  66  61.0  55  56  50.4  44  48  44.7  42  70  54.6  43  55  48.9  45  53  46.8  42  43  39.3  34  55  47.5  41  53  47.6  44  52  46.2  42  77  56.0  47  67  48.5  42  43  38.4  35  42  39.2  37  53  44.0  37  45  42.8  41  58  47.0  39  67  57.5  51  51  46.4  45  47  45.5  45  49  45.7  42  48  44.1  41  Max  Avg  Min  17  13.7  10  38  30.5  17  50  44.6  35  37  28.5  19  34  25.4  19  36  34.0  30  32  16.5  5  21  14.7  9  32  22.8  17  49  40.2  30  42  28.5  20  41  36.1  22  56  41.3  14  25  21.7  19  25  19.3  10  31  20.8  9  43  38.3  32  30  26.8  23  43  40.1  32  56  47.0  42  46  22.7  9  23  15.2  9  39  33.5  23  34 

In [54]:
# Open `url` in the browser and print the parsed first monthly table.
fetch_location_data(url)

{'Time': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31'], 'Tempertaure (F)': ['44,35.5,26', '56,48.2,38', '58,52.4,48', '57,50.5,46', '52,44.8,40', '45,41.6,37', '47,41.0,37', '60,47.8,37', '72,59.3,47', '66,61.0,55', '56,50.4,44', '48,44.7,42', '70,54.6,43', '55,48.9,45', '53,46.8,42', '43,39.3,34', '55,47.5,41', '53,47.6,44', '52,46.2,42', '77,56.0,47', '67,48.5,42', '43,38.4,35', '42,39.2,37', '53,44.0,37', '45,42.8,41', '58,47.0,39', '67,57.5,51', '51,46.4,45', '47,45.5,45', '49,45.7,42', '48,44.1,41'], 'Dew Point (F)': ['17,13.7,10', '38,30.5,17', '50,44.6,35', '37,28.5,19', '34,25.4,19', '36,34.0,30', '32,16.5,5', '21,14.7,9', '32,22.8,17', '49,40.2,30', '42,28.5,20', '41,36.1,22', '56,41.3,14', '25,21.7,19', '25,19.3,10', '31,20.8,9', '43,38.3,32', '30,26.8,23', '43,40.1,32', '56,47.0,42', '46,22.7,9', '23,15.2,9', '39,33.5,23', '34,32.0,30', '39,34.0

In [48]:
# Scratch check: str.islower() is True for an all-lowercase string.
print('hi'.islower())

True


In [55]:
a = 'HI'
# str.lower() returns a new string; the result is discarded here, so `a`
# still holds 'HI' when printed below.
a.lower()
print(a)

HI
