## Download daily weather data from wunderground for US airport stations, for the years 1950–2016 (the range requested by the download loop below)
- write raw csv files to AWS S3 bucket
- write all data to a sqlite database
- <https://github.com/andypicke/wunderground_data>

In [None]:
# Imports
import sqlite3
import pandas as pd
import boto3

In [None]:
# Define function to download data for a specified year and station
# Do some minor modifications, renaming of variables etc, but no cleaning yet

def get_wea_data_yearly(year, station, source=None):
    """
    Get historical (daily) weather data from wunderground for specified year and station.

    Only minor tidying is done (date-derived columns, renaming, column
    selection); values are not cleaned.

    Parameters
    ----------
    year : int
        Calendar year to request (1 January through 31 December).
    station : str
        Airport station code, e.g. 'KRNO'. Also stored in the ``st_code`` column.
    source : str, path or file-like, optional
        Anything ``pandas.read_csv`` accepts. When given, it is read instead of
        the wunderground URL built from ``year`` and ``station`` (e.g. to
        re-process a previously saved download).

    Returns
    -------
    pandas.DataFrame
        Indexed by date, with columns ``date, st_code, max_temp, min_temp,
        mean_temp, year, yday, month, precip_In, cloud_cover, max_gust_mph,
        events``.

    Raises
    ------
    KeyError
        If one of the expected wunderground columns is missing from the data.
    """
    if source is None:
        source = ('http://www.wunderground.com/history/airport/' + station + '/' + str(year)
                  + '/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=' + str(year)
                  + '&req_city=NA&req_state=NA&req_statename=NA&format=1')
    dat = pd.read_csv(source)

    # The name of the date column is the time zone, which varies, so the column
    # is addressed by position. The parsed dates are assigned to a new column
    # (rather than written back through .iloc) so the result is a real datetime
    # column and the .dt accessor below works regardless of pandas version.
    dat['date'] = pd.to_datetime(dat.iloc[:, 0])
    dat = dat.set_index(dat['date'])
    dat['yday'] = dat['date'].dt.dayofyear
    dat['month'] = dat['date'].dt.month
    dat['year'] = dat['date'].dt.year

    # Column names may contain spaces (including leading ones); turn them into
    # underscores, which is why some names below start with '_'.
    dat = dat.rename(columns=lambda name: name.replace(' ', '_'))
    dat['st_code'] = station

    vars_to_keep = ['date', 'st_code', 'Max_TemperatureF', 'Min_TemperatureF',
                    'Mean_TemperatureF', 'year', 'yday', 'month', 'PrecipitationIn',
                    '_CloudCover', '_Max_Gust_SpeedMPH', '_Events']
    new_names = {
        'Max_TemperatureF': 'max_temp',
        'Min_TemperatureF': 'min_temp',
        'Mean_TemperatureF': 'mean_temp',
        'PrecipitationIn': 'precip_In',
        '_CloudCover': 'cloud_cover',
        '_Max_Gust_SpeedMPH': 'max_gust_mph',
        '_Events': 'events',
    }
    # Non-inplace rename: renaming a column selection in place can trigger
    # pandas' chained-assignment warnings.
    return dat[vars_to_keep].rename(columns=new_names)



In [None]:
# Smoke test: fetch a single station-year (KRNO, 2000) and preview the first rows
dat = get_wea_data_yearly(year=2000, station='KRNO')
dat.head()


In [None]:
# Load the station list from the local CSV and preview the first five rows
sta_list = pd.read_csv('USAirportWeatherStations.csv')
sta_list.head(n=5)

In [None]:
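# If processing crashes, uncomment the lines below to resume where it left off:
# find the position of the last station reached in the station list (1069 is
# just the index from a previous run) and slice stcodes from that position.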
#sta_list[sta_list.airportCode=="KRNO"]
#stcodes = sta_list.airportCode.values
#stcodes[1069]
#stcodes = stcodes[1069:]


In [None]:
# Write to database
#
# For every station and year: download the data, append it to the "wea" table
# of the sqlite database, and upload the same rows as <station>_<year>.csv to
# the S3 bucket. A station-year that fails is reported and skipped so that one
# bad download does not stop the whole run.

# The restart cell may already have defined (and sliced) stcodes; otherwise
# process every station in the station list.
if 'stcodes' not in globals():
    stcodes = sta_list.airportCode.values

con = sqlite3.connect('wunderground_daily.db')
# Create the S3 handles once instead of once per station-year
s3 = boto3.resource('s3')
bucket = s3.Bucket('wundergrounddaily')
years = range(1950,2017)
try:
    for sta in stcodes:
        print('getting data for ' + sta)
        for year in years:
            try:
                dat = get_wea_data_yearly(year, sta)
                if dat.shape[0] == 0:
                    # nothing returned for this station-year
                    continue
                dat.to_sql("wea", con, if_exists='append', index=False)
                # write to S3 also
                key_name = sta + '_' + str(year) + '.csv'
                data = dat.to_csv(None, index=False)
                bucket.put_object(Key=key_name, Body=data)
            except Exception as err:
                # Best effort: report and move on. Catching Exception (not a
                # bare except) keeps Ctrl-C / kernel interrupt working.
                print('skipping ' + sta + ' ' + str(year) + ': ' + repr(err))
finally:
    # Release the database file even if the run is interrupted
    con.close()
