## Goal: write a Python script to download daily weather data from Weather Underground (wunderground) for US airport stations, for roughly the last 30 years
- write a function to download the data for a single station and year
- clean and flag the data (download the raw files first, then make cleaned files?)
- deal with download errors
  - write a DataFrame that records which stations/years are missing?
- write all data to a SQLite database

In [1]:
# This script downloads daily historical weather data from wunderground
# for the citibike analysis project.

# The cleaned data are saved to a sqlite3 database.

import sqlite3
import pandas as pd
import boto3


def get_wea_data_yearly(year, station, source=None):
    """
    Get historical (daily) weather data from wunderground for specified year and station.

    Parameters
    ----------
    year : int
        Calendar year to request (1 January through 31 December).
    station : str
        Airport station code used in the wunderground URL, e.g. 'KRNO'.
        It is also stored in the ``st_code`` column of the result.
    source : str or file-like, optional
        Alternative CSV source accepted by ``pandas.read_csv`` (for example
        a path or buffer holding a previously downloaded raw file).  When
        omitted, the data are downloaded from wunderground.

    Returns
    -------
    pandas.DataFrame
        One row per CSV record, indexed by date, with the columns ``date``,
        ``st_code``, ``max_temp``, ``min_temp``, ``mean_temp``, ``year``,
        ``yday``, ``month``, ``precip_In``, ``cloud_cover``,
        ``max_gust_mph`` and ``events``.

    Raises
    ------
    KeyError
        If the CSV does not contain one of the expected weather columns.

    Any exception raised by ``pandas.read_csv`` or ``pandas.to_datetime``
    (download or parsing problems) propagates unchanged.
    """
    if source is None:
        source = (
            'http://www.wunderground.com/history/airport/' + station + '/'
            + str(year)
            + '/1/1/CustomHistory.html?dayend=31&monthend=12&yearend='
            + str(year)
            + '&req_city=NA&req_state=NA&req_statename=NA&format=1'
        )
    dat = pd.read_csv(source)

    # Name of date column is tz, which varies so we can't hardwire name;
    # address it by position instead.  The datetime column is created with
    # a fresh assignment: writing the converted values back through .iloc
    # can leave the column with its original object dtype, and the .dt
    # accessor below needs a real datetime column.
    dat['date'] = pd.to_datetime(dat.iloc[:, 0])
    dat = dat.set_index(dat['date'])

    dat['yday'] = dat['date'].dt.dayofyear
    dat['month'] = dat['date'].dt.month
    dat['year'] = dat['date'].dt.year

    # Raw headers contain spaces (some of them leading); turn every space
    # into an underscore so the names below can be matched exactly.
    dat = dat.rename(columns=lambda x: x.replace(' ', '_'))
    dat['st_code'] = station

    vars_to_keep = [
        'date', 'st_code', 'Max_TemperatureF', 'Min_TemperatureF',
        'Mean_TemperatureF', 'year', 'yday', 'month', 'PrecipitationIn',
        '_CloudCover', '_Max_Gust_SpeedMPH', '_Events',
    ]
    new_names = {
        'Max_TemperatureF': 'max_temp',
        'Min_TemperatureF': 'min_temp',
        'Mean_TemperatureF': 'mean_temp',
        'PrecipitationIn': 'precip_In',
        '_CloudCover': 'cloud_cover',
        '_Max_Gust_SpeedMPH': 'max_gust_mph',
        '_Events': 'events',
    }
    # Select and rename in one chained expression so a new frame is returned
    # instead of renaming a slice of the original in place.
    return dat[vars_to_keep].rename(columns=new_names)



In [6]:
# Smoke test: fetch a single station-year and preview the first rows
dat = get_wea_data_yearly(year=2000, station='KRNO')
dat.head()
#dat.info()

Unnamed: 0_level_0,date,st_code,max_temp,min_temp,mean_temp,year,yday,month,precip_In,cloud_cover,max_gust_mph,events
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2000-01-01,2000-01-01,KRNO,48,18.0,33.0,2000,1,1,0.0,5.0,28.0,Snow
2000-01-02,2000-01-02,KRNO,39,14.0,26.0,2000,2,1,0.0,4.0,,
2000-01-03,2000-01-03,KRNO,48,28.0,38.0,2000,3,1,0.0,5.0,,
2000-01-04,2000-01-04,KRNO,48,24.0,36.0,2000,4,1,0.0,3.0,28.0,
2000-01-05,2000-01-05,KRNO,42,21.0,32.0,2000,5,1,0.0,2.0,,


In [10]:
# Load the table of US airport weather stations and preview the first rows
stations_csv = 'USAirportWeatherStations.csv'
sta_list = pd.read_csv(stations_csv)
sta_list.head()

Unnamed: 0.1,Unnamed: 0,Station,State,airportCode,Lat,Lon,Elevation,WMO
0,1,Central,AK,PARL,65.57,-144.8,292,99999
1,2,Atka,AK,PAAK,52.22,-174.2,17,99999
2,3,Buckland,AK,PABL,65.99,-161.12,0,99999
3,4,Portage Glacier,AK,PATO,60.79,-148.83,29,99999
4,5,Nivalina,AK,PAVL,67.73,-164.55,3,70148


In [14]:
# Station code at which this run starts; every station listed before it in
# sta_list is skipped.  Change this value to resume from a different station.
start_station = "KRNO"

stcodes = sta_list.airportCode.values
# Find the start position from the station code itself rather than from a
# hard-coded row number, which silently goes stale if the station list is
# reordered or edited.  Raises ValueError if the code is not in the list.
start_idx = list(stcodes).index(start_station)
stcodes = stcodes[start_idx:]
# Sanity check: the first entry should be start_station
stcodes[0:2]

array(['KRNO', 'KTPH'], dtype=object)

In [15]:
# Write to database
#
# For every station code in stcodes and every year in `years`, download the
# daily data, append it to the "wea" table of the sqlite database and upload
# the same rows to S3 as <station>_<year>.csv.  Station-years that fail or
# come back empty are recorded in `missing` instead of being silently
# skipped.

# NOTE: the connection is left open after the loop in case it is reused
# afterwards; call con.close() once it is no longer needed.
con = sqlite3.connect('wunderground_daily.db')
# The S3 handle does not depend on station or year, so create it once
bucket = boto3.resource('s3').Bucket('wundergrounddaily')
years = range(1950, 2017)
failed = []  # (station, year, reason) for each station-year not saved

for sta in stcodes:
    print('getting data for ' + sta)
    for year in years:
        try:
            dat = get_wea_data_yearly(year, sta)
            if dat.shape[0] != 0:
                dat.to_sql("wea", con, if_exists='append', index=False)
                # write to S3 also
                key_name = sta + '_' + str(year) + '.csv'
                data = dat.to_csv(None, index=False)
                bucket.put_object(Key=key_name, Body=data)
            else:
                failed.append((sta, year, 'no rows returned'))
        except Exception as err:
            # Best effort: one bad station-year must not stop the whole run,
            # but keep a record of it.  Unlike a bare `except`, this still
            # lets KeyboardInterrupt stop the loop.
            failed.append((sta, year, repr(err)))

# Table of the station-years that could not be saved
missing = pd.DataFrame(failed, columns=['st_code', 'year', 'error'])
print(str(len(missing)) + ' station-years missing')


getting data for KRNO
getting data for KTPH
getting data for KU31
getting data for KWMC
getting data for K9BB
getting data for KNYC
getting data for KDSV
getting data for KELZ
getting data for KFOK
getting data for KHWV
getting data for KALB
getting data for KPEO
getting data for KART
getting data for KBGM
getting data for KBUF
getting data for KDKK
getting data for KELM
getting data for KFRG
getting data for KGFL
getting data for KGTB
getting data for KHPN
getting data for KHTO
getting data for KIAG
getting data for KISP
getting data for KITH
getting data for KJFK
getting data for KFZY
getting data for KJHW
getting data for KMGJ
getting data for KLGA
getting data for KMSS
getting data for KMSV
getting data for KOGS
getting data for KPLB
getting data for KPOU
getting data for KROC
getting data for KSCH
getting data for KSLK
getting data for KSWF
getting data for KSYR
getting data for KUCA
getting data for KMTP
getting data for KAKR
getting data for KTZR
getting data for KPHD
getting da

getting data for K1V4
getting data for KBTV
getting data for KMPV
getting data for KMVL
getting data for KRUT
getting data for KVSF
getting data for KDDH
getting data for KDEW
getting data for KALW
getting data for KAWO
getting data for KBFI
getting data for KCQV
getting data for KBLI
getting data for KCLM
getting data for KEAT
getting data for KELN
getting data for KEPH
getting data for KFHR
getting data for KGEG
getting data for KGRF
getting data for KHMS
getting data for KHQM
getting data for KOMK
getting data for KKLS
getting data for KMWH
getting data for KNUW
getting data for KOLM
getting data for KPAE
getting data for KPSC
getting data for KPUW
getting data for KPWT
getting data for KRNT
getting data for KSEA
getting data for KSFF
getting data for KSHN
getting data for KSKA
getting data for KSMP
getting data for KTCM
getting data for KTDO
getting data for KTIW
getting data for KUIL
getting data for KVUO
getting data for KYKM
getting data for KRNH
getting data for KRRL
getting da

In [None]:
# Display the current contents of `dat`
dat