## Goal: write a Python script to download daily weather data from Wunderground for US airport stations, for the last 30 years or so
- write a function to download a single station and year
- clean and flag the data (download raw first, then make cleaned files?)
- deal with download errors
  - write a dataframe that says which stations/years are missing?
- write all data to a sqlite database

In [1]:
# This script downloads daily historical weather data from wunderground,
# for citibike analysis project.

# Saves a modified file to sqlite3 database

import sqlite3
import pandas as pd
import boto3


def get_wea_data_yearly(year, station, source=None):
    """
    Get historical (daily) weather data from wunderground for specified year and station.

    Parameters
    ----------
    year : int
        Year to request; the URL asks for 1 January through 31 December.
    station : str
        Airport station code (e.g. 'KDEN'). It is placed in the request URL
        and stored in the ``st_code`` column of the result.
    source : str or file-like, optional
        Anything ``pandas.read_csv`` accepts. When given, it is read instead
        of the wunderground URL, so a previously saved raw CSV can be cleaned
        without downloading again. Defaults to None (download).

    Returns
    -------
    pandas.DataFrame
        Indexed by date, with columns ``date``, ``st_code``, ``max_temp``,
        ``min_temp``, ``mean_temp``, ``year``, ``yday``, ``month``,
        ``precip_In``, ``cloud_cover``, ``max_gust_mph`` and ``events``.

    Raises
    ------
    KeyError
        If the CSV lacks one of the expected weather columns.

    Errors raised by ``pandas.read_csv`` or ``pandas.to_datetime`` (network
    failures, unparseable dates) are not caught here.
    """
    if source is None:
        source = ('http://www.wunderground.com/history/airport/' + station + '/' + str(year)
                  + '/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=' + str(year)
                  + '&req_city=NA&req_state=NA&req_statename=NA&format=1')
    dat = pd.read_csv(source, parse_dates=True)

    # Name of date column is tz, which varies so we can't hardwire name.
    # Build the new column directly from the converted values: assigning the
    # result back through .iloc may keep the original object dtype, which
    # breaks the .dt accessors below.
    dat['date'] = pd.to_datetime(dat.iloc[:, 0])
    # Passing the Series (not the label) keeps 'date' as a column as well.
    dat = dat.set_index(dat['date'])
    dat['yday'] = dat['date'].dt.dayofyear
    dat['month'] = dat['date'].dt.month
    dat['year'] = dat['date'].dt.year

    # Headers contain spaces (some of them leading), hence the names that
    # start with an underscore below.
    dat = dat.rename(columns=lambda x: x.replace(' ', '_'))
    dat['st_code'] = station

    vars_to_keep = ['date', 'st_code', 'Max_TemperatureF', 'Min_TemperatureF',
                    'Mean_TemperatureF', 'year', 'yday', 'month', 'PrecipitationIn',
                    '_CloudCover', '_Max_Gust_SpeedMPH', '_Events']
    short_names = {
        'Max_TemperatureF': 'max_temp',
        'Min_TemperatureF': 'min_temp',
        'Mean_TemperatureF': 'mean_temp',
        'PrecipitationIn': 'precip_In',
        '_CloudCover': 'cloud_cover',
        '_Max_Gust_SpeedMPH': 'max_gust_mph',
        '_Events': 'events',
    }
    # rename() returns a new frame, so the column subset is never mutated in
    # place (avoids pandas' SettingWithCopy warning).
    return dat[vars_to_keep].rename(columns=short_names)



In [2]:
# test function
# Smoke test: fetch one station-year (station KDEN, year 2000) and inspect
# the cleaned frame's first rows, column dtypes and non-null counts.
dat = get_wea_data_yearly(2000,'KDEN')
dat.head()
dat.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 271 entries, 2000-01-01 to 2000-12-31
Data columns (total 12 columns):
date            271 non-null datetime64[ns]
st_code         271 non-null object
max_temp        271 non-null int64
min_temp        271 non-null int64
mean_temp       269 non-null float64
year            271 non-null int64
yday            271 non-null int64
month           271 non-null int64
precip_In       271 non-null float64
cloud_cover     271 non-null int64
max_gust_mph    152 non-null float64
events          82 non-null object
dtypes: datetime64[ns](1), float64(3), int64(6), object(2)
memory usage: 27.5+ KB


In [3]:
# Load the table of US airport weather stations from a local CSV and
# preview the first rows.
# NOTE(review): the preview shows 'Unnamed: 0' / 'Unnamed: 0.1' columns,
# which looks like a saved index being read back as data -- confirm.
sta_list = pd.read_csv('USAirportWeatherStations.csv')
sta_list.head()

Unnamed: 0.1,Unnamed: 0,Station,State,airportCode,Lat,Lon,Elevation,WMO
0,1,Central,AK,PARL,65.57,-144.8,292,99999
1,2,Atka,AK,PAAK,52.22,-174.2,17,99999
2,3,Buckland,AK,PABL,65.99,-161.12,0,99999
3,4,Portage Glacier,AK,PATO,60.79,-148.83,29,99999
4,5,Nivalina,AK,PAVL,67.73,-164.55,3,70148


In [22]:
#sta_list[sta_list.airportCode=="PAVW"]
# Station codes as an array, in the row order of the station file.
stcodes = sta_list.airportCode.values
stcodes[106]
# Drop the first 106 stations so a download run resumes at position 106
# (presumably PAVW, per the lookup above -- confirm).
stcodes = stcodes[106::]
type(stcodes)
stcodes[0]
# Number of stations left to process.
len(stcodes)

1496

In [4]:
# Write to database
#
# For every station in `stcodes` and every year in `years`, download the
# cleaned daily data, append it to the "wea" table of the sqlite database and
# upload the same rows as <station>_<year>.csv to the S3 bucket.
# Station-years that fail are recorded in `missing` instead of being
# silently dropped.

# start at PAVW

# `con` is left open so that later cells can keep using it.
con = sqlite3.connect('wunderground_daily.db')
years = range(1950,2017)

# Create the bucket handle once rather than once per uploaded file.
s3_bucket = boto3.resource('s3').Bucket('wundergrounddaily')

failed = []  # (station, year, error) for every station-year that raised
for sta in stcodes:
    print('getting data for ' + sta )
    for year in years:
        try:
            dat = get_wea_data_yearly(year,sta)
            if (dat.shape[0]!=0):
                dat.to_sql("wea",con,if_exists='append',index=False)
                # write to S3 also
                key_name = sta + '_' + str(year) + '.csv'
                data = dat.to_csv(None,index=False)
                s3_bucket.put_object(Key=key_name, Body=data)
        except Exception as err:
            # Best-effort download: one bad station-year must not stop the
            # run, but keep a record of it. Catching Exception (rather than
            # a bare except) still lets Ctrl-C interrupt the loop.
            failed.append((sta, year, repr(err)))

# Table of station-years that could not be downloaded or stored.
missing = pd.DataFrame(failed, columns=['st_code', 'year', 'error'])
print(str(len(missing)) + ' station-years failed')


getting data for PARL
getting data for PAAK
getting data for PABL
getting data for PATO
getting data for PAVL
getting data for PAGL
getting data for PAKV
getting data for PAII
getting data for PAKK
getting data for PALR
getting data for PARD
getting data for PABV
getting data for PADK
getting data for PAWN
getting data for PADQ
getting data for PANC
getting data for PANT
getting data for PABR
getting data for PACD
getting data for PADL
getting data for PAFA
getting data for PAGB
getting data for PAHO
getting data for PAJN
getting data for PAMC
getting data for PAMY
getting data for PAKU
getting data for PAOM
getting data for PAOT
getting data for PASN
getting data for PASO
getting data for PASP
getting data for PATK
getting data for PAUN
getting data for PAPO
getting data for PAVD
getting data for PAYA
getting data for PAFM
getting data for PAKN
getting data for PALU
getting data for PASY
getting data for PANI
getting data for PABT
getting data for PADU
getting data for PFYU
getting da

ValueError: year 4461284 is out of range

In [None]:
# Display the current contents of `dat`.
dat