In [1]:
# %load /Users/Andy/jupyter_imports.py
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Plot styling: larger text and thicker lines than the matplotlib defaults
plt.rcParams.update({
    'font.size': 14,
    'axes.labelsize': 'large',
    'xtick.labelsize': 'large',
    'ytick.labelsize': 'large',
    'lines.linewidth': 3,
})

import sqlite3
# SQLite connection shared by the later cells (they read table `wea` and
# append to table `wea_clean` through it).
con = sqlite3.connect('/Users/Andy/Projects/wunderground_data/wunderground_daily.db')

In [2]:

def clean_data(dat, t_min=-100, t_max=150, precip_limit_in=50, gust_limit_mph=300):
    """Screen implausible values out of a frame of daily weather records.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain the columns ``date``, ``mean_temp``, ``min_temp``,
        ``max_temp``, ``precip_In`` and ``max_gust_mph``.
    t_min, t_max : number, optional
        Exclusive lower/upper bounds applied to all three temperature
        columns (units are whatever the table stores -- presumably
        degrees F; TODO confirm).
    precip_limit_in : number, optional
        Exclusive upper bound for ``precip_In``.
    gust_limit_mph : number, optional
        Exclusive upper bound for ``max_gust_mph``.

    Returns
    -------
    pandas.DataFrame
        ``dat`` itself, after cleaning.

    Notes
    -----
    Rows that fail a check are *not* removed: ``DataFrame.where`` sets
    every column of such a row (including ``date``) to NaN/NaT.  Because a
    comparison against NaN is False, a row that is merely *missing* one of
    the screened values is blanked as well.
    NOTE(review): confirm that blanking rows with a missing gust/precip
    value is intended; it discards otherwise valid temperatures.
    """
    dat['date'] = pd.to_datetime(dat['date'])

    mean_t, min_t, max_t = dat['mean_temp'], dat['min_temp'], dat['max_temp']
    # One combined mask is equivalent to applying the checks one after the
    # other: a row blanked by an earlier check fails every later comparison.
    plausible_temps = (
        (mean_t > t_min) & (mean_t < t_max)
        & (min_t > t_min) & (min_t < t_max)
        & (max_t > t_min) & (max_t < t_max)
        & (max_t > min_t)
    )
    dat.where(plausible_temps, inplace=True)

    dat.drop_duplicates(inplace=True)

    # Convert 'T' (trace) precipitation to 0.01 so the fact that it rained
    # is preserved, just with a very small amount.  The column is only
    # text when the data contained non-numeric entries such as 'T';
    # otherwise it is already numeric and needs no conversion.
    if not pd.api.types.is_numeric_dtype(dat['precip_In']):
        # Single .loc assignment: the previous chained indexing
        # (dat['precip_In'][mask] = ...) may write to a temporary copy.
        dat.loc[dat['precip_In'] == 'T', 'precip_In'] = '0.01'
        dat['precip_In'] = pd.to_numeric(dat['precip_In'])

    # Some precip values are way too large.  According to weather.com the
    # most extreme 24-hour rainfall total on record in the U.S. is 42.0
    # inches (near Alvin, Texas, July 25-26, 1979), hence the default of 50.
    dat.where(dat['precip_In'] < precip_limit_in, inplace=True)

    # Screen unreasonably high winds.
    dat.where(dat['max_gust_mph'] < gust_limit_mph, inplace=True)

    return dat


In [3]:
# Load the airport weather-station table from a CSV in the working directory.
# NOTE(review): the displayed frame has 'Unnamed: 0' and 'Unnamed: 0.1'
# columns, which suggests the CSV was written with its index more than once;
# consider dropping them or reading with index_col.
sta_df = pd.read_csv('USAirportWeatherStations.csv')
# Preview the first rows.
sta_df.head()

Unnamed: 0.1,Unnamed: 0,Station,State,airportCode,Lat,Lon,Elevation,WMO
0,1,Central,AK,PARL,65.57,-144.8,292,99999
1,2,Atka,AK,PAAK,52.22,-174.2,17,99999
2,3,Buckland,AK,PABL,65.99,-161.12,0,99999
3,4,Portage Glacier,AK,PATO,60.79,-148.83,29,99999
4,5,Nivalina,AK,PAVL,67.73,-164.55,3,70148


In [4]:
# Every airport code, in the same order as the station table
st_list = sta_df['airportCode'].values

# Show the table row for station KUCY (its row label gives its position)
sta_df.loc[sta_df['airportCode'] == 'KUCY']


Unnamed: 0.1,Unnamed: 0,Station,State,airportCode,Lat,Lon,Elevation,WMO
1282,1283,Union City,TN,KUCY,36.38,-88.99,104,99999


In [5]:
# Position in the station table at which processing starts.
# NOTE(review): presumably the stations before this position were already
# cleaned in an earlier run -- confirm before changing.
RESUME_INDEX = 1284

# Slice from the full station table rather than from `st_list` itself, so
# re-running this cell gives the same result instead of slicing the
# already-shortened list a second time.
st_list = sta_df['airportCode'].values[RESUME_INDEX:]
st_list[0]

'KM08'

In [6]:
# Display the station codes that remain in st_list (the cleaning loop iterates over these).
st_list

array(['KM08', 'KM52', 'KSNH', 'KBNA', 'KCHA', 'KCSV', 'KDYR', 'KMEM',
       'KMKL', 'KMQY', 'KNQA', 'KTRI', 'KTYS', 'KOQT', 'K11R', 'KGPM',
       'K6R6', 'KTKI', 'KGKY', 'KCRS', 'KTRL', 'KBAZ', 'KABI', 'KACT',
       'KADS', 'KAFW', 'KALI', 'KAMA', 'KATT', 'KAUS', 'KBGD', 'KBPT',
       'KBRO', 'KBMQ', 'KCDS', 'KCLL', 'KCOT', 'KCRP', 'KCXO', 'KDAL',
       'KDFW', 'KDHT', 'KDLF', 'KDRT', 'KDTO', 'KGTU', 'KDWH', 'KLVJ',
       'KDYS', 'KEFD', 'KELP', 'KF39', 'KFST', 'KFTW', 'KGDP', 'KGGG',
       'KGLS', 'KGRK', 'KGVT', 'KHDO', 'KHLR', 'KHOU', 'KHRL', 'KIAH',
       'KILE', 'KINK', 'KJCT', 'KLBB', 'KLBX', 'KLFK', 'KLRD', 'KMAF',
       'KMFE', 'KMRF', 'KMWL', 'KNFW', 'KNGP', 'KNQI', 'KOCH', 'KPRX',
       'KPSX', 'KPWG', 'KRBD', 'KRKP', 'KRND', 'KRPE', 'KSAT', 'KHYI',
       'KSEP', 'KSGR', 'KSJT', 'KSKF', 'KSPS', 'KSSF', 'KTPL', 'KTYR',
       'KUTS', 'KVCT', 'KODO', 'KPIL', 'K4BL', 'K4HV', 'KBCE', 'KCDC',
       'KENV', 'KHIF', 'KLGU', 'KMLF', 'KCNY', 'KOGD', 'KPUC', 'KPVU',
      

In [None]:

# Clean every remaining station, upload the result to S3 as CSV and append it
# to the `wea_clean` table.  A failure for one station is reported and
# recorded, then the loop moves on to the next station.

# The S3 handle does not depend on the station, so create it once.
bucket = boto3.resource('s3').Bucket('wundergrounddaily')

# Stations whose processing raised; inspect (and re-run) these afterwards.
failed_stations = []

for sta in st_list:
    print('cleaning ' + sta)
    try:
        dat = pd.read_sql_query("SELECT * FROM wea WHERE st_code=? ", con, params=[sta])
        dat = clean_data(dat)

        # write to S3 (to_csv(None, ...) returns the CSV text instead of writing a file)
        key_name = 'cleaned/' + sta + '_cleaned.csv'
        data = dat.to_csv(None, index=False)
        bucket.put_object(Key=key_name, Body=data)

        # write to 'clean' database table
        # NOTE(review): the S3 upload happens first, so a failure here leaves
        # the CSV in S3 without matching rows in `wea_clean`.
        dat.to_sql("wea_clean", con, if_exists='append', index=False)

        # release the frame before the next station is read
        del dat
    except Exception as err:
        # Catch Exception rather than using a bare `except`, so that
        # KeyboardInterrupt still stops the run, and report the problem
        # instead of silently discarding it.
        failed_stations.append(sta)
        print('  FAILED ' + sta + ': ' + repr(err))

if failed_stations:
    print('stations that failed: ' + ', '.join(failed_stations))
    
    

cleaning KM08
cleaning KM52
