# Packages

In [1]:
import pandas as pd
import numpy as np
import os
import time
import requests
from bs4 import BeautifulSoup
from datetime import timedelta

# Directory to Store Data

In [2]:
# Directory (relative path, trailing slash included) that holds the data
# files this notebook reads and writes; also the default ``subdir`` of
# StationCodesFile.
SUBDIR = 'data/'

# Code for Scraping

In [7]:
class Code(object):
    """Validator for fixed-length airport or weather-station codes.

    A code is considered legitimate when it has exactly ``length``
    characters and its first character is not a digit.
    """

    def __init__(self, length):
        """Remember the exact number of characters a legitimate code has."""
        self.length = length

    def is_legit_code(self, code):
        """Return True if ``code`` has the expected length and does not start with a digit.

        The length is tested first and the leading character is taken with a
        slice, so an empty string is simply rejected instead of raising
        IndexError.
        """
        return len(code) == self.length and not code[:1].isdigit()

class StationCodesFile(object):
    """Reader for a text file listing weather-station and airport codes.

    After skipping ``headerlines`` lines and the first ``startcol`` characters
    of each remaining line, the first whitespace-separated word is taken as
    the weather-station code and the second as the airport code.
    """

    def __init__(self, filename, headerlines, startcol,
                 subdir=SUBDIR, airportcode=None, stationcode=None):
        """Configure the reader.

        filename     -- name of the codes file inside ``subdir``
        headerlines  -- number of leading lines to skip
        startcol     -- number of leading characters to drop from each line
        subdir       -- directory containing the file
        airportcode  -- validator for airport codes (default: ``Code(3)``)
        stationcode  -- validator for station codes (default: ``Code(4)``)
        """
        self.filepath = os.path.join(subdir, filename)
        self.headerlines = headerlines
        self.startcol = startcol
        # Airport codes are three characters long, weather-station codes four.
        self.airportcode = Code(3) if airportcode is None else airportcode
        self.stationcode = Code(4) if stationcode is None else stationcode

    def yield_station_codes(self):
        """Yield ``(airport, station)`` pairs for every line with two legitimate codes."""
        with open(self.filepath, 'r') as f:
            # next(f, None) works on both Python 2 and 3 (f.next() exists only
            # on Python 2) and does not raise when the file has fewer lines
            # than the header; the generator then simply yields nothing.
            for _ in range(self.headerlines):
                next(f, None)
            for line in f:
                words = line[self.startcol:].split()
                if len(words) < 2:
                    continue
                station, airport = words[0], words[1]
                if (self.airportcode.is_legit_code(airport)
                        and self.stationcode.is_legit_code(station)):
                    yield airport, station

    def code_mapping(self):
        """Return a dict mapping airport code to station code.

        When an airport appears more than once, the last occurrence wins.
        """
        return dict(self.yield_station_codes())
    
class WundergroundScraper(object):
    """Scraper for daily-history weather pages on wunderground.com.

    ``stationdict`` maps an airport code to the weather-station code that is
    substituted into the page URL.
    """

    def __init__(self, stationdict):
        """Store the airport -> station mapping and set up the attribute dispatch."""
        self.url_template = 'https://www.wunderground.com/history/airport/{0}/{1}/{2}/{3}/DailyHistory.html'
        # Attribute name -> method that extracts that attribute from a parsed page.
        self.funcdict = {'TEMP': self.get_actual_temp,
                         'PRECIP': self.get_actual_precip}
        self.stationdict = stationdict

    def get_station(self, airport):
        """Return the weather station corresponding to ``airport``.

        Raises ValueError (after printing 'Bad airport') when the airport is
        not in the mapping; the exception message names the airport.
        """
        if airport not in self.stationdict:
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print('Bad airport')
            raise ValueError('Bad airport: {0!r}'.format(airport))
        return self.stationdict[airport]

    def scrape_from_date_and_airport(self, date, airport, attrs):
        """Scrape the requested attributes for one date and one airport.

        Returns a dict keyed by the names in ``attrs``; each name must be a
        key of ``self.funcdict`` (KeyError otherwise). Raises ValueError for
        an unknown airport.
        """
        station = self.get_station(airport)
        soup = self.soupify_date_station(date, station)
        return {attr: self.funcdict[attr](soup) for attr in attrs}

    def soupify_date_station(self, date, station):
        """Download the page for ``station`` on ``date`` and return it as BeautifulSoup."""
        url = self.url_template.format(station, date.year, date.month, date.day)
        # NOTE(review): no timeout is passed, so a stalled connection blocks
        # indefinitely; the HTTP status is not checked either.
        r = requests.get(url)
        return BeautifulSoup(r.text, 'html.parser')

    def extract_tagged_value(self, soup, tag, idxcol):
        """Extract a tagged value from a Wunderground table.

        Looks for the first table row that has a ``td`` whose text matches
        ``tag`` and at least one ``span`` of class ``wx-value``, and returns
        the first child of the ``idxcol``-th such span. Returns None when no
        row qualifies, or when the selected span does not exist or is empty.
        """
        for tr in soup.find_all('tr'):
            if tr.find('td', text=tag) is None:
                continue
            vals = tr.find_all('span', {'class': 'wx-value'})
            if not vals:
                continue
            try:
                return vals[idxcol].contents[0]
            except IndexError:
                # Fewer value spans than idxcol, or an empty span: report
                # "no value" instead of crashing the caller.
                return None
        return None

    def get_actual_temp(self, soup, idxcol=0):
        """Return the 'Mean Temperature' value as a float, or None if missing/unparseable."""
        temp = self.extract_tagged_value(soup, 'Mean Temperature', idxcol)
        try:
            return float(temp)
        except (ValueError, TypeError):
            return None

    def get_actual_precip(self, soup, idxcol=0):
        """Return the 'Precipitation' value as a float.

        A non-numeric value starting with 'T' (trace precipitation) is
        reported as 0.0; anything else that cannot be parsed gives None.
        """
        precip = self.extract_tagged_value(soup, 'Precipitation', idxcol)
        try:
            return float(precip)
        except (ValueError, TypeError):
            pass
        try:
            is_trace = precip.startswith('T')
        except (AttributeError, TypeError):
            # None, or a node that is not string-like.
            return None
        return 0.0 if is_trace else None

# Scraper Loop

In [36]:
def scrape_all_weather(wscraper,startdate,enddate,airports,attrs,delay=1):
    """Scrape the weather for every airport on every day from startdate to enddate.

    wscraper  -- object providing scrape_from_date_and_airport(date, airport, attrs)
    startdate -- first day to scrape (inclusive)
    enddate   -- last day to scrape (inclusive)
    airports  -- iterable of airport codes
    attrs     -- iterable of attribute names to scrape (e.g. 'TEMP', 'PRECIP')
    delay     -- seconds to sleep after each (date, airport) pair

    Returns a pandas DataFrame with columns 'Date', 'Airport' and one column
    per attribute, one row per (date, airport) pair. When the scraper raises
    ValueError for a pair, a message is printed and that row's attributes are
    None. Nothing is written to disk here; saving is left to the caller.
    """
    # Materialise both iterables: attrs is concatenated to a list and reused
    # for every row, and airports is iterated once per date.
    attrs = list(attrs)
    airports = list(airports)
    columns = ['Date', 'Airport'] + attrs
    one_day = timedelta(days=1)
    rows = []
    datecurr = startdate
    while datecurr <= enddate:
        for airport in airports:
            try:
                rescurr = wscraper.scrape_from_date_and_airport(datecurr, airport, attrs)
            except ValueError:
                # Parenthesised single-argument form prints identically on
                # Python 2 and Python 3.
                print("Failed for airport {0} and date {1}".format(airport, datecurr))
                rescurr = {attr: None for attr in attrs}
            rows.append([datecurr, airport] + [rescurr[attr] for attr in attrs])
            time.sleep(delay)
        datecurr += one_day
    return pd.DataFrame(rows, columns=columns)

# Run Scraper Loop

In [37]:
# Read the station listing (skipping its 41 header lines and the first 20
# characters of every line), build the airport -> station lookup from it,
# and hand that lookup to the scraper.
stationcodes = StationCodesFile('station_codes.txt', headerlines=41, startcol=20)
stationdict = stationcodes.code_mapping()
wscraper = WundergroundScraper(stationdict=stationdict)

In [39]:
# Load all the unique airports that occur in the flight data.
unique_airports = np.load(os.path.join(SUBDIR, 'airports.npy'))

# Scrape temperature and precipitation for each of them, Nov. 1 - Nov. 30, 2013,
# pausing 0.1 s after every request.
dfweather = scrape_all_weather(
    wscraper,
    pd.to_datetime('11/01/2013'),
    pd.to_datetime('11/30/2013'),
    unique_airports,
    ['TEMP', 'PRECIP'],
    delay=0.1)

# Save the result to csv.
dfweather.to_csv(os.path.join(SUBDIR, 'weather.csv'), encoding='utf-8')