# Gathering Influenza Related Data via the DELPHI Epidata API
### Purpose of this notebook
* Gather the Wikipedia Pageviews and ILInet data from the DELPHI epidata API
* Store the data into pandas DataFrames to facilitate analysis (in other notebooks)
* Create a DataFrame that maps Epiweeks to their corresponding Datetimes

*The relevant computed variables are stored in IPython's local data store to avoid recomputation. These variables can be restored in other notebooks using the `%store -r` magic command.*

In [1]:
import epidata as delphi
import pandas as pd
import matplotlib.pyplot as plt
import requests
import time
from bs4 import BeautifulSoup
from collections import defaultdict
from datetime import datetime, timedelta
%matplotlib inline

# Restore any results saved by an earlier run from IPython's %store data
# store, so the cells below can skip recomputing them.
%store -r pageViewResps
%store -r wILIresp
%store -r pageViews
%store -r wILI
%store -r epiweeksDf
%store -r rawPageViews

# Snapshot of the names defined in this namespace right after the restore;
# each later cell checks membership in this set to decide whether its
# variable must be (re)computed.
definedVariables = set(vars().keys())

#### Getting Wikipedia Page Views for Flu Related Articles

In [2]:
if 'pageViewResps' not in definedVariables:
    
    print("Calling the DELPHI epidata API for wikipedia pageviews")
    
    epidata = delphi.Epidata() # interface to CMU delphi API

    with open('./data/allarticles.txt') as f:
        fluRelatedArticles = [article.strip() for article in f]
    years = range(2008, 2017) # 2008 - 2016 full years
    epiranges = [ epidata.range(int(str(yr) + '01'), int(str(yr) + '52')) for yr in years]
    pageViewResps = []
    # API calls to the delphi epidata API
    for epiyear in epiranges:
        resp = epidata.wiki(fluRelatedArticles, epiweeks=epiyear)['epidata']
        pageViewResps.extend(resp)
        time.sleep(15)
    %store pageViewResps
else:
    print("Found pageViewResps")
    
pageViewResps[:2]

Found pageViewResps


[{'article': 'amantadine',
  'count': 2201,
  'epiweek': 200801,
  'hour': -1,
  'total': 960694119,
  'value': 2.29105181},
 {'article': 'antiviral_drugs',
  'count': 135,
  'epiweek': 200801,
  'hour': -1,
  'total': 960694119,
  'value': 0.1405234}]

#### Getting national level ILInet data

In [3]:
if 'wILIresp' not in definedVariables:
    
    print("Calling the DELPHI epidata API for ILInet data")
    
    wILIresp = epidata.fluview('nat', epidata.range(200801, 201652))['epidata']
    %store wILIresp
else:
    print("Found wILIresp")

wILIresp[:2]

Found wILIresp


[{'epiweek': 200801,
  'ili': 2.2540483926029,
  'issue': 201352,
  'lag': 312,
  'num_age_0': 3737,
  'num_age_1': 2568,
  'num_age_2': 2731,
  'num_age_3': None,
  'num_age_4': None,
  'num_age_5': 670,
  'num_ili': 9706,
  'num_patients': 430603,
  'num_providers': 1486,
  'region': 'nat',
  'release_date': '2013-12-31',
  'wili': 2.4393875484528},
 {'epiweek': 200802,
  'ili': 2.0914715379231,
  'issue': 201352,
  'lag': 311,
  'num_age_0': 3393,
  'num_age_1': 3644,
  'num_age_2': 3262,
  'num_age_3': None,
  'num_age_4': None,
  'num_age_5': 816,
  'num_ili': 11115,
  'num_patients': 531444,
  'num_providers': 1509,
  'region': 'nat',
  'release_date': '2013-12-31',
  'wili': 2.2931912803177}]

#### Putting pageViews API response data in DataFrame

In [4]:
if 'pageViews' not in definedVariables:
    pageToViews = defaultdict(list)
    pageViewsIndex = { week['epiweek'] for week in pageViewResps }
    pageViewsIndex = list(pageViewsIndex)
    pageViewsIndex.sort()

    # map each article to it's weekly view counts (from 2008 to 2016)
    for week in pageViewResps:
        page, weeklyViews = week['article'], week['count']
        pageToViews[page].append(weeklyViews)

    pageViews = pd.DataFrame.from_dict(pageToViews, orient='index', dtype='int')
    pageViews.fillna(0)
    pageViews = pageViews.transpose()
    # convert to ints, for some reasons transpose() coereces to floats
    for column in pageViews.columns:
        pageViews[column] = pageViews[column].fillna(0.0).astype('int')
    pageViews.index = pageViewsIndex
    pageViews[:2]
    
    %store pageViews
else:
    print("Found pageViews")

pageViews[:5]

Found pageViews


Unnamed: 0,influenza_pandemic,cough,nasal_congestion,fatigue_(medical),hemagglutinin_(influenza),chills,influenza_a_virus_subtype_h5n1,influenza_a_virus_subtype_h7n7,human_flu,influenza_a_virus_subtype_h2n2,...,neuraminidase_inhibitor,influenza_a_virus_subtype_h7n9,shivering,oseltamivir,influenza_prevention,cat_flu,common_cold,orthomyxoviridae,viral_neuraminidase,influenza_a_virus_subtype_h1n2
200801,1030,7600,3981,957,0,777,218,6,350,155,...,362,0,1251,1124,0,319,19240,565,0,14
200802,1642,8972,3722,1045,0,858,304,12,453,205,...,461,0,1299,1501,0,339,21084,885,0,19
200803,1525,8435,3424,960,0,664,307,16,534,96,...,475,0,1274,1643,0,291,19131,844,0,22
200804,1502,8913,3469,440,0,755,464,12,577,41,...,520,0,1461,2151,0,298,19602,873,0,10
200805,1596,8828,3698,1174,0,1031,1006,9,735,47,...,636,0,1356,2936,0,354,20667,1040,0,12


#### Putting ILInet API response in DataFrame

In [5]:
if 'wILI' not in definedVariables:
    wILIvalues = [ week['ili'] for week in wILIresp ]
    wILIindex = [ week['epiweek'] for week in wILIresp ]
    wILIindex.sort()
    wILI = pd.DataFrame(wILIvalues, columns=['Weekly ILI'], index=wILIindex)
    wILI.drop([200853, 201453], inplace=True) # these epiweeks aren't in pageViews
    
    %store wILI
else:
    print("Found wILI")
wILI[:5]

Found wILI


Unnamed: 0,Weekly ILI
200801,2.254048
200802,2.091472
200803,2.359343
200804,3.323314
200805,4.43381


#### Creating DataFrame that maps Epiweek number to a datetime
This will prove useful when doing timeseries analysis, as dealing with epiweeks (e.g. 200840) isn't ideal. This data is taken from 
> https://ibis.health.state.nm.us/resource/MMWRWeekCalendar.html

Instead of sending a GET request, I have pasted the source html in the data folder.

In [6]:
if 'epiweeksDf' not in definedVariables:
    
    print("Creating epiweeks dataframe")
    
    with open("data/epiweeks.html") as f:
        html = f.read().replace('\n', '')
        soup = BeautifulSoup( html, 'lxml' )
        tables = soup.findAll("table", {'class':'Info'})

    epiweeksDf = pd.DataFrame()

    for table in tables[::-1]:
        rows = iter(table.findAll('tr'))
        next(rows) # skip header
        years = [int(year) for year in next(rows).text.split()]
        df = pd.DataFrame(columns=years)
        for i, row in enumerate(rows):
            weeks = [ datetime.strptime(d, '%m/%d/%Y') for d in row.text.split()[1:] ]
            if len(weeks) == 5:
                df.loc[i+1] = weeks
        epiweeksDf = pd.concat([epiweeksDf, df], axis=1)
    %store epiweeksDf

else:
    print("Found epiweeksDf")

epiweeksDf[:5] 

Found epiweeksDf


Unnamed: 0,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
1,2006-01-07,2007-01-06,2008-01-05,2009-01-10,2010-01-09,2011-01-08,2012-01-07,2013-01-05,2014-01-04,2015-01-10,2016-01-09,2017-01-07,2018-01-06,2019-01-05,2020-01-04
2,2006-01-14,2007-01-13,2008-01-12,2009-01-17,2010-01-16,2011-01-15,2012-01-14,2013-01-12,2014-01-11,2015-01-17,2016-01-16,2017-01-14,2018-01-13,2019-01-12,2020-01-11
3,2006-01-21,2007-01-20,2008-01-19,2009-01-24,2010-01-23,2011-01-22,2012-01-21,2013-01-19,2014-01-18,2015-01-24,2016-01-23,2017-01-21,2018-01-20,2019-01-19,2020-01-18
4,2006-01-28,2007-01-27,2008-01-26,2009-01-31,2010-01-30,2011-01-29,2012-01-28,2013-01-26,2014-01-25,2015-01-31,2016-01-30,2017-01-28,2018-01-27,2019-01-26,2020-01-25
5,2006-02-04,2007-02-03,2008-02-02,2009-02-07,2010-02-06,2011-02-05,2012-02-04,2013-02-02,2014-02-01,2015-02-07,2016-02-06,2017-02-04,2018-02-03,2019-02-02,2020-02-01


### Aligning and aggregating QUAC pageviews to epiweeks

In [7]:
if 'rawPageViews' not in definedVariables:
    rawHourlyPageViews = pd.read_csv('data/LANL/en_flu_raw.csv')
    rawHourlyPageViews = rawHourlyPageViews.set_index('timestamp')
    rawHourlyPageViews.index = rawHourlyPageViews.index.to_datetime()

    rawPageViews = pd.DataFrame(columns=rawHourlyPageViews.columns)

    for year in range(2008, 2015):
        for epiweekNum, end in enumerate(epiweeksDf[year]):
            start = end - timedelta(days=6)
            end += timedelta(hours=23)
            mask = ((rawHourlyPageViews.index >= start) & 
                    (rawHourlyPageViews.index <= end))
            # aggregate on epiweeks
            thisEpiweekPageViews = rawHourlyPageViews.loc[mask].sum()
            epiweekNum = str(epiweekNum) if epiweekNum >= 10 else '0' + str(epiweekNum)
            epiweekWithYear = int(str(year) + epiweekNum) + 1
            rawPageViews.loc[epiweekWithYear] = thisEpiweekPageViews.values
    # coerce to ints
    for col in rawPageViews.columns:
        rawPageViews[col] = rawPageViews[col].astype('int')
    # convert column names to make them the same as the pageViews df
    rawPageViews.columns = [ article[3:].lower() if len(article) >= 3 else article
                             for article in rawPageViews.columns ]
    rawPageViews = rawPageViews.loc[rawPageViews.index <= 201409]
    
    %store rawPageViews
else:
    print("Found rawPageViews")

rawPageViews[:5]

Found rawPageViews


Unnamed: 0,amantadine,antiviral_drugs,avian_influenza,canine_influenza,cat_flu,chills,common_cold,cough,equine_influenza,fatigue_(medical),...,rhinorrhea,rimantadine,shivering,sore_throat,swine_influenza,viral_neuraminidase,viral_pneumonia,vomiting,zanamivir,en
200801,2183,133,3254,196,306,773,18598,7567,183,957,...,1510,303,1246,2526,17,0,1221,8868,444,960694119
200802,2737,184,4823,222,325,853,20483,8936,206,1044,...,1820,364,1293,2711,21,0,1482,9759,591,1093556098
200803,2734,193,5433,199,286,660,18529,8395,260,960,...,1749,357,1271,2585,16,0,1526,10937,632,1150141794
200804,3073,171,6451,197,291,750,18929,8882,292,439,...,1783,453,1456,2658,24,0,1531,10247,697,1133601045
200805,3348,202,6714,215,350,1028,20128,8783,224,1174,...,1706,491,1354,2859,38,0,1653,10097,782,1146850974


In [8]:
# Compare the locally aggregated QUAC counts against the DELPHI counts on the
# epiweeks and articles that both frames share.
QUACepiweeks, DELPHIepiweeks = set(rawPageViews.index), set(pageViews.index)
commonEpiweeks = sorted(QUACepiweeks.intersection(DELPHIepiweeks))

QUACarticles, DELPHIarticles = set(rawPageViews.columns), set(pageViews.columns)
commonArticles = QUACarticles.intersection(DELPHIarticles)
# .loc needs an ordered list-like: newer pandas releases reject a set as an
# indexer, and a sorted list also gives a stable column order.
commonArticleList = sorted(commonArticles)

# Positive values mean the QUAC count exceeds the DELPHI count.
diff = (rawPageViews.loc[commonEpiweeks, commonArticleList]
        - pageViews.loc[commonEpiweeks, commonArticleList])
diff[['influenza', 'influenza_a_virus', 'flu_season', 'influenza_vaccine', 
      'common_cold', 'influenza_a_virus_subtype_h1n2']][:5]

Unnamed: 0,influenza,influenza_a_virus,flu_season,influenza_vaccine,common_cold,influenza_a_virus_subtype_h1n2
200801,-43,-6,-2,-3,-642,0
200802,-60,-10,0,-2,-601,0
200803,-58,-7,0,-1,-602,0
200804,-62,-6,0,-2,-673,0
200805,-63,-3,-1,-3,-539,0
