In [139]:
%matplotlib notebook
import pandas as pd


In [140]:
# Load the raw report table and keep only the rows whose `location` is a
# string (rows where it is anything else, e.g. a missing value, are dropped).
df = pd.read_csv('data/zika/cdc_zika.csv', sep=',')
df = df[df.location.map(lambda s: isinstance(s, str))]

# The data fields to keep are listed in a separate file.
# NOTE(review): read_csv consumes the first line of this file as a header, so
# that line never ends up in `target_fields` -- confirm the file has a header row.
target_fields = pd.read_csv('data/zika/target_data_fields.txt')
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
target_fields = set(target_fields.values.flatten())

reports = df[df['data_field'].map(
    lambda s: s in target_fields)]
print('n. reported cases {}'.format(len(reports)))

unique_places = reports.location.unique()
print('n. unique places {}'.format(len(unique_places)))

# A location without '-' is a single-segment ("country only") name; the first
# segment of a multi-segment location is its country.  A country-only location
# is removed when finer-grained locations exist for that same country.
country_locations = {p for p in unique_places if '-' not in p}
countries = {p.split('-')[0] for p in unique_places if '-' in p}
locations_to_remove = country_locations.intersection(countries)
reports = reports[reports.location.map(lambda l: l not in locations_to_remove)]
print('after removing country locations, n. reported cases {}'.format(len(reports)))

n. reported cases 5221
n. unique places 416
after removing country locations, n. reported cases 4845


  interactivity=interactivity, compiler=compiler, result=result)


In [141]:
# Column names for the airports table; because `names` is supplied explicitly,
# read_csv treats every line of the file as data.
names = [
    'Airport ID', 'Name', 'City', 'Country',
    'IATA', 'ICAO',
    'Latitude', 'Longitude', 'Altitude',
    'Timezone', 'DST', 'Tz',
    'Type', 'Source',
]
airports = pd.read_csv('data/zika/airports.dat', names=names, sep=',')

In [142]:
def map_location_to_airports(location, airports):
    """Return the sorted, de-duplicated IATA codes of airports matching `location`.

    Parameters
    ----------
    location : str
        Location name whose segments are separated by '-' and whose words are
        separated by '_'.  With more than one segment, the second segment is
        matched against the 'City' column; with a single segment, that segment
        is matched against the 'Country' column.
    airports : pandas.DataFrame
        Table with at least 'City', 'Country' and 'IATA' columns.

    Returns
    -------
    list of str
        Sorted unique IATA codes of the matching rows.  The placeholder
        '\\N' and non-string entries (e.g. NaN for a missing code) are
        skipped; the list is empty when nothing matches.
    """
    segs = location.split('-')
    if len(segs) > 1:
        field, raw = 'City', segs[1]
    else:
        field, raw = 'Country', segs[0]
    val = raw.replace('_', ' ')

    # Select the IATA column directly; this avoids `as_matrix()`, which was
    # removed in pandas 1.0.
    matched = airports.loc[airports[field] == val, 'IATA']

    # Keeping only real strings prevents `sorted` from failing on a mix of
    # str and float when a code is missing (NaN).
    return sorted({c for c in matched if isinstance(c, str) and c != '\\N'})

In [143]:
# Forward map: every distinct report location -> its list of airport codes.
unique_locations = reports.location.unique()
loc2airports = {
    loc: map_location_to_airports(loc, airports)
    for loc in unique_locations
}

# Reverse map: airport code -> location.  When a code is listed for several
# locations, the location processed last wins.
airport2loc = {}
for loc, airport_codes in loc2airports.items():
    for code in airport_codes:
        airport2loc[code] = loc

In [144]:
# Code lists with more than one airport: these get collapsed onto one code later.
airports_to_merge = list(filter(lambda codes: len(codes) > 1, loc2airports.values()))
# Representative airport of a location = first code of its list;
# locations with no airport at all are left out.
loc2airport = {
    l: codes[0]
    for l, codes in loc2airports.items()
    if codes
}
# airports_to_merge

In [145]:
import networkx as nx
# now construct location network
# Column names for the routes table ('src' / 'des' hold the airport codes used
# below); with `names` given, read_csv treats every line of the file as data.
names = [
    'Airline',
    'Airline ID',
    'src',
    'src_id',
    'des',
    'des_id',
    'Codeshare',
    'Stops',
    'Equipment',
]
routes = pd.read_csv('data/zika/routes.dat', names=names, sep=',')

routes.head(2)

Unnamed: 0,Airline,Airline ID,src,src_id,des,des_id,Codeshare,Stops,Equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2


In [146]:
# merge the names
# For every group of codes, rewrite each non-first code to the group's first
# code in both route endpoints.
for codes in airports_to_merge:
    head, tails = codes[0], codes[1:]
    for endpoint in ('src', 'des'):
        # .loc with a boolean mask modifies `routes` in place
        routes.loc[routes[endpoint].isin(tails), endpoint] = head
    

In [159]:
from graph_tool import Graph
from tqdm import tqdm
# Build a directed graph with one vertex per airport code appearing in
# `routes` and one edge per distinct (src, des) pair, then save it to disk.
g = Graph(directed=True)
all_airports = sorted(set(routes.src.unique()) | set(routes.des.unique()))
if all_airports:
    # add all vertices in a single call instead of one call per airport
    g.add_vertex(len(all_airports))

# airport code -> vertex index (position in the sorted code list)
code2node = {a: i for i, a in enumerate(all_airports)}

# (longitude, latitude) of the first row seen for each IATA code.  Building
# this lookup once replaces a full scan of `airports` per vertex.
iata2lonlat = {}
for iata, lon, lat in zip(airports['IATA'], airports['Longitude'], airports['Latitude']):
    iata2lonlat.setdefault(iata, (lon, lat))

code = g.new_vertex_property('string')
lonlat = g.new_vertex_property('vector<float>')
for a, i in code2node.items():
    code[i] = a
    # codes that are absent from the airports table are deliberately skipped:
    # their `lonlat` entry is left at the property's default value
    if a in iata2lonlat:
        lonlat[i] = iata2lonlat[a]
g.vertex_properties['code'] = code
g.vertex_properties['lonlat'] = lonlat

# Collect edges in a set so that several routes between the same pair of
# airports produce a single edge.  Iterating the two columns directly is much
# cheaper than DataFrame.iterrows().
all_edges = set()
for src, des in tqdm(zip(routes['src'], routes['des']), total=len(routes)):
    all_edges.add((code2node[src], code2node[des]))

for u, v in tqdm(all_edges):
    g.add_edge(u, v)
g.save('data/zika/graph.gt')

67663it [00:10, 6579.10it/s]
100%|██████████| 37307/37307 [00:03<00:00, 11714.05it/s]


In [100]:
# filter out city without airport
# A report is kept only when its location has a representative airport and
# that airport is a vertex of the route graph.
print('before, #reports {}'.format(len(reports)))
has_airport_node = reports.location.map(lambda l: loc2airport.get(l) in code2node)
reports = reports[has_airport_node]
print('after, #reports {}'.format(len(reports)))

before, #reports 895
after, #reports 773


In [101]:
# Vertex index of the representative airport of each report's location.
report_node_id = [code2node[loc2airport[l]] for l in reports['location']]
# `reports` is the result of boolean filtering, so adding a column to it in
# place triggers pandas' SettingWithCopyWarning; `assign` builds a new frame
# with the extra column instead.
reports = reports.assign(airport_node=report_node_id)

In [119]:
# Smallest `report_date` per location.  The string aggregation 'min' is used
# instead of `np.min` because numpy is not imported in any of the cells shown,
# which makes `np.min` a NameError on a fresh kernel.
location_by_first_report = reports[['location', 'report_date']].groupby('location').agg({'report_date': 'min'})

In [131]:
# location -> first report date, as a plain dict
loc2first_report = location_by_first_report['report_date'].to_dict()
# One row per location: (first report date, location, graph vertex index of
# the location's representative airport).
first_reports = pd.DataFrame(
    [
        (date, l, code2node[loc2airport[l]])
        for l, date in loc2first_report.items()
    ],
    columns=['date', 'location', 'node_id'],
)

In [133]:
# Write the first-report table to disk as comma-separated text (the frame's
# index is written as well).
first_reports.to_csv(path_or_buf='data/zika/cascades.csv', sep=',')

In [162]:
# Display the (rows, columns) shape of the first-report table.
first_reports.shape

(56, 3)