In [98]:
import json
from math import radians, cos, sin, asin, sqrt

import numpy as np
import pandas as pd
import requests

# Import, lowercase city/state name to attempt join later.
# Zip codes are always read as strings so leading zeros (e.g. '07675') survive.

# Geonames is ~1:1 City/Place names to Zip Codes
# Schema: http://download.geonames.org/export/zip/readme.txt
col_names = ['country_code',
             'postal_code',
             'place_name',
             'admin_name1',
             'admin_code1',
             'admin_name2',
             'admin_code2',
             'admin_name3',
             'admin_code3',
             'latitude',
             'longitude',
             'accuracy']
geonames = pd.read_table('GeoNamesZip.txt', header=None, names=col_names, dtype={'postal_code': str})
geonames['place_name'] = geonames['place_name'].str.lower()
geonames['admin_code1'] = geonames['admin_code1'].str.lower()
geonames_set = set(geonames['postal_code'])

# Free-zip 1 is 1:1 City to Zip Code
free_zip_db = pd.read_csv('free-zipcode-database-Primary.csv', dtype={'Zipcode': str})
free_zip_db['City'] = free_zip_db['City'].str.lower()
free_zip_db['State'] = free_zip_db['State'].str.lower()
freezip_set = set(free_zip_db['Zipcode'])

# Free-zip all is Many:1 City to Zip Code
free_zip_all = pd.read_csv('free-zipcode-database-all-places.csv', dtype={'Zipcode': str})
# We only want Primary/Acceptable locs. There is some junk here.
# .copy() makes the filtered frame independent of the raw one, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
free_zip_all = free_zip_all[free_zip_all['LocationType'].isin(['ACCEPTABLE', 'PRIMARY'])].copy()
free_zip_all['City'] = free_zip_all['City'].str.lower()
free_zip_all['State'] = free_zip_all['State'].str.lower()
freezip_all_set = set(free_zip_all['Zipcode'])

# Noncensus is ~1:1 City to Zip Code
noncensus = pd.read_csv('noncensus_zip.csv', dtype={'zip': str})
noncensus['city'] = noncensus['city'].str.lower()
noncensus['state'] = noncensus['state'].str.lower()
noncensus_set = set(noncensus['zip'])

# How many unique zip codes do we have in each dataset?
set_lengths = {
    'noncensus_set': len(noncensus_set),
    'free_zip_set': len(freezip_set),
    'free_zip_all_set': len(freezip_all_set),
    'geonames_set': len(geonames_set)
    }
set_lengths

{'free_zip_all_set': 42522,
 'free_zip_set': 42522,
 'geonames_set': 43586,
 'noncensus_set': 43524}

In [99]:
# Total row count per dataset (duplicate zips included), in the order
# geonames, free-zip primary, free-zip all places, noncensus
tuple(len(frame) for frame in (geonames, free_zip_db, free_zip_all, noncensus))

(43629, 42522, 56725, 43524)

In [100]:
# Keep only zip, city, state and coordinates from each dataset, and tag every
# column with its source (_geo / _free / _non) so the names stay distinct once joined
geo_zip_place_loc = (
    geonames[['postal_code', 'place_name', 'admin_code1', 'latitude', 'longitude']]
    .rename(columns={'postal_code': 'Zip_geo',
                     'place_name': 'City_geo',
                     'admin_code1': 'State_geo',
                     'latitude': 'Lat_geo',
                     'longitude': 'Lon_geo'})
)

free_zip_all_place_loc = (
    free_zip_all[['Zipcode', 'City', 'State', 'Lat', 'Long']]
    .rename(columns={'Zipcode': 'Zip_free',
                     'City': 'City_free',
                     'State': 'State_free',
                     'Lat': 'Lat_free',
                     'Long': 'Lon_free'})
)

noncensus_place_loc = (
    noncensus[['zip', 'city', 'state', 'latitude', 'longitude']]
    .rename(columns={'zip': 'Zip_non',
                     'city': 'City_non',
                     'state': 'State_non',
                     'latitude': 'Lat_non',
                     'longitude': 'Lon_non'})
)

Given that geonames contains the most unique zip codes, we're going to use it as the common join table between the other two.

In [141]:
# Join free-zip and geonames, right outer because we trust geonames a bit more
def _right_join_free_geo(key_fields):
    """
    Right-join free-zip onto geonames, matching '<field>_free' to '<field>_geo'
    for every field stem in key_fields (e.g. ['Zip', 'City']).
    """
    return pd.merge(free_zip_all_place_loc, geo_zip_place_loc,
                    left_on=[stem + '_free' for stem in key_fields],
                    right_on=[stem + '_geo' for stem in key_fields],
                    how='right')

merged_free_geo_zip = _right_join_free_geo(['Zip'])
merged_free_geo_zip_city = _right_join_free_geo(['Zip', 'City'])
merged_free_geo_zip_state = _right_join_free_geo(['Zip', 'State'])
merged_free_geo_zip_city_state = _right_join_free_geo(['Zip', 'City', 'State'])

# Row count produced by each choice of join keys
lengths_free_geo = {
    'merged_zip': len(merged_free_geo_zip),
    'merged_zip_city': len(merged_free_geo_zip_city),
    'merged_zip_state': len(merged_free_geo_zip_state),
    'merged_zip_city_state': len(merged_free_geo_zip_city_state)
}
lengths_free_geo

{'merged_zip': 57755,
 'merged_zip_city': 43629,
 'merged_zip_city_state': 43629,
 'merged_zip_state': 57755}

In [132]:
# Peek at the first rows of the zip-only free-zip/geonames join
merged_free_geo_zip.head(n=5)

Unnamed: 0,Zip_free,City_free,State_free,Lat_free,Lon_free,Zip_geo,City_geo,State_geo,Lat_geo,Lon_geo
0,7675,westwood,nj,40.98,-74.03,7675,westwood,nj,41.0092,-74.0041
1,7675,old tappan,nj,40.98,-74.03,7675,westwood,nj,41.0092,-74.0041
2,7675,river vale,nj,40.98,-74.03,7675,westwood,nj,41.0092,-74.0041
3,7675,rivervale,nj,40.98,-74.03,7675,westwood,nj,41.0092,-74.0041
4,7677,woodcliff lake,nj,41.02,-74.05,7677,woodcliff lake,nj,41.0234,-74.0603


In [140]:
# Join noncensus onto geonames. This is a LEFT join with geonames on the left:
# every geonames row is kept, and the noncensus columns are NaN where nothing matches.
def _left_join_geo_non(key_fields):
    """
    Left-join noncensus onto geonames, matching '<field>_geo' to '<field>_non'
    for every field stem in key_fields (e.g. ['Zip', 'State']).
    """
    return pd.merge(geo_zip_place_loc, noncensus_place_loc,
                    left_on=[stem + '_geo' for stem in key_fields],
                    right_on=[stem + '_non' for stem in key_fields],
                    how='left')

merged_non_geo_zip = _left_join_geo_non(['Zip'])
merged_non_geo_zip_city = _left_join_geo_non(['Zip', 'City'])
merged_non_geo_zip_state = _left_join_geo_non(['Zip', 'State'])
merged_non_geo_zip_city_state = _left_join_geo_non(['Zip', 'City', 'State'])

# Row count produced by each choice of join keys
lengths_non_geo = {
    'merged_zip': len(merged_non_geo_zip),
    'merged_zip_city': len(merged_non_geo_zip_city),
    'merged_zip_state': len(merged_non_geo_zip_state),
    'merged_zip_city_state': len(merged_non_geo_zip_city_state)
}
lengths_non_geo

{'merged_zip': 43629,
 'merged_zip_city': 43629,
 'merged_zip_city_state': 43629,
 'merged_zip_state': 43629}

In [137]:
# Peek at the zip-only geonames/noncensus join.
# (The frame built by the join cell is merged_non_geo_zip; no frame named
# merged_non_free_zip is ever created, so that name fails on a fresh kernel.)
merged_non_geo_zip.head()

Unnamed: 0,Zip_geo,City_geo,State_geo,Lat_geo,Lon_geo,Zip_non,City_non,State_non,Lat_non,Lon_non
0,34050,fpo,aa,41.0375,-111.6789,,,,,
1,34034,apo,aa,33.0364,-82.2493,,,,,
2,99553,akutan,ak,54.143,-165.7854,99553.0,akutan,ak,54.098693,-165.88176
3,99571,cold bay,ak,55.3976,-162.4206,99571.0,cold bay,ak,55.315003,-162.634
4,99583,false pass,ak,54.841,-163.4368,99583.0,false pass,ak,54.849999,-163.42011


In [154]:
# Combine the two zip-level joins on the shared geonames zip + state.
# Both inputs also carry City_geo / Lat_geo / Lon_geo, which are not join keys,
# so pandas emits them twice with _x (free/geo side) and _y (geo/non side) suffixes.
merged_all = merged_free_geo_zip.merge(merged_non_geo_zip,
                                       on=['Zip_geo', 'State_geo'],
                                       how='inner')
merged_all

Unnamed: 0,Zip_free,City_free,State_free,Lat_free,Lon_free,Zip_geo,City_geo_x,State_geo,Lat_geo_x,Lon_geo_x,City_geo_y,Lat_geo_y,Lon_geo_y,Zip_non,City_non,State_non,Lat_non,Lon_non
0,07675,westwood,nj,40.98,-74.03,07675,westwood,nj,41.0092,-74.0041,westwood,41.0092,-74.0041,07675,westwood,nj,40.999040,-74.032910
1,07675,old tappan,nj,40.98,-74.03,07675,westwood,nj,41.0092,-74.0041,westwood,41.0092,-74.0041,07675,westwood,nj,40.999040,-74.032910
2,07675,river vale,nj,40.98,-74.03,07675,westwood,nj,41.0092,-74.0041,westwood,41.0092,-74.0041,07675,westwood,nj,40.999040,-74.032910
3,07675,rivervale,nj,40.98,-74.03,07675,westwood,nj,41.0092,-74.0041,westwood,41.0092,-74.0041,07675,westwood,nj,40.999040,-74.032910
4,07677,woodcliff lake,nj,41.02,-74.05,07677,woodcliff lake,nj,41.0234,-74.0603,woodcliff lake,41.0234,-74.0603,07677,woodcliff lake,nj,41.025101,-74.059762
5,07677,westwood,nj,41.02,-74.05,07677,woodcliff lake,nj,41.0234,-74.0603,woodcliff lake,41.0234,-74.0603,07677,woodcliff lake,nj,41.025101,-74.059762
6,07677,woodcliff lk,nj,41.02,-74.05,07677,woodcliff lake,nj,41.0234,-74.0603,woodcliff lake,41.0234,-74.0603,07677,woodcliff lake,nj,41.025101,-74.059762
7,07885,wharton,nj,40.89,-74.58,07885,wharton,nj,40.9139,-74.5863,wharton,40.9139,-74.5863,07885,wharton,nj,40.913298,-74.582460
8,07981,whippany,nj,40.82,-74.41,07981,whippany,nj,40.8219,-74.4200,whippany,40.8219,-74.4200,07981,whippany,nj,40.821482,-74.426480
9,07999,whippany,nj,40.82,-74.41,07999,whippany,nj,40.8673,-74.5783,whippany,40.8673,-74.5783,07999,whippany,nj,40.867331,-74.578269


In [156]:
# Group the columns by field (zip, city, state, lat, lon) with the three sources
# side by side. Only the _x copy of the duplicated geonames columns is kept,
# and its suffix is stripped again.
master_zips = (
    merged_all[['Zip_free', 'Zip_geo', 'Zip_non',
                'City_free', 'City_geo_x', 'City_non',
                'State_free', 'State_geo', 'State_non',
                'Lat_free', 'Lat_geo_x', 'Lat_non',
                'Lon_free', 'Lon_geo_x', 'Lon_non']]
    .rename(columns={'City_geo_x': 'City_geo',
                     'Lat_geo_x': 'Lat_geo',
                     'Lon_geo_x': 'Lon_geo'})
)
master_zips

Unnamed: 0,Zip_free,Zip_geo,Zip_non,City_free,City_geo,City_non,State_free,State_geo,State_non,Lat_free,Lat_geo,Lat_non,Lon_free,Lon_geo,Lon_non
0,07675,07675,07675,westwood,westwood,westwood,nj,nj,nj,40.98,41.0092,40.999040,-74.03,-74.0041,-74.032910
1,07675,07675,07675,old tappan,westwood,westwood,nj,nj,nj,40.98,41.0092,40.999040,-74.03,-74.0041,-74.032910
2,07675,07675,07675,river vale,westwood,westwood,nj,nj,nj,40.98,41.0092,40.999040,-74.03,-74.0041,-74.032910
3,07675,07675,07675,rivervale,westwood,westwood,nj,nj,nj,40.98,41.0092,40.999040,-74.03,-74.0041,-74.032910
4,07677,07677,07677,woodcliff lake,woodcliff lake,woodcliff lake,nj,nj,nj,41.02,41.0234,41.025101,-74.05,-74.0603,-74.059762
5,07677,07677,07677,westwood,woodcliff lake,woodcliff lake,nj,nj,nj,41.02,41.0234,41.025101,-74.05,-74.0603,-74.059762
6,07677,07677,07677,woodcliff lk,woodcliff lake,woodcliff lake,nj,nj,nj,41.02,41.0234,41.025101,-74.05,-74.0603,-74.059762
7,07885,07885,07885,wharton,wharton,wharton,nj,nj,nj,40.89,40.9139,40.913298,-74.58,-74.5863,-74.582460
8,07981,07981,07981,whippany,whippany,whippany,nj,nj,nj,40.82,40.8219,40.821482,-74.41,-74.4200,-74.426480
9,07999,07999,07999,whippany,whippany,whippany,nj,nj,nj,40.82,40.8673,40.867331,-74.41,-74.5783,-74.578269


In [163]:
# Get a Primary City/State based on dataset agreement. Default to noncensus as "most trusted"
def get_primary(r):
    """
    Pick a consensus city and state for one row of the merged zip table.

    Reads the City_* and State_* values of the three datasets (free, geo, non)
    and writes the winners to r['Primary_City'] and r['Primary_State'].
    Values that are not strings (e.g. the NaN left by an outer join) do not vote.

    The value named by the most datasets wins. Ties - including a three-way
    disagreement - go to the most trusted dataset that has a value, in the
    order noncensus, geonames, free-zip. If no dataset has a string value,
    the noncensus value is passed through unchanged.

    r is the row (anything supporting r[key] get/set); it is modified in place
    and returned so the function can be used with DataFrame.apply(axis=1).
    """
    def get_most_common(by_trust):
        # by_trust lists the candidate values, most trusted dataset first
        present = [val for val in by_trust if isinstance(val, str)]
        if not present:
            # Nothing usable: fall back to the noncensus value of this field
            return by_trust[0]
        # max() returns the first of several equally frequent values, and
        # `present` is in trust order, so ties resolve to the most trusted one
        return max(present, key=present.count)

    # Each field falls back to its own noncensus column, so a city name can
    # never end up in Primary_State
    r['Primary_City'] = get_most_common([r['City_non'], r['City_geo'], r['City_free']])
    r['Primary_State'] = get_most_common([r['State_non'], r['State_geo'], r['State_free']])
    return r
# Row-wise pass: adds the Primary_City / Primary_State columns
master_zips = master_zips.apply(get_primary, axis='columns')
master_zips.head(n=5)

Unnamed: 0,Zip_free,Zip_geo,Zip_non,City_free,City_geo,City_non,State_free,State_geo,State_non,Lat_free,Lat_geo,Lat_non,Lon_free,Lon_geo,Lon_non,Primary_City,Primary_State
0,7675,7675,7675,westwood,westwood,westwood,nj,nj,nj,40.98,41.0092,40.99904,-74.03,-74.0041,-74.03291,westwood,nj
1,7675,7675,7675,old tappan,westwood,westwood,nj,nj,nj,40.98,41.0092,40.99904,-74.03,-74.0041,-74.03291,westwood,nj
2,7675,7675,7675,river vale,westwood,westwood,nj,nj,nj,40.98,41.0092,40.99904,-74.03,-74.0041,-74.03291,westwood,nj
3,7675,7675,7675,rivervale,westwood,westwood,nj,nj,nj,40.98,41.0092,40.99904,-74.03,-74.0041,-74.03291,westwood,nj
4,7677,7677,7677,woodcliff lake,woodcliff lake,woodcliff lake,nj,nj,nj,41.02,41.0234,41.025101,-74.05,-74.0603,-74.059762,woodcliff lake,nj


In [167]:
# Using the TwoFishes geocoder (http://twofishes.net/), try to get better geo accuracy
# In related news, this is a nice way to stress test a geocoder
def get_twofishes(row, twofishes_url='http://localhost:8081/search/geocode', timeout=10):
    """
    Geocode a row's primary city/state with a TwoFishes server.

    Reads row['Primary_City'] and row['Primary_State'], sends them as
    "<city>, <state>" to the geocode endpoint and stores the center of the
    first interpretation in row['Lat_TwoFishes'] / row['Lon_TwoFishes'].
    Both are set to NaN when the city or state is missing (not a non-empty
    string) or when the geocoder returns no interpretations.

    twofishes_url -- geocode endpoint to query
    timeout       -- seconds to wait for the server before requests raises

    The row is modified in place and returned so the function can be used
    with DataFrame.apply(axis=1). Connection errors and timeouts propagate.
    """
    city, state = row['Primary_City'], row['Primary_State']
    center = {'lat': np.nan, 'lng': np.nan}
    # NaN is truthy, so check the type as well: a missing city must not be
    # sent to the geocoder as the literal text "nan"
    if isinstance(city, str) and isinstance(state, str) and city and state:
        # json.dumps escapes quotes/backslashes that would corrupt a
        # hand-formatted JSON string
        query = json.dumps({'query': '{}, {}'.format(city, state)})
        tf_resp = requests.get(twofishes_url, params={'json': query}, timeout=timeout)
        interpretations = tf_resp.json().get('interpretations')
        if interpretations:
            center = interpretations[0]['feature']['geometry']['center']
    row['Lat_TwoFishes'] = center['lat']
    row['Lon_TwoFishes'] = center['lng']
    return row

# Row-wise pass: one geocoder request per row, adds Lat_TwoFishes / Lon_TwoFishes
master_zips = master_zips.apply(get_twofishes, axis='columns')

In [168]:
# Inspect the first 20 rows, now including the TwoFishes coordinates
master_zips.head(n=20)

Unnamed: 0,Zip_free,Zip_geo,Zip_non,City_free,City_geo,City_non,State_free,State_geo,State_non,Lat_free,Lat_geo,Lat_non,Lon_free,Lon_geo,Lon_non,Primary_City,Primary_State,Lat_TwoFishes,Lon_TwoFishes
0,7675,7675,7675,westwood,westwood,westwood,nj,nj,nj,40.98,41.0092,40.99904,-74.03,-74.0041,-74.03291,westwood,nj,40.99121,-74.03264
1,7675,7675,7675,old tappan,westwood,westwood,nj,nj,nj,40.98,41.0092,40.99904,-74.03,-74.0041,-74.03291,westwood,nj,40.99121,-74.03264
2,7675,7675,7675,river vale,westwood,westwood,nj,nj,nj,40.98,41.0092,40.99904,-74.03,-74.0041,-74.03291,westwood,nj,40.99121,-74.03264
3,7675,7675,7675,rivervale,westwood,westwood,nj,nj,nj,40.98,41.0092,40.99904,-74.03,-74.0041,-74.03291,westwood,nj,40.99121,-74.03264
4,7677,7677,7677,woodcliff lake,woodcliff lake,woodcliff lake,nj,nj,nj,41.02,41.0234,41.025101,-74.05,-74.0603,-74.059762,woodcliff lake,nj,41.02343,-74.06653
5,7677,7677,7677,westwood,woodcliff lake,woodcliff lake,nj,nj,nj,41.02,41.0234,41.025101,-74.05,-74.0603,-74.059762,woodcliff lake,nj,41.02343,-74.06653
6,7677,7677,7677,woodcliff lk,woodcliff lake,woodcliff lake,nj,nj,nj,41.02,41.0234,41.025101,-74.05,-74.0603,-74.059762,woodcliff lake,nj,41.02343,-74.06653
7,7885,7885,7885,wharton,wharton,wharton,nj,nj,nj,40.89,40.9139,40.913298,-74.58,-74.5863,-74.58246,wharton,nj,40.89315,-74.58183
8,7981,7981,7981,whippany,whippany,whippany,nj,nj,nj,40.82,40.8219,40.821482,-74.41,-74.42,-74.42648,whippany,nj,40.82454,-74.4171
9,7999,7999,7999,whippany,whippany,whippany,nj,nj,nj,40.82,40.8673,40.867331,-74.41,-74.5783,-74.578269,whippany,nj,40.82454,-74.4171


In [None]:
# Let's check the max Haversine distance between points
def check_haversine(row, earth_radius_km=6367, agree_km=1.0):
    """
    Great circle distance between our Lat/Lon points (dec degrees)

    Compares the coordinates of the four sources (TwoFishes, non, geo, free)
    pairwise, skipping any pair with a NaN coordinate, and writes to the row:

      Max_Haversine_Dist / Max_Dist_Pair -- largest pairwise distance and its label
      Min_Haversine_Dist / Min_Dist_Pair -- smallest pairwise distance and its label
      Primary_Geo_Source                 -- capitalized name of the chosen source
      Lat_primary / Lon_primary          -- coordinates of the chosen source

    Choice of source: if TwoFishes and noncensus are less than agree_km apart,
    TwoFishes is used; otherwise the more trusted member of the closest pair.
    When no pair can be computed, the only source with complete coordinates is
    used, and if there is none the three primary fields are NaN.

    earth_radius_km -- sphere radius used for the distance, in kilometers
    agree_km        -- TwoFishes/noncensus agreement threshold, in kilometers

    The row is modified in place and returned so the function can be used
    with DataFrame.apply(axis=1).
    """
    # Ranking in order of "most trusted" to "least trusted"
    ranks = {1: "TwoFishes", 2: "non", 3: "geo", 4: "free"}

    # (lat, lon) of every source, converted to radians
    coords = {rank: (radians(row['Lat_' + name]), radians(row['Lon_' + name]))
              for rank, name in ranks.items()}

    # 1 vs 2, 1 vs 3, 1 vs 4, 2 vs 3, 2 vs 4, 3 vs 4
    # (list order matters: max()/min() keep the first pair on equal distances)
    pair_labels = [((1, 2), 'TwoFishes vs Non'),
                   ((1, 3), 'TwoFishes vs Geo'),
                   ((1, 4), 'TwoFishes vs. Free'),
                   ((2, 3), 'Non vs. Geo'),
                   ((2, 4), 'Non vs Free'),
                   ((3, 4), 'Geo vs Free')]

    def has_nan(values):
        return any(np.isnan(val) for val in values)

    def get_dist(lat_1, lon_1, lat_2, lon_2):
        under_root = (sin((lat_2 - lat_1)/2)**2 +
                      cos(lat_1) * cos(lat_2) * sin((lon_2 - lon_1)/2)**2)
        # Rounding can push the term a hair above 1 for near-antipodal points,
        # which would make asin() raise; clamp it
        return earth_radius_km * 2 * asin(sqrt(min(1.0, under_root)))

    # Only measure pairs that don't have nan vals
    distanced_pairs = []
    for (rank_a, rank_b), label in pair_labels:
        data = coords[rank_a] + coords[rank_b]
        if has_nan(data):
            continue
        distanced_pairs.append({'Pair': label,
                                'Best_Rank': min(rank_a, rank_b),
                                'Dist': get_dist(*data)})

    if distanced_pairs:
        max_dist = max(distanced_pairs, key=lambda p: p['Dist'])
        min_dist = min(distanced_pairs, key=lambda p: p['Dist'])
        # If TwoFishes and Non agree closely, go with that data point.
        # Otherwise, take the closest pair and trust its better-ranked member.
        fish_non = [p for p in distanced_pairs if p['Pair'] == 'TwoFishes vs Non']
        if fish_non and fish_non[0]['Dist'] < agree_km:
            chosen_rank = fish_non[0]['Best_Rank']
        else:
            chosen_rank = min_dist['Best_Rank']
    else:
        # Zero pairs: at most one source has both coordinates. Use it if it exists.
        max_dist = {'Dist': np.nan, 'Pair': 'No Pair Found'}
        min_dist = {'Dist': np.nan, 'Pair': 'No Pair Found'}
        usable = [rank for rank in sorted(ranks) if not has_nan(coords[rank])]
        chosen_rank = usable[0] if usable else None

    row['Max_Haversine_Dist'] = max_dist['Dist']
    row['Max_Dist_Pair'] = max_dist['Pair']

    row['Min_Haversine_Dist'] = min_dist['Dist']
    row['Min_Dist_Pair'] = min_dist['Pair']

    if chosen_rank is None:
        row['Primary_Geo_Source'] = np.nan
        row['Lat_primary'] = np.nan
        row['Lon_primary'] = np.nan
    else:
        trusted = ranks[chosen_rank]
        row['Primary_Geo_Source'] = trusted.capitalize()
        row['Lat_primary'] = row['Lat_' + trusted]
        row['Lon_primary'] = row['Lon_' + trusted]
    return row

# Row-wise pass: adds the min/max distance columns and the chosen primary coordinates
master_zips = master_zips.apply(check_haversine, axis='columns')

In [75]:
# What are our worst-case distance errors?
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
# Only the top rows are shown rather than the whole sorted frame.
master_zips.sort_values('Min_Haversine_Dist', ascending=False).head(10)

Unnamed: 0,Zip_free,Zip_geo,Zip_non,City_free,City_geo,City_non,State_free,State_geo,State_non,Lat_free,...,Primary_State,Lat_TwoFishes,Lon_TwoFishes,Max_Haversine_Dist,Max_Dist_Pair,Min_Haversine_Dist,Min_Dist_Pair,Min_Dist_Source,Lat_primary,Lon_primary
29798,99557,99557,99557,chuathbaluk,aniak,aniak,ak,ak,ak,61.20,...,ak,61.57833,-159.52222,267.901140,Non vs. Geo,33.943132,TwoFishes vs Non,Twofishes,61.578330,-159.522220
29799,99557,99557,99557,stony river,aniak,aniak,ak,ak,ak,61.20,...,ak,61.57833,-159.52222,267.901140,Non vs. Geo,33.943132,TwoFishes vs Non,Twofishes,61.578330,-159.522220
29797,99557,99557,99557,aniak,aniak,aniak,ak,ak,ak,61.20,...,ak,61.57833,-159.52222,267.901140,Non vs. Geo,33.943132,TwoFishes vs Non,Twofishes,61.578330,-159.522220
31165,92328,92328,92328,death valley,death valley,death valley,ca,ca,ca,36.29,...,ca,36.24662,-116.81700,77.671735,Non vs Free,25.454930,TwoFishes vs Geo,Twofishes,36.246620,-116.817000
28220,97721,97721,97721,princeton,princeton,princeton,or,or,or,42.86,...,or,44.00013,-120.50139,210.382953,TwoFishes vs. Free,25.420655,Non vs Free,Non,43.036496,-118.618820
27060,86044,86044,86044,tonalea,tonalea,tonalea,az,az,az,36.91,...,az,36.32249,-110.96348,65.817370,TwoFishes vs. Free,22.265978,TwoFishes vs Non,Twofishes,36.322490,-110.963480
23750,69340,69340,69340,ellsworth,ellsworth,ellsworth,ne,ne,ne,42.06,...,ne,42.30639,-98.00954,360.852678,TwoFishes vs Geo,20.318648,Non vs Free,Non,42.232420,-102.197920
25870,84533,84533,84533,halls xing,lake powell,lake powell,ut,ut,ut,37.57,...,ut,37.03654,-111.35330,135.044254,TwoFishes vs Non,19.609828,Non vs. Geo,Non,37.655431,-110.037720
25867,84533,84533,84533,lake powell,lake powell,lake powell,ut,ut,ut,37.57,...,ut,37.03654,-111.35330,135.044254,TwoFishes vs Non,19.609828,Non vs. Geo,Non,37.655431,-110.037720
25869,84533,84533,84533,halls crossing,lake powell,lake powell,ut,ut,ut,37.57,...,ut,37.03654,-111.35330,135.044254,TwoFishes vs Non,19.609828,Non vs. Geo,Non,37.655431,-110.037720


In [79]:
# Number of rows whose closest pair of sources is still more than 5 km apart
int((master_zips['Min_Haversine_Dist'] > 5).sum())

474

In [87]:
# Keep just the fields that make up the published table:
# zip, consensus city/state and the chosen coordinates
final_cols = master_zips.loc[:, ['Zip_free',
                                 'Primary_City',
                                 'Primary_State',
                                 'Lat_primary',
                                 'Lon_primary']]

In [97]:
# Collapse identical rows, then give the columns their public names
final_zips = (
    final_cols
    .drop_duplicates()
    .rename(columns={'Zip_free': 'Zipcode',
                     'Primary_City': 'City',
                     'Primary_State': 'State',
                     'Lat_primary': 'Latitude',
                     'Lon_primary': 'Longitude'})
)
final_zips

Unnamed: 0,Zipcode,City,State,Latitude,Longitude
0,00705,aibonito,pr,18.139960,-66.266000
1,00610,anasco,pr,18.282730,-67.139620
2,00611,angeles,pr,18.284950,-66.799340
3,00612,arecibo,pr,18.472440,-66.715730
4,00601,adjuntas,pr,18.163790,-66.723690
5,00631,castaner,pr,18.180510,-66.832120
7,00602,aguada,pr,18.379390,-67.188240
8,00603,aguadilla,pr,18.427450,-67.154070
10,00604,aguadilla,pr,18.427450,-67.154070
12,00605,aguadilla,pr,18.427450,-67.154070


In [92]:
# How many unique zip codes as compared to our original sets?
# final_zips has its zip column renamed from 'Zip_free' to 'Zipcode', so the
# old name raises KeyError when the notebook is run top to bottom.
# nunique() counts distinct zips and leaves missing values out of the count.
final_zips['Zipcode'].nunique()

41719