## Creating a dictionary linking US cities to states

In [1]:
import sys

# Make the local Anaconda packages importable. A raw string keeps the
# backslashes literal: '\A', '\L' and '\s' are not valid escape sequences
# in a normal string literal and trigger a warning on recent Python versions.
# NOTE(review): machine-specific absolute path -- confirm it is still needed.
ANACONDA_SITE_PACKAGES = r'C:\Anaconda3\Lib\site-packages'
# append only once so that re-running this cell does not grow sys.path
if ANACONDA_SITE_PACKAGES not in sys.path:
    sys.path.append(ANACONDA_SITE_PACKAGES)

# finding the state based on geotags
from geopy.geocoders import Nominatim

# the Geonamescache library contains information
# about continents, cities and US states
import geonamescache

In [3]:
# load the full city table from geonamescache: 'c'
gc = geonamescache.GeonamesCache()
c = gc.get_cities()

# walk the table once and collect name, longitude and latitude
# of every US entry into three parallel lists (same order as 'c')
US_cities, US_longs, US_latts = [], [], []
for record in c.values():
    if record['countrycode'] != 'US':
        continue
    US_cities.append(record['name'])
    US_longs.append(record['longitude'])
    US_latts.append(record['latitude'])

#### List of "broken" cities below
These city names show up more than once in the list (i.e., there are cities with the same name in different states), so a dictionary keyed by city name alone can store only one state for each of them.

In [22]:
# tally every city name and keep the ones that occur more than once
import collections
name_counts = collections.Counter(US_cities)
duplicates = [name for name in name_counts if name_counts[name] > 1]
print(len(duplicates))
print('')
print(duplicates)

228

['Clinton', 'Shawnee', 'White Oak', 'Hollywood', 'Parkland', 'Aberdeen', 'Syracuse', 'Wheeling', 'Troy', 'Carrollton', 'Newark', 'Monroe', 'Clayton', 'Brandon', 'Clarksville', 'Maplewood', 'Centerville', 'Homewood', 'Plainview', 'Montclair', 'Somerset', 'Augusta', 'Fremont', 'Four Corners', 'Covington', 'Ferndale', 'Levittown', 'Huntington', 'Westfield', 'Madison', 'Winchester', 'Lafayette', 'Everett', 'Austin', 'Cary', 'Gardner', 'Brownsville', 'Payson', 'Marion', 'Rochester', 'Columbus', 'Manhattan', 'Belton', 'Brighton', 'Newport', 'Lebanon', 'Carlsbad', 'La Grange', 'Florence', 'Roseville', 'Saint Cloud', 'Beaumont', 'Jackson', "O'Fallon", 'Wasco', 'Frankfort', 'Glendale', 'Shelbyville', 'Henderson', 'Montgomery', 'Lancaster', 'Selma', 'Richmond', 'Laurel', 'Spring Valley', 'Windsor', 'Riverside', 'Saint Charles', 'Bedford', 'Garden City', 'Roswell', 'Wheaton', 'Fayetteville', 'Oakdale', 'Georgetown', 'Jamestown', 'Fitchburg', 'Mount Vernon', 'Clovis', 'Aurora', 'Easton', 'Uni

In [4]:
def get_states(longs, latts, geolocator=None,
               user_agent='us-city-to-state-notebook'):
    ''' Look up the state for each (longitude, latitude) pair.

    Input two 1D lists of floats/ints: `longs` and `latts`. They are
    paired element by element; pairing stops at the shorter list.

    geolocator : optional object with a `reverse(query)` method whose
        result exposes `.raw['address']['state']`. When None, a geopy
        Nominatim client is created with `user_agent`.
    user_agent : identifier passed to Nominatim when no geolocator
        is supplied.

    Returns a list of state names, one per coordinate pair. A pair
    whose lookup fails for any reason yields an empty string.
    '''
    if geolocator is None:
        # use a coordinate tool from the geopy library; an explicit
        # user agent is passed because recent geopy releases reject
        # the library default
        geolocator = Nominatim(user_agent=user_agent)
    # a list of states
    states = []
    for lon, lat in zip(longs, latts):
        try:
            # the query string is "<lat>, <lon>" (latitude first)
            location = geolocator.reverse(str(lat)+', '+str(lon))
            # get the state name
            state = location.raw['address']['state']
        except Exception:
            # best effort: any lookup failure (service error, no result,
            # no 'state' entry) becomes an empty string. Only Exception is
            # caught, so KeyboardInterrupt can still stop this long loop.
            state = ''
        states.append(state)
    return states

In [5]:
# look up the state of every US city from its coordinates
# WARNING: one lookup per city, so this takes a while
US_states = get_states(longs=US_longs, latts=US_latts)

In [6]:
# map each city name to its state, skipping cities whose
# state lookup came back empty; when a city name repeats,
# the state seen last replaces the earlier one
city_to_state = {
    name: region
    for name, region in zip(US_cities, US_states)
    if region
}

In [14]:
# size of the mapping, followed by a few spot checks
print('Number of cities =', len(city_to_state))
for sample_city in ('Los Angeles', 'Ennis', 'Long Branch'):
    print(city_to_state[sample_city])

Number of cities = 2361
California
Texas
New Jersey


In [18]:
# print every entry as a '"city": "state",' line
for city_name, state_name in city_to_state.items():
    print('"{}":'.format(city_name), '"{}",'.format(state_name))

"East Rancho Dominguez": "California",
"Clinton": "Mississippi",
"Nanuet": "New York",
"Sand Springs": "Oklahoma",
"Middle River": "Maryland",
"Carbondale": "Illinois",
"Boise": "Idaho",
"Las Vegas": "Nevada",
"Denver": "Colorado",
"Hagerstown": "Maryland",
"Venice": "Florida",
"Moreno Valley": "California",
"Mamaroneck": "New York",
"Bartow": "Florida",
"Bensonhurst": "New York",
"Edgewater": "Florida",
"Dallas": "Texas",
"Benton": "Arkansas",
"Lake Havasu City": "Arizona",
"New South Memphis": "Tennessee",
"North Glendale": "California",
"Santee": "California",
"Shawnee": "Oklahoma",
"North Augusta": "South Carolina",
"Brownwood": "Texas",
"Methuen": "Massachusetts",
"Allapattah": "Florida",
"White Oak": "Maryland",
"University Place": "Washington",
"Muncie": "Indiana",
"Naperville": "Illinois",
"Hollywood": "Florida",
"Richmond West": "Florida",
"Bellmore": "New York",
"Pine Bluff": "Arkansas",
"Natick": "Massachusetts",
"Silver Lake": "California",
"Ridgecrest": "California",
"Park