# Creating a dictionary linking US cities to states

https://www.census.gov/library/stories/2020/05/america-a-nation-of-small-towns.html
    
## How many cities in the U.S.?
- approximately 19,500 incorporated places
- About 76% of the approximately 19,500 incorporated places had fewer than 5,000 people.
- Of those, almost 42% had fewer than 500 people. 
- On the other hand, only 4.0% (780) of all cities had a population of 50,000 or more in 2019, yet nearly 39% of the U.S. population lived in those cities.

In [1]:
# import sys
# # sys.path.append('C:\Anaconda3\Lib\site-packages')

from time import time
import pickle

# finding the state based on geotags
from geopy.geocoders import Nominatim

# the Geonamescache library contains information
# about continents, cities and US states
import geonamescache

In [2]:
# load the geonamescache city records into a dictionary: 'c'
gc = geonamescache.GeonamesCache()
c = gc.get_cities()

# pull the name and coordinates out of every record
# whose country code is 'US'
US_cities = [city['name'] for city in c.values()
             if city['countrycode'] == 'US']
US_longs = [city['longitude'] for city in c.values()
            if city['countrycode'] == 'US']
US_latts = [city['latitude'] for city in c.values()
            if city['countrycode'] == 'US']

### List of "broken" cities below

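These city names show up more than once in the list (i.e., there are cities with the same name in different states). Because `city_to_state` is keyed by city name alone, each of these names can keep only one state.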

In [3]:
# collect the city names that occur at least twice in US_cities
import collections
duplicates = [name
              for name, occurrences in collections.Counter(US_cities).items()
              if occurrences >= 2]
# show the count, a blank line, then the names themselves
print(len(duplicates), '', duplicates, sep='\n')

293

['Birmingham', 'Decatur', 'Enterprise', 'Florence', 'Helena', 'Homewood', 'Huntsville', 'Madison', 'Montgomery', 'Oxford', 'Selma', 'Troy', 'Conway', 'Fayetteville', 'Jacksonville', 'Texarkana', 'Washington', 'Dover', 'Middletown', 'Newark', 'Wilmington', 'Bloomingdale', 'Brandon', 'Brownsville', 'Edgewater', 'Gainesville', 'Hollywood', 'Lakeside', 'Leesburg', 'Oak Ridge', 'Palm Springs', 'Parkland', 'Princeton', 'Saint Cloud', 'Sanford', 'Spring Hill', 'University Park', 'Venice', 'West Hollywood', 'Westchester', 'Weston', 'Albany', 'Athens', 'Augusta', 'Brunswick', 'Canton', 'Carrollton', 'Columbus', 'Dublin', 'Duluth', 'Evans', 'Forest Park', 'Martinez', 'Rome', 'Roswell', 'Smyrna', 'Thomasville', 'Union City', 'Woodstock', 'Alton', 'Belleville', 'Charleston', 'Marion', 'Mount Vernon', "O'Fallon", 'Quincy', 'Springfield', 'Avon', 'Bloomington', 'Clarksville', 'Greenfield', 'Greenwood', 'Lawrence', 'New Castle', 'Plainfield', 'Richmond', 'Seymour', 'Shelbyville', 'Gardner', 'Kan

In [4]:
# https://buildmedia.readthedocs.org/media/pdf/geopy/latest/geopy.pdf
# from geopy.extra.rate_limiter import RateLimiter

In [5]:
def get_states(longs, latts):
    """Reverse-geocode coordinate pairs to state names.

    Args:
        longs: 1D list of longitudes (floats/ints).
        latts: 1D list of latitudes (floats/ints), paired
            position-by-position with ``longs``.

    Returns:
        A list with one entry per coordinate pair: the 'state' field of
        the geocoder's address, or an empty string when the lookup fails
        or the result carries no state.
    """
    # use a coordinate tool from the geopy library
    geolocator = Nominatim(user_agent="http")  # , email=''
    # NOTE(review): requests are sent back-to-back with no throttling;
    # geopy.extra.rate_limiter.RateLimiter could wrap `reverse` if needed.
    states = []
    for lon, lat in zip(longs, latts):
        try:
            location = geolocator.reverse(f"{lat}, {lon}")
        except Exception:
            # Best effort: a failed lookup yields an empty state. Catching
            # Exception (not a bare except) keeps Ctrl-C able to stop the
            # long-running loop.
            location = None

        if location is None:
            # lookup failed or returned no result
            state = ''
        else:
            # not every address has a 'state' entry
            state = location.raw.get('address', {}).get('state', '')
        states.append(state)
    return states

https://stackoverflow.com/questions/52600278/correct-way-to-use-geopy-nominatim <br>
https://gis.stackexchange.com/questions/293615/user-agent-argument-in-nominatim-in-geopy

**ConfigurationError:** Using Nominatim with default or sample `user_agent` "geopy/2.3.0" is strongly discouraged, as it violates Nominatim's ToS https://operations.osmfoundation.org/policies/nominatim/ and may possibly cause 403 and 429 HTTP errors. Please specify a custom `user_agent` with `Nominatim(user_agent="my-application")` or by overriding the default `user_agent`: `geopy.geocoders.options.default_user_agent = "my-application"`.

In [6]:
# Look up the state of every US city and time the whole run.
# WARNING: this takes a while
t0 = time()
US_states = get_states(US_longs, US_latts)

f"Duration: {(time() - t0) / 60} minutes"

'Duration: 27.210380252202352 minutes'

In [5]:
len(US_cities)

3265

In [9]:
# restore the city -> state dictionary from its pickle file
with open('pickle/city_to_state.pkl', 'rb') as pkl_file:
    city_to_state = pickle.load(pkl_file)

In [10]:
# # create a dictionary linking cities
# # as keys with their states

# city_to_state = {}
# for city, state in zip(US_cities, US_states):
#     if state:
#         city_to_state[city] = state

In [11]:
# 2823
print('Number of cities =', len(city_to_state.keys()))
print(city_to_state['Los Angeles'])
print(city_to_state['Ennis'])
print(city_to_state['Long Branch'])

Number of cities = 2849
California
Texas
New Jersey


<br>

### <font color='red'>Check Missing Cities</font>

In [12]:
len(US_cities) # has "broken" cities 

3265

In [13]:
len(set(US_cities))

2824

In [14]:
len(set(city_to_state.keys()))

2849

In [15]:
# city names in US_cities that have no entry in city_to_state
missing_cities = set(US_cities).difference(city_to_state)
len(missing_cities)

0

<br>

In [16]:
# hand-entered city -> state pairs to merge into city_to_state
missing = {
    'Virginia Beach': 'Virginia',
    'Stamford': 'Connecticut',
    'Lexington Fayette': 'Kentucky',
    'Waterbury': 'Connecticut',
    'New York': 'New York',
    'Newport News': 'Virginia',
    'Hampton': 'Virginia',
    'Chesapeake': 'Virginia',
    'Hartford': 'Connecticut',
    'Saint Louis': 'Missouri',
    'Winston Salem': 'North Carolina',
    'New Haven': 'Connecticut',
    'San Antonio': 'Texas',
    'Wichita Falls': 'Texas',
    'Richardson': 'Texas',
    'Waco': 'Texas',
    'Plano': 'Texas',
    'Killeen': 'Texas',
    'Laredo': 'Texas',
    'Vallejo': 'California',
    'Visalia': 'California',

    'Kāne‘ohe': 'Hawaii',
    'Cahokia': 'Illinois',
    'Butte-Silver Bow (Balance)': 'Montana',
    'Avenal': 'California',
    'Sandy City': 'Utah',
    'Owosso': 'Michigan',
    'North Glendale': 'California',
    'Tonawanda': 'New York',
    'Beacon': 'New York',
    'Saint Petersburg': 'Florida',
    'Blytheville': 'Arkansas',
    'Scottsbluff': 'Nebraska',
    'Forrest City': 'Arkansas',
    'Lavergne': 'Tennessee',
    'Washington, D.C.': 'District of Columbia',
    'Mountain Top': 'Pennsylvania',
    'Borough of Queens': 'New York',
    'Hopatcong': 'New Jersey',
    'Mira Loma': 'California',
    'Iron River': 'Michigan',
    'Oxnard Shores': 'California',
}

# build a fresh dictionary; entries from `missing` win when a city
# is present in both
city_to_state = dict(city_to_state)
city_to_state.update(missing)

<br>

In [17]:
# with open('pickle/city_to_state.pkl', 'wb') as f:
#     pickle.dump(city_to_state, f)
    
# with open('pickle/city_to_state.pkl', 'rb') as f:
#     city_to_state = pickle.load(f)

In [18]:
# report how many cities the dictionary holds
print('Number of cities =', len(city_to_state))

Number of cities = 2849


In [19]:
# print every entry as a '"city": "state",' line
for city, state in city_to_state.items():
    print(''.join(['"', city, '": "', state, '",']))

"Fort Hunt": "Virginia",
"Bessemer": "Alabama",
"Paducah": "Kentucky",
"Birmingham": "Michigan",
"Center Point": "Alabama",
"Cullman": "Alabama",
"Daphne": "Alabama",
"Decatur": "Illinois",
"Dothan": "Alabama",
"East Florence": "Alabama",
"Enterprise": "Nevada",
"Fairhope": "Alabama",
"Florence": "Arizona",
"Foley": "Alabama",
"Gadsden": "Alabama",
"Helena": "Montana",
"Homewood": "Illinois",
"Hoover": "Alabama",
"Hueytown": "Alabama",
"Huntsville": "Texas",
"Madison": "Wisconsin",
"Millbrook": "Alabama",
"Mobile": "Alabama",
"Montgomery": "Illinois",
"Mountain Brook": "Alabama",
"Northport": "Alabama",
"Opelika": "Alabama",
"Oxford": "Ohio",
"Pelham": "Alabama",
"Phenix City": "Alabama",
"Prattville": "Alabama",
"Prichard": "Alabama",
"Selma": "California",
"Talladega": "Alabama",
"Tillmans Corner": "Alabama",
"Troy": "Ohio",
"Trussville": "Alabama",
"Tuscaloosa": "Alabama",
"Vestavia Hills": "Alabama",
"Bella Vista": "Arkansas",
"Benton": "Arkansas",
"Bentonville": "Arkansas",
"Bryan

<br>

In [20]:
# import pickle
# with open('pickle/city_to_state_old.pkl', 'wb') as f:
#     pickle.dump(city_to_state_old, f)

# load the older city -> state dictionary for comparison
with open('pickle/city_to_state_old.pkl', 'rb') as old_pkl:
    city_to_state_old = pickle.load(old_pkl)

In [21]:
# (city, state) pairs in the old dictionary that are absent from the new one
len(city_to_state_old.items() - city_to_state.items())  # "broken" cities

163

In [22]:
# city_to_state['Albany']

In [23]:
# set(city_to_state_old.items()) - set(city_to_state.items()) # "broken" cities 

In [24]:
# cities in the old dictionary that are absent from the new one
len(city_to_state_old.keys() - city_to_state.keys())

0

<br>

### Create a City to Abbreviated State Dictionary

In [25]:
# load the state name -> abbreviation lookup
with open('pickle/us_state_to_abbrev.pkl', 'rb') as abbrev_pkl:
    us_state_to_abbrev = pickle.load(abbrev_pkl)

In [26]:
# Map each city to the abbreviation of its state. Cities whose state has
# no entry in us_state_to_abbrev are left out of city_to_st.
city_to_st = {}
for key, value in city_to_state.items():
    try:
        city_to_st[key] = us_state_to_abbrev[value]
    except KeyError:
        # no abbreviation for this state name: skip the city. Only KeyError
        # is expected here; anything else should surface, not be swallowed.
        continue

In [27]:
# save the city -> abbreviation dictionary ...
with open('pickle/city_to_st.pkl', 'wb') as out_file:
    pickle.dump(city_to_st, out_file)

# ... and read it straight back from the same file
with open('pickle/city_to_st.pkl', 'rb') as in_file:
    city_to_st = pickle.load(in_file)

In [29]:
# city_to_st