# Convert a pandas dataframe to geojson for web-mapping

In [1]:
import pandas as pd, requests, json

First download data from the city of Berkeley's API. You can use Socrata's $limit parameter to specify how many rows to grab (otherwise the default is 1,000 rows of data): https://dev.socrata.com/docs/paging.html

Example request: https://data.cityofberkeley.info/resource/k489-uv4i.json?$limit=5

In [2]:
# API endpoint for city of Berkeley's 311 calls
# the $limit query parameter asks for up to 2,000 rows instead of the default page size
endpoint_url = 'https://data.cityofberkeley.info/resource/k489-uv4i.json?$limit=2000'

In [3]:
# fetch the URL and load the data
# a timeout keeps the notebook from hanging indefinitely if the API is unresponsive
response = requests.get(endpoint_url, timeout=30)

# fail loudly on an HTTP error (4xx/5xx) instead of parsing an error payload as if it were data
response.raise_for_status()
data = response.json()

Next, turn the json data into a dataframe and clean it up a bit: drop unnecessary columns and any rows that lack lat-long data. We want to make our json file as small as possible (preferably under 5 MB) so that anyone viewing your map can load it over the Internet without waiting forever for a huge file to download.

In [4]:
# build a dataframe from the parsed json records, then report its row count and column names
df = pd.DataFrame(data)

row_count = len(df.index)
print('We have {} rows'.format(row_count))
str(list(df.columns))

We have 2000 rows


"['apn', 'city', 'indbdate', 'issue_description', 'issue_type', 'latitude', 'location', 'longitude', 'neighborhood_district', 'object_type', 'secondary_issue_type', 'state', 'street_address', 'ticket_closed_date_time', 'ticket_created_date_time', 'ticket_id', 'ticket_status']"

In [5]:
# preview the first five rows to see what the raw records look like
df.head()

Unnamed: 0,apn,city,indbdate,issue_description,issue_type,latitude,location,longitude,neighborhood_district,object_type,secondary_issue_type,state,street_address,ticket_closed_date_time,ticket_created_date_time,ticket_id,ticket_status
0,,Berkeley,2016-10-11T04:25:05,Illegal Dumping - City Property,"Streets, Utilities, and Transportation",,,,Berkeley,Property,Clean City Program,CA,"Intersection of Atherton and Channing, BERKELE...",2015-05-01T15:23:52,2015-05-01T10:45:57,121000244201,Closed
1,,Berkeley,2016-10-11T04:18:42,Graffiti Abatement - Internet Request,Graffiti and Vandalism,,,,Berkeley,Individual,Graffiti,CA,,2014-09-12T09:01:04,2014-09-10T23:47:32,121000212325,Closed
2,054 172400400,Berkeley,2016-10-11T04:16:30,Illegal Dumping - City Property,"Streets, Utilities, and Transportation",37.8591591,"{'longitude': '-122.27074806', 'latitude': '37...",-122.27074806,Berkeley,Property,Clean City Program,CA,1923 WARD ST,2014-06-20T11:05:01,2014-06-19T10:30:41,121000199692,Closed
3,060 244500103,Berkeley,2016-10-11T04:26:21,Commercial Service Day Change,Refuse and Recycling,37.88254238,"{'longitude': '-122.27747688', 'latitude': '37...",-122.27747688,Berkeley,Property,Commercial,CA,1781 ROSE ST,2013-07-02T13:45:38,2013-06-27T08:27:36,121000148588,Closed
4,054 174502400,Berkeley,2016-10-11T04:21:01,Residential Reminder,Refuse and Recycling,37.85657833,"{'longitude': '-122.28993974', 'latitude': '37...",-122.28993974,Berkeley,Property,Residential,CA,1012 PARDEE ST,2016-01-20T12:06:58,2016-01-19T09:32:01,121000280304,Closed


In [6]:
# cast both coordinate columns to floats
for coord_col in ('latitude', 'longitude'):
    df[coord_col] = df[coord_col].astype(float)

# addresses arrive in ALL CAPS - switch them to Regular Capitalization
df['street_address'] = df['street_address'].str.title()

In [7]:
# keep only the handful of columns the map needs, so the output file stays small
useful_cols = ['issue_description', 'issue_type', 'latitude', 'longitude', 'street_address', 'ticket_status']
df_subset = df.loc[:, useful_cols]

In [8]:
# keep only the rows that have both a latitude and a longitude
coord_cols = ['latitude', 'longitude']
df_geo = df_subset.dropna(subset=coord_cols)

geotagged_count = len(df_geo)
print('We have {} geotagged rows'.format(geotagged_count))
df_geo.tail()

We have 940 geotagged rows


Unnamed: 0,issue_description,issue_type,latitude,longitude,street_address,ticket_status
1985,Residential Service Start,Refuse and Recycling,37.859774,-122.2661,2133 Ward St,Closed
1987,Commercial Missed Pickup,Refuse and Recycling,37.866958,-122.274268,1801 Bancroft Way,Closed
1990,Miscellaneous Service Request,General Questions/information,37.873483,-122.275793,1808 Grant St,Closed
1994,Residential Missed Pickup Integration,Refuse and Recycling,37.882808,-122.254349,21 Parnassus Rd,Closed
1998,Residential Cart Size Decrease,Refuse and Recycling,37.863861,-122.252064,2734 Parker St,Closed


In [9]:
# what is the distribution of issue types?
# value_counts() tallies the rows for each issue type, most common first
df_geo['issue_type'].value_counts()

Refuse and Recycling                      685
General Questions/information             107
Streets, Utilities, and Transportation     97
Parks, Trees and Vegetation                26
Environmental Services and Programs         9
Business License                            6
Graffiti and Vandalism                      5
Traffic and Transportation                  3
Other Account Services and Billing          1
Equipment Maintenance                       1
Name: issue_type, dtype: int64

Finally, convert each row in the dataframe to a geojson-formatted feature and save the result as a file. The format is pretty simple and you can see it here: http://geojson.org/

In [10]:
def _to_json_value(value):
    """Convert a single dataframe cell into a value json.dumps can write as valid JSON."""
    if pd.api.types.is_scalar(value):
        # missing values (None, NaN, NaT, pd.NA) become JSON null - a bare NaN is not valid JSON
        if pd.isna(value):
            return None
        # numpy scalars (e.g. int64) are not JSON serializable, so unwrap them to native python types
        if hasattr(value, 'item'):
            return value.item()
    return value


def df_to_geojson(df, properties, lat='latitude', lon='longitude'):
    """
    Convert a dataframe of points into a GeoJSON FeatureCollection dict.

    Each row becomes one Point feature. Rows are not filtered here, so drop
    any rows that lack coordinates before calling this function.

    Parameters
    ----------
    df : pandas.DataFrame
        one row per point
    properties : list
        names of the columns to copy into each feature's 'properties'
    lat : str
        name of the column holding latitudes
    lon : str
        name of the column holding longitudes

    Returns
    -------
    dict
        {'type': 'FeatureCollection', 'features': [...]}, with one feature
        per row in the dataframe's row order
    """
    # create a new python dict to contain our geojson data, using geojson format
    geojson = {'type':'FeatureCollection', 'features':[]}

    # loop through each row in the dataframe and convert each row to geojson format
    for _, row in df.iterrows():
        # geojson orders point coordinates as [longitude, latitude]
        coordinates = [_to_json_value(row[lon]), _to_json_value(row[lat])]

        # for each column, get the value and add it as a new feature property
        feature_properties = {prop: _to_json_value(row[prop]) for prop in properties}

        # add this feature (aka, converted dataframe row) to the list of features inside our dict
        geojson['features'].append({'type':'Feature',
                                    'properties':feature_properties,
                                    'geometry':{'type':'Point',
                                                'coordinates':coordinates}})

    return geojson

In [11]:
# columns to carry over as each feature's properties
cols = ['street_address', 'issue_description', 'issue_type', 'ticket_status']
geojson = df_to_geojson(df=df_geo, properties=cols)

In [12]:
# wrap the serialized geojson in a javascript variable assignment and write it to disk
output_filename = 'dataset.js'
js_source = 'var dataset = {};'.format(json.dumps(geojson))
with open(output_filename, 'w') as output_file:
    output_file.write(js_source)

# report how many features ended up in the file
feature_count = len(geojson['features'])
print('{} geotagged features saved to file'.format(feature_count))

940 geotagged features saved to file
