In [1]:
import pandas as pd
import numpy as np
import pymongo as pm
import mongoengine as me
import requests
import json
from datetime import datetime
from bson.son import SON
from bson import json_util
import dateutil
import googlemaps
import random
import pprint
import multiprocessing
import matplotlib.pyplot as plt
from IPython.display import clear_output
%matplotlib inline

In [2]:
# pymongo client
client = pm.MongoClient("localhost", 27017)
db = client["cruz-dev"]

# mongoengine connection
me.connect('cruz-dev', host='localhost', port=27017)

# google api client
google_api_keys = {
    "surya": "AIzaSyAU2gGkynk36LibmjTwLKOKMHVTRKIM87k",
    "graham": "AIzaSyBRcJ-Oj88gvz0LWNaCKg42K0K9SQIFpfs"}
gmaps = googlemaps.Client(key=google_api_keys["graham"])

# NYC taxi data api endpoint
stem_url = "https://data.cityofnewyork.us/resource/2yzn-sicd.json?"
headers ={"X-App-Token":"4vs8sRqzFEjUf5BBPu9L0gieK"}

# Creating Zipcode Grid

In [None]:
def get_zipcode_borough_county(lat, long):
    zipcode = np.nan
    borough = ""
    county = ""
    try:
        response = gmaps.reverse_geocode((lat, long))[0]['address_components']
        for address_component in response:
            if "postal_code" in address_component['types']:
                zipcode = address_component['long_name']
            if "sublocality_level_1" in address_component['types']:
                borough = address_component['long_name']
            if "administrative_area_level_2" in address_component['types']:
                county = address_component['long_name']
    except:
        pass
    return zipcode, borough, county

In [None]:
def lat_long_grid_search(params):
    lat_low, lat_high, lat_increment, long_low, long_high, long_increment = params
    zip_grid_list = []
    for lat in np.arange(lat_low, lat_high, lat_increment):
        for long in np.arange(long_low, long_high, long_increment):
            zipcode, borough, county = get_zipcode_borough_county(lat, long)
            row_dict = {"latitude": lat,
                        "longitude": long,
                        "zipcode": zipcode,
                        "burough": borough,
                        "county": county
                       }
            zip_grid_list.append(row_dict)

        zip_grid_df = pd.DataFrame(zip_grid_list)
        zip_grid_df.to_csv("zbc_grid.csv", mode='a', header=False, index=False)
        print(lat)
        zip_grid_list = []

In [None]:
# zipcode search parameters
lat_low = 40.4
lat_high = 41
long_low = -74.3
long_high = -73.6
increment = .002
threads = 4

# setting inital zipcode search parameters to construct zipcode grid
args = [(lat,  lat + ((lat_high-lat_low)/threads), increment, long_low, long_high, increment) for lat in np.arange(lat_low, lat_high, ((lat_high-lat_low)/threads))]

# resetting zipcode search parameters to account for partially complete zipcode grid
zipcode_grid = pd.read_csv("zbc_grid.csv", header=None, names=["borough", "county", "latitude", "longitude", "zipcode"])
search_list = []
for search_val in np.arange(lat_low, lat_high, increment):
    check = False
    for existing_val in np.sort(zipcode_grid['latitude'].unique()):
        if abs(search_val - existing_val) < increment:
            check = True
            break
        else:
            continue
    if not check:
        search_list.append(search_val)
search_list.sort()

begin = search_list[0]
args = []
for i, val in enumerate(search_list):
    if i == len(search_list) - 1:
        arg = (begin, val, increment, long_low, long_high, increment)
        args.append(arg)
    elif abs(val-search_list[i+1]) > .003:
        print
        arg = (begin, val, increment, long_low, long_high, increment)
        args.append(arg)
        begin = search_list[i+1]

args

In [None]:
plt.figure(figsize=(5,10))
plt.plot(search_list, 'r.')
plt.plot(np.sort(zipcode_grid["latitude"].unique()), 'g.' )
plt.axis([0,300,40.0,40.6])
plt.plot()
plt.show()

In [None]:
pool = multiprocessing.Pool(processes=len(args)) 
pool.map(lat_long_grid_search, args)

# Ride Classification

In [3]:
# zipcode_grid = pd.read_csv("zbc_grid.csv", header=None, names=["borough", "county", "latitude", "longitude", "zipcode"])
# zipcode_grid[zipcode_grid["zipcode"].notnull()].to_csv("not_null_zbc_grid.csv", index=False)
zipcode_grid = pd.read_csv("not_null_zbc_grid.csv")


In [4]:
def classify_location(lat, long, lat_range, long_range):
    diff = lat_range/2.0 + long_range/2.0
    close_indx = 0
    search = zipcode_grid[((zipcode_grid["latitude"] < lat + lat_range/2.0) & (zipcode_grid["latitude"] > lat - lat_range/2.0)) & ((zipcode_grid["longitude"] < long + long_range/2.0) & (zipcode_grid["longitude"] > long - long_range/2.0))]
    for indx, row in search.iterrows():
        if abs(lat - row["latitude"]) + abs(long - row["longitude"]) < diff:
            diff = abs(lat - row["latitude"]) + abs(long - row["longitude"])
            close_indx = indx
    if len(search) == 0:
        return (np.nan, "", "")
    else:
        zipcode = search.loc[close_indx, "zipcode"]
        borough = search.loc[close_indx, "borough"]
        county = search.loc[close_indx, "county"]
        if borough is np.nan:
            borough = ""
        if county is np.nan:
            county = ""
        return (zipcode, borough, county)


In [5]:
def response_to_doc(response):
    doc = {}

    pickup_zipcode, pickup_borough, pickup_county = classify_location(float(response["pickup_latitude"]), float(response["pickup_longitude"]), .003, .003)
    dropoff_zipcode, dropoff_borough, dropoff_county = classify_location(float(response["dropoff_latitude"]), float(response["dropoff_longitude"]), .003, .003)

    if pickup_zipcode is not np.nan and dropoff_zipcode is not np.nan:
        doc["pickup_datetime"] = dateutil.parser.parse(response['pickup_datetime'])

        doc["dropoff_datetime"] = dateutil.parser.parse(response['dropoff_datetime'])

        doc["pickup_zipcode"] = int(pickup_zipcode)
        doc["pickup_long_lat"] = {"type": "Point", "coordinates":[float(response["pickup_longitude"]), float(response["pickup_latitude"])]}
        if len(pickup_borough) > 0:
            doc["pickup_borough"] = pickup_borough
        if len(pickup_county) > 0:
            doc["pickup_county"] = pickup_county

        doc["dropoff_zipcode"] = int(dropoff_zipcode)
        doc["dropoff_long_lat"] = {"type": "Point", "coordinates":[float(response["dropoff_longitude"]), float(response["dropoff_latitude"])]}
        if len(dropoff_borough) > 0:
            doc["dropoff_borough"] = dropoff_borough
        if len(dropoff_county) > 0:
            doc["dropoff_county"] = dropoff_county

        doc["total_amount"] = float(response["total_amount"])
        doc["tip_amount"] = float(response["tip_amount"])
        doc["fare_amount"] = float(response["fare_amount"])

        doc["passenger_count"] = int(response["passenger_count"])

        doc["trip_distance"] = float(response["trip_distance"])
        
    return doc

In [11]:
max_count = 2500000
init_count = db.rides_15.count()
i = 0

while (db.rides_15.count() < max_count):
    if db.rides_15.count() == 0:
        beginning = "2015-01-01T00:00:00"
    else:
        beginning = str(list(db.rides_15.find().sort("pickup_datetime",pm.DESCENDING).limit(1))[0]['pickup_datetime']).replace(" ", "T")
    full_url = stem_url + "$where=pickup_datetime between '{0}' and '2015-01-08T00:00:00'".format(beginning) 
    for response in requests.get(full_url, headers=headers).json():
        doc = response_to_doc(response)
        if doc != {}:
            db.rides_15.insert_one(doc)
            i+= 1
    
        if (i % 10000 == 0):
            clear_output()
            print(beginning)
            print(str(np.around(100 * i / (max_count - init_count), 2)) + '%')

2015-01-05T20:31:09
25.94%


ConnectionError: HTTPSConnectionPool(host='data.cityofnewyork.us', port=443): Max retries exceeded with url: /resource/2yzn-sicd.json?$where=pickup_datetime%20between%20'2015-01-05T20:36:35'%20and%20'2015-01-08T00:00:00' (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x10e939048>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))