# Solution for determining station borough
## Yuwen Chang (ywc249) with assistance from Yukun Wan

## 1. Citi Bike Data

In [1]:
# imports and setups
import numpy as np
import pandas as pd
import pylab as pl
import scipy.stats as st
import os
%pylab inline

# imports Citi Bike data downloader (by Dr. Federica Bianco)
from getCitiBikeCSV import getCitiBikeCSV

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read data
df = pd.read_csv(os.getenv("PUIDATA") + "/" + '201509' + '-citibike-tripdata.csv')

In [3]:
# Drop unnecessary columns
df2 = df.drop(['tripduration', 'starttime', 'stoptime',
               'start station name', 'end station id', 'end station name',
               'end station latitude', 'end station longitude', 'bikeid', 'usertype',
               'birth year', 'gender'], axis=1)
df2.head()

Unnamed: 0,start station id,start station latitude,start station longitude
0,263,40.71729,-73.996375
1,495,40.762699,-73.993012
2,3119,40.742327,-73.954117
3,536,40.741444,-73.975361
4,347,40.728846,-74.008591


In [4]:
# Subset unique stations (only need one trip data for each station)
# 'duplicated' returns an indicator list for duplicates
# The first appearance of a station id is False, the rest or repeated stations are True
# Use 'not' to reverse the boolean list to select only the first unique station id
unique = [not i for i in df2.duplicated(subset='start station id')]
df3 = df2[unique]
len(df3)

453

## 2. Borough Data

In [5]:
# Use geojson to capture multi-polygon
import geojson
from shapely.geometry import shape, mapping, Point

#import shapefile
#from shapely.geometry import Point, Polygon, MultiPolygon
#from shapely.geometry.polygon import Polygon

In [6]:
# Read geojson borough boundaries data
path = 'Borough Boundaries.geojson'
with open(path) as file:
    borodata = geojson.loads(file.read())

## 3. Geoprocessing: intersect stations (points) with boroughs (polygons)

In [7]:
# First, make the points out of stations
points = list(zip(df3['start station longitude'], df3['start station latitude']))

In [8]:
# Then, pull out Brooklyn and Manhattan from the multi-polygon
Brooklyn = shape(borodata['features'][3]['geometry'])
Manhattan = shape(borodata['features'][4]['geometry'])
#Manhattan.boundary

In [9]:
# Create a new column to fill in borough data
df3['boro']=""
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,start station id,start station latitude,start station longitude,boro
0,263,40.71729,-73.996375,
1,495,40.762699,-73.993012,
2,3119,40.742327,-73.954117,
3,536,40.741444,-73.975361,
4,347,40.728846,-74.008591,


In [10]:
# Intersect both feature geometries to see which borough is a station contained in
for s in range(len(df3)):
    point = Point(df3.iloc[s,2], df3.iloc[s,1])
    if Manhattan.contains(point):
        df3.iloc[s,3] = 'Manhattan'
    elif Brooklyn.contains(point):
        df3.iloc[s,3] = 'Brooklyn'
    else:
        df3.iloc[s,3] = 'Others'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
# Check result
df3.head()

Unnamed: 0,start station id,start station latitude,start station longitude,boro
0,263,40.71729,-73.996375,Manhattan
1,495,40.762699,-73.993012,Manhattan
2,3119,40.742327,-73.954117,Others
3,536,40.741444,-73.975361,Manhattan
4,347,40.728846,-74.008591,Manhattan


In [12]:
# Alternative: if you just want a list
"""
boro=[]
for p in points:
    point = Point(p[0], p[1])
    if Manhattan.contains(point):
        boro.append('Manhattan')
    elif Brooklyn.contains(point):
        boro.append('Brooklyn')
    else:
        boro.append('Others')
"""

"\nboro=[]\nfor p in points:\n    point = Point(p[0], p[1])\n    if Manhattan.contains(point):\n        boro.append('Manhattan')\n    elif Brooklyn.contains(point):\n        boro.append('Brooklyn')\n    else:\n        boro.append('Others')\n"

## 4. Finally, do a m:1 (many-to-one) merge back to the original data set