# Boolean Indexing with NumPy
## Reading in the taxi data

---

In [1]:
import numpy as np
# print arrays without scientific notation, rounded to two decimal places
np.set_printoptions(suppress=True, precision=2)

# numpy.genfromtxt parses the comma-separated file into an ndarray;
# no rows are skipped in this first read
taxi = np.genfromtxt('nyc_taxis.csv', delimiter=',')

# (rows, columns) of the loaded array, displayed as the cell result
taxi_shape = taxi.shape
taxi_shape

(2014, 15)

In [2]:
# ndarray.dtype reports the one data type shared by every element of the array
print(taxi.dtype)

float64


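By default, `numpy.genfromtxt()` treats the header row of the CSV as data; because the column names cannot be converted to floats, that first row is read in as `nan` values. Passing `skip_header=1` tells the function to skip the first line of the file, so only the data rows are loaded.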

In [3]:
# re-read the file, this time skipping its first line (the header row)
taxi = np.genfromtxt('nyc_taxis.csv', skip_header=1, delimiter=',')

# the array now has one row fewer than the read that kept the header
taxi_shape = taxi.shape
taxi_shape

(2013, 15)

## Boolean Indexing

---

In [4]:
# Each sample array is followed by the boolean array produced by comparing
# every element against a single value.

# integers: True where the element is below 3
a = np.array([1, 2, 3, 4, 5])
a_bool = a < 3        # [ True,  True, False, False, False]

# strings: True where the element equals "blue"
b = np.array(["blue", "blue", "red", "blue"])
b_bool = b == "blue"  # [ True,  True, False,  True]

# floats: True where the element exceeds 100
c = np.array([80.0, 103.4, 96.9, 200.3])
c_bool = c > 100      # [False,  True, False,  True]

In [5]:
# column 1 of every row
pickup_month = taxi[:, 1]

# one boolean mask per month value
january_bool = pickup_month == 1
february_bool = pickup_month == 2
march_bool = pickup_month == 3

# keep only the entries selected by each mask
january = pickup_month[january_bool]
february = pickup_month[february_bool]
march = pickup_month[march_bool]

# .size of a 1-D array is its element count as a plain int, not a tuple
january_rides = january.size
february_rides = february.size
march_rides = march.size

print(march_rides)

554


In [6]:
# column 12 of every row
tip_amount = taxi[:, 12]

# True for each row whose column-12 value is greater than 50
tip_bool = tip_amount > 50

# rows selected by the mask, restricted to columns 5 through 13
top_tips = taxi[tip_bool, 5:14]

In [7]:
# ndarray.copy() returns a new array with its own data, so assignments made to
# taxi_modified do not change taxi
taxi_modified = taxi.copy()

In [8]:
# Each group below prints a selection, assigns to it, and prints it again so the
# before/after values can be compared.

# single element: row 1066, column 5
print(taxi_modified[1066, 5])
taxi_modified[1066, 5] = 1
print(taxi_modified[1066, 5])

# whole column: every row of column 0 is set to 16
print(taxi_modified[:, 0])
taxi_modified[:, 0] = 16
print(taxi_modified[:, 0])

# slice of a column: rows 550 and 551 of column 7 are set to that column's mean.
# The mean is computed once, from the column as it stands before the assignment,
# and stored in a name instead of being evaluated and discarded.
column_7_mean = taxi_modified[:, 7].mean()
print(taxi_modified[550:552, 7])
taxi_modified[550:552, 7] = column_7_mean
print(taxi_modified[550:552, 7])

4.0
1.0
[2016. 2016. 2016. ... 2016. 2016. 2016.]
[16. 16. 16. ... 16. 16. 16.]
[9.88 8.6 ]
[12.92 12.92]


In [9]:
# work on a separate copy so `taxi` itself is left untouched
taxi_copy = taxi.copy()

# column 13 of the copy; basic slicing gives a view, so writing through
# `total_amount` also updates `taxi_copy`
total_amount = taxi_copy[:, 13]

# replace every negative entry with 0
negative_totals = total_amount < 0
total_amount[negative_totals] = 0

## Boolean indexing with shortcut syntax

---

In [10]:
# build a single column of zeros with one row per row of `taxi`
zeros = np.zeros((taxi.shape[0], 1))

# append it on the right, so the new column sits at index 15
taxi_modified = np.concatenate((taxi, zeros), axis=1)
print(taxi_modified)

# set the new column to 1 on every row whose column-5 value is 2, 3 or 5
flagged_rows = np.isin(taxi_modified[:, 5], (2, 3, 5))
taxi_modified[flagged_rows, 15] = 1

[[2016.      1.      1.   ...   69.99    1.      0.  ]
 [2016.      1.      1.   ...   54.3     1.      0.  ]
 [2016.      1.      1.   ...   37.8     2.      0.  ]
 ...
 [2016.      6.     30.   ...   63.34    1.      0.  ]
 [2016.      6.     30.   ...   44.75    1.      0.  ]
 [2016.      6.     30.   ...   54.84    2.      0.  ]]


In [21]:
# count the rows whose `dropoff_location_code` (column 6) matches each airport:
# 2 for JFK, 3 for LaGuardia, 5 for Newark
dropoff_code = taxi[:, 6]

# full rows for each airport
jfk = taxi[dropoff_code == 2]
laguardia = taxi[dropoff_code == 3]
newark = taxi[dropoff_code == 5]

# len() of a 2-D array is its number of rows
jfk_count = len(jfk)
print(jfk_count)

laguardia_count = len(laguardia)
print(laguardia_count)

newark_count = len(newark)
print(newark_count)

285
308
2


In [36]:
# Speed of each trip: column 7 divided by column 8 / 3600.
# NOTE(review): presumably column 7 is in miles and column 8 in seconds, which
# would make this miles per hour — confirm against the dataset description.
trip_distance = taxi[:, 7]
trip_hours = taxi[:, 8] / 3600

# A zero in column 8 makes the division produce a non-finite value and a
# RuntimeWarning. The warning is silenced here; nan and +inf speeds fail the
# `< 100` comparison, so those rows are left out of `cleaned_taxi` exactly as
# they were before.
with np.errstate(divide='ignore', invalid='ignore'):
    trip_mph = trip_distance / trip_hours
    cleaned_taxi = taxi[trip_mph < 100]

# column means over the rows that passed the speed filter
mean_distance = cleaned_taxi[:, 7].mean()
mean_length = cleaned_taxi[:, 8].mean()
mean_total_amount = cleaned_taxi[:, 13].mean()