In [1]:
import pandas as pd
import numpy as np

In [None]:
# The passenger and payment columns contain missing values, and a NumPy
# integer column cannot store NaN. Parse every column as float32 so the
# gaps can be read as NaN, discard the incomplete rows, and only then
# narrow the two whole-number columns down to int8.
df = pd.read_csv(
    "../data/nyc_taxi_2020-01.csv",
    usecols=["passenger_count", "total_amount", "payment_type"],
    dtype={
        "passenger_count": np.float32,
        "total_amount": np.float32,
        "payment_type": np.float32,
    },
)
df = df.dropna()
df = df.astype(dict.fromkeys(("passenger_count", "payment_type"), np.int8))

df.head()

Unnamed: 0,passenger_count,payment_type,total_amount
0,1,1,11.27
1,1,1,12.3
2,1,1,10.8
3,1,1,8.16
4,1,2,4.8


# Extension questions
1. Create a dataframe from a different set of four columns (`VendorID`, `trip_distance`, `tip_amount`, and `total_amount`), specifying the dtype for each. Which types are most appropriate? Can you use them without first cleaning the data?
2. Instead of removing NaN values from the VendorID, replace them with 3. How does this affect your specifications and the cleaning of the data?
3. The `memory_usage` method allows you to see how much memory is being used by each column in a dataframe. Compare the memory of columns which use float16 and float64.

In [None]:
# Exercise 1: load four columns with an explicit dtype for each.
# VendorID contains NaN, which an integer dtype cannot represent, so it is
# parsed as float32 here and converted to an integer further down.
# tip_amount is deliberately read as float16 so that its memory footprint can
# be compared with the float64 columns; float16 keeps only about three
# significant decimal digits, so it is a poor choice when exact amounts matter.
df2 = pd.read_csv(
    "../data/nyc_taxi_2020-01.csv",
    usecols=["VendorID", "trip_distance", "tip_amount", "total_amount"],
    dtype={
        "VendorID": np.float32,
        "trip_distance": np.float64,
        "tip_amount": np.float16,
        "total_amount": np.float64,
    },
)

# Exercise 2: the data dictionary for this dataset lists only 1 and 2 as valid
# vendor IDs, so filling the missing IDs with 3 keeps those rows while still
# marking them as not belonging to a known vendor. Once no NaN remains, the
# column can be stored as np.int8.
df2["VendorID"] = df2["VendorID"].fillna(3).astype(np.int8)

# So no: the data cannot be used with these final dtypes without first
# cleaning it.

In [None]:
# Per-column memory in bytes (index included, shallow measurement).
# As expected, float16 is far more compact than float64: it stores each value
# in 2 bytes instead of 8, so tip_amount takes a quarter of the memory used by
# trip_distance or total_amount.
df2.memory_usage(index=True, deep=False)

Index                 132
VendorID          6405008
trip_distance    51240064
tip_amount       12810016
total_amount     51240064
dtype: int64