In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load only the eight text columns this exercise works with; every other
# column in the CSV is skipped at parse time.
# NOTE(review): the saved output shows a warning pointing at this call --
# presumably a DtypeWarning about mixed-type columns; confirm before relying
# on the inferred dtypes.
columns_of_interest = [
    "Plate ID",
    "Registration State",
    "Vehicle Make",
    "Vehicle Color",
    "Vehicle Body Type",
    "Violation Time",
    "Street Name",
    "Violation Legal Code",
]
df = pd.read_csv(
    "../data/nyc-parking-violations-2020.csv", usecols=columns_of_interest
)

  df = pd.read_csv(


In [4]:
# Per-column footprint in whole MiB *before* any conversion.
# deep=True makes pandas inspect the contents of object columns instead of
# only counting the pointers to them.
df.memory_usage(deep=True).floordiv(1024**2)

Index                     0
Plate ID                665
Registration State      607
Vehicle Body Type       628
Vehicle Make            638
Violation Time          643
Street Name             743
Violation Legal Code    457
Vehicle Color           608
dtype: int64

In [5]:
# Re-encode each column as a categorical, one at a time, reporting progress
# so a slow column is easy to spot on the full dataset.
for column in df.columns.tolist():
    print(f"Changing '{column}' to category...", end="")
    as_category = df[column].astype("category")
    df[column] = as_category
    print("done!")

Changing 'Plate ID' to category...done!
Changing 'Registration State' to category...done!
Changing 'Vehicle Body Type' to category...done!
Changing 'Vehicle Make' to category...done!
Changing 'Violation Time' to category...done!
Changing 'Street Name' to category...done!
Changing 'Violation Legal Code' to category...done!
Changing 'Vehicle Color' to category...done!


In [6]:
# Same per-column measurement (whole MiB) now that every column is categorical.
bytes_per_mib = 1024 * 1024
df.memory_usage(deep=True) // bytes_per_mib

Index                     0
Plate ID                349
Registration State       11
Vehicle Body Type        23
Vehicle Make             24
Violation Time           23
Street Name              53
Violation Legal Code     11
Vehicle Color            23
dtype: int64

In [7]:
# Peek at the first five rows to confirm the values look unchanged after the
# conversion to categories.
df.head(5)

Unnamed: 0,Plate ID,Registration State,Vehicle Body Type,Vehicle Make,Violation Time,Street Name,Violation Legal Code,Vehicle Color
0,J58JKX,NJ,SDN,HONDA,0523P,43 ST,,BK
1,KRE6058,PA,SUBN,ME/BE,0428P,UNION ST,,BLK
2,444326R,NJ,SDN,LEXUS,0625A,CLERMONT AVENUE,,BLACK
3,F728330,OH,SDN,CHEVR,1106A,DIVISION AVE,,
4,FMY9090,NY,SUBN,JEEP,1253A,GRAND ST,,GREY


# Extension
Read only the first 10_000 lines from the CSV file but all of the columns.
Show the 10 columns that will most likely benefit the most from using categories.

In [23]:
# Sample the first 10,000 rows but keep every column this time.
df10k = pd.read_csv("../data/nyc-parking-violations-2020.csv", nrows=10_000)

# Keep only the columns pandas stored as `object`. There are 25 of them, and
# these are the ones expected to use up the most memory.
column_dtypes = df10k.dtypes
otypes = column_dtypes[column_dtypes == object]
otypes

Plate ID                             object
Registration State                   object
Plate Type                           object
Issue Date                           object
Vehicle Body Type                    object
Vehicle Make                         object
Issuing Agency                       object
Issuer Command                       object
Issuer Squad                         object
Violation Time                       object
Time First Observed                  object
Violation County                     object
Violation In Front Of Or Opposite    object
House Number                         object
Street Name                          object
Intersecting Street                  object
Sub Division                         object
Violation Legal Code                 object
Days Parking In Effect               object
From Hours In Effect                 object
To Hours In Effect                   object
Vehicle Color                        object
Meter Number                    

In [24]:
# For contrast: the columns that were NOT read as `object` (the numeric ones).
df10k.dtypes.loc[lambda dtypes: dtypes != object]

Summons Number                         int64
Violation Code                         int64
Street Code1                           int64
Street Code2                           int64
Street Code3                           int64
Vehicle Expiration Date                int64
Violation Location                   float64
Violation Precinct                     int64
Issuer Precinct                        int64
Issuer Code                            int64
Date First Observed                    int64
Law Section                            int64
Unregistered Vehicle?                float64
Vehicle Year                           int64
Feet From Curb                         int64
No Standing or Stopping Violation    float64
Hydrant Violation                    float64
Double Parking Violation             float64
dtype: object

In [25]:
# Prediction: for each object column, compute the number of non-null rows per
# distinct value. A high ratio means a few strings repeated many times, which
# is where categories should pay off most.
# Only the object columns have a `nunique` entry here, so after alignment every
# other column divides to NaN and is removed by dropna().
rows_per_unique = df10k.count() / df10k[otypes.index].nunique()
predicted_top_10 = rows_per_unique.dropna().sort_values(ascending=False).head(10)
predicted_top_10

Violation Description                5615.000000
Violation Legal Code                 5615.000000
Violation County                     1086.333333
Issuing Agency                        909.090909
Violation In Front Of Or Opposite     796.800000
Plate Type                            344.827586
Issuer Squad                          257.941176
Sub Division                          196.039216
Registration State                    192.307692
Days Parking In Effect                122.181818
dtype: float64

In [26]:
# Measure per-column memory before and after converting every column to a
# categorical, to check the actual savings.
#
# The conversion goes into a separate frame (`df10k_categorical`) instead of
# overwriting `df10k` column by column. That keeps `df10k` in its original
# dtypes, so this cell is idempotent: re-running it still compares raw vs.
# categorical. Converting in place would make a second run measure the
# already-categorical frame as "initial" and report no savings at all.
# DataFrame.astype("category") converts each column independently, so the
# per-column result matches a column-by-column conversion.
initial_memory = df10k.memory_usage(deep=True, index=False)
df10k_categorical = df10k.astype("category")
categorical_memory = df10k_categorical.memory_usage(deep=True, index=False)

In [27]:
# Categorical size as a percentage of the original size, per column.
# Smaller is better, so the ten smallest values are the ten biggest winners.
memory_ratio = categorical_memory / initial_memory
actual_top_10 = memory_ratio.sort_values().head(10) * 100
actual_top_10

Violation Description                1.969895
Violation County                     2.140093
Issuing Agency                       2.170000
Violation Legal Code                 2.412425
Plate Type                           2.419231
Violation In Front Of Or Opposite    2.660628
Issuer Squad                         2.813980
Registration State                   2.893333
Sub Division                         2.919663
Days Parking In Effect               3.117291
dtype: float64

In [28]:
# Compare the predicted savings against the actual savings.
# Only membership is compared, not rank: some positions differ, but the ten
# columns with the best actual savings are the same ten that were predicted.
set(predicted_top_10.index) == set(actual_top_10.index)

True

## Memory savings conclusion
The book simply looks at how many unique values there are, regardless of dtype:

```python
(df.count() / df.nunique()).sort_values(ascending=False).head(10)
```

This causes a number of numeric or boolean columns to surface in the "best replaced with categories" list.

In [29]:
# The naive approach from the book: rank *every* column, whatever its dtype,
# by non-null rows per distinct value and keep the top ten.
filename = "../data/nyc-parking-violations-2020.csv"
dfbook = pd.read_csv(filename, nrows=10_000)

column_list = (
    dfbook.count()
    .div(dfbook.nunique())
    .sort_values(ascending=False)
    .head(10)
)

# Look up the measured categorical/original size (in percent) for those ten
# columns, best savings first.
categorical_memory[column_list.index].div(
    initial_memory[column_list.index]
).sort_values() * 100

Violation Description                 1.969895
Violation County                      2.140093
Issuing Agency                        2.170000
Violation Legal Code                  2.412425
Plate Type                            2.419231
Violation In Front Of Or Opposite     2.660628
Unregistered Vehicle?                12.645000
Law Section                          12.665000
Feet From Curb                       12.995000
Date First Observed                  13.445000
dtype: float64

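So the top 6 items were text columns and saw huge memory savings: as categories they take roughly 2–3% of their original size. The `Unregistered Vehicle?`, `Law Section`, `Feet From Curb` and `Date First Observed` fields also shrank, but only to about 12–13% of their original size, a much smaller saving, because they were already stored as fixed-width numbers rather than strings: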
- `Unregistered Vehicle?`: `np.float64`
- `Law Section`: `np.int64`
- `Feet From Curb`: `np.int64`
- `Date First Observed`: `np.int64`