In [1]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv(
    "../data/nyc-parking-violations-2020.csv",
    usecols=[
        "Plate ID",
        "Registration State",
        "Vehicle Make",
        "Vehicle Color",
        "Street Name",
    ],
)

In [3]:
# how many different colours are there?
df["Vehicle Color"].describe()

count     12103752
unique        1896
top             WH
freq       2344858
Name: Vehicle Color, dtype: object

In [20]:
# let's make a dictionary of replacements for colour names
replacements = {
    "WH": "WHITE",  # whites
    "WHT": "WHITE",
    "WT": "WHITE",
    "WT.": "WHITE",
    "WHI": "WHITE",
    "WH.": "WHITE",
    "W": "WHITE",
    "WHT.": "WHITE",
    "GY": "GREY",  # greys
    "GY.": "GREY",
    "GRAY": "GREY",
    "GRY": "GREY",
    "GREY.": "GREY",
    "GRAY.": "GREY",
    "GRY.": "GREEN",
    "BL": "BLACK",  # blacks
    "BLK": "BLACK",
    "BLK.": "BLACK",
    "BK": "BLACK",
    "BK.": "BLACK",
    "BL.": "BLACK",
    "OR": "ORANGE",  # oranges
    "OR.": "ORANGE",
    "ORANG": "ORANGE",
    "YW": "YELLOW",  # yellows
    "YELLO": "YELLOW",
    "YELL": "YELLOW",
    "YEL": "YELLOW",
    "BR": "BROWN",  # browns
    "BRN": "BROWN",
    "BRO": "BROWN",
    "BN": "BROWN",
    "BRW": "BROWN",
    "TN": "TAN",  # tans
    "TN.": "TAN",
    "SILVE": "SILVER",  # silvers
    "SIL": "SILVER",
    "SILV": "SILVER",
    "SL.": "SILVER",
    "SL": "SILVER",
    "RD": "RED",  # reds
    "RD.": "RED",
    "RED.": "RED",
    "BLU": "BLUE",  # blues
    "BLUE.": "BLUE",
    "LTGY": "LIGHT GREY",  # light grey
    "LTG": "LIGHT GREY",
    "GRN": "GREEN",  # greens
    "GN": "GREEN",
    "GN.": "GREEN",
    "PURPL": "PURPLE",  # purples
    "GLD": "GOLD",  # golds
    "MAROO": "MAROON",  # maroons
    "MR": "MAROON",
    "MAR": "MAROON",
}

# replace the colour variants with the canonical colour name
df["Vehicle Color"] = df["Vehicle Color"].replace(replacements)

# top 30 colours and variations
print(len(df["Vehicle Color"].unique()))

1848


In [21]:
# let's look at the top 50 now
print(df["Vehicle Color"].value_counts().head(50))

Vehicle Color
WHITE         3535435
BLACK         3440479
GREY          2727585
RED            654260
BROWN          406804
SILVER         236871
BLUE           196176
GR             182929
TAN            142585
YELLOW         134081
GREEN           70140
OTHER           60245
GL              54851
MAROON          45104
ORANGE          39936
GOLD            23167
LIGHT GREY      23148
LT/              8976
PR               7518
DK/              7498
DKGY             6004
GYGY             5039
B                4145
DKG              3702
PURPLE           3635
BKGY             3504
WHBL             3489
DKBL             2912
WHGY             2381
UNKNO            2205
G                2064
DKB              1976
WH/              1908
BLGY             1901
BLW              1456
GY/              1380
DKRD             1318
LTBL             1251
GYBL             1155
BW                979
BKBL              964
LTB               961
GYGR              956
BG.               942
RDW               

- Pretty consistent shortening by removing vowels.
- Also variants that have the colour or shorter colour with a dot following.
- Lots of combinations of colours or variants (`"DKGR"` presumable for dark green)

There are two problems here, first ambiguous colour abbreviations (like `"B"`), and a
data collection mechanism that allows for the ambiguity to begin with, like allowing for
"purple" as well as "lavender" or multiple variations on "light grey". Does it need
to be this flexible? Could we get away with just roughly bucketing cars into colour
presets, with an optional details field for something like decals or strange colours.

# Extension questions
1. For the `"Vehicle Make"` column, write a function that, given a value, cleans up by removing punctuation, changes to all caps. Compare number of makes before and after.
2. How standardised are the `"Street Name"` values in the data set? What changes could be applied to change things?
3. Would you need to clean up the `"Registration State"` column? Why or why not?

In [22]:
# 1. data cleaning function for Vehicle Make
# first count the number of unique vehicle makes beforehand
df["Vehicle Make"].value_counts()

Vehicle Make
TOYOT    1395273
HONDA    1343265
FORD     1328063
NISSA    1119587
CHEVR     711464
          ...   
BEAVE          1
NELSO          1
HOWBY          1
BONEE          1
KIA (          1
Name: count, Length: 5210, dtype: int64

In [None]:
from string import punctuation

make_changes = 0


def clean_makes(val):
    nval = val.strip(" " + punctuation).upper()
    global make_changes
    if nval != val:
        make_changes += 1
    return nval


print(df["Vehicle Make"].dropna().apply(clean_makes).value_counts())
print(f"Total changes: {make_changes}")

Vehicle Make
TOYOT    1396714
HONDA    1344479
FORD     1329491
NISSA    1121782
CHEVR     712494
          ...   
PONTC          1
NASER          1
LO/RO          1
WENT           1
KASAK          1
Name: count, Length: 5137, dtype: int64
Total changes: 12403


In [45]:
# 2. Investigate how standardised the street name values are
df["Street Name"] = df["Street Name"].fillna("unknown")
print(f"Different street names: {df['Street Name'].shape[0]:,}")
df["Street Name"].value_counts()

Different street names: 12,495,734


Street Name
Broadway                180225
3rd Ave                 133003
5th Ave                  78211
2nd Ave                  75533
Madison Ave              75419
                         ...  
I/O W 164 ST                 1
HUTINSON RIVER PARKW         1
BEACH 58                     1
HUTINSON RIVER PARK          1
W/S/O 182 STREET             1
Name: count, Length: 57758, dtype: int64

In [None]:
# let's look at how many abbreviations we have by splitting and finding the end of each street name
# then doing a value count
df["Street Name"].dropna().apply(lambda x: x.split()[-1].upper()).value_counts().head(
    30
)

Street Name
ST          3782443
AVE         2948683
BLVD         341386
@            334026
RD           259389
BROADWAY     208559
PL           169125
S            149796
AV           124172
DR           112268
B             95725
1             87681
E             85117
STREET        84943
A             82451
CL            67215
W             65981
BLV           58935
CR            53155
EX            50283
T             49168
AVENUE        48616
PKWY          48048
PKY           47485
GL            47090
J             42941
KATO          42727
U             41780
ROC           41674
SH            39244
Name: count, dtype: int64

In [None]:
# let's clean up the types of streets and normalise things to upper case
transforms = {
    "AV": "AVE",
    "STREET": "ST",
    "BLV": "BLVD",
    "AVENUE": "AVE",
    "PKY": "PKWY",
}


def clean_streets(val):
    vals = val.upper().strip(punctuation).split()
    for k, v in transforms.items():
        if vals[-1] == k:
            vals[-1] = v
            break
    return " ".join(vals)


cleaned_streets = df["Street Name"].apply(clean_streets)
cleaned_streets.value_counts()

Street Name
BROADWAY            190996
3RD AVE             140783
5TH AVE              84356
MADISON AVE          81896
2ND AVE              80548
                     ...  
W/S C/O 44               1
W/S N/O 4TH              1
N/O E 50 ST              1
N/S MERTENSE AVE         1
W/S/O 182 ST             1
Name: count, Length: 46444, dtype: int64

This resulted in a reduction of around 12,000 street names.

In [49]:
# 3. Would the registration state column need cleaning?
df["Registration State"].value_counts()

Registration State
NY    9753643
NJ    1096110
PA     338779
FL     174056
CT     165205
       ...   
PE         18
SK          8
MX          7
NT          3
YT          2
Name: count, Length: 68, dtype: int64

68 different codes, which is 50 states plus some other regional stuff like Canada and territories?
Let's have a look.

In [None]:
state_codes = """Alabama: AL
Alaska: AK
Arizona: AZ
Arkansas: AR
California: CA
Colorado: CO
Connecticut: CT
Delaware: DE
District of Columbia: DC
Florida: FL
Georgia: GA
Hawaii: HI
Idaho: ID
Illinois: IL
Indiana: IN  
Iowa: IA
Kansas: KS
Kentucky: KY
Louisiana: LA
Maine: ME
Maryland: MD
Massachusetts: MA  
Michigan: MI
Minnesota: MN
Mississippi: MS
Missouri: MO
Montana: MT
Nebraska: NE
Nevada: NV
New Hampshire: NH
New Jersey: NJ
New Mexico: NM
New York: NY  
North Carolina: NC
North Dakota: ND
Ohio: OH
Oklahoma: OK
Oregon: OR
Pennsylvania: PA
Rhode Island: RI
South Carolina: SC
South Dakota: SD
Tennessee: TN
Texas: TX  
Utah: UT
Vermont: VT
Virginia: VA
Washington: WA
West Virginia: WV
Wisconsin: WI
Wyoming: WY
American Samoa: AS   
Guam: GU
Northern Mariana Islands: MP
Puerto Rico: PR
US Virgin Islands: VI"""
codes = [x.split(": ")[-1] for x in state_codes.split("\n")]

56


In [None]:
df["Registration State"][
    ~df["Registration State"].str.strip().isin(codes)
].value_counts()

Registration State
NY    9753643
IN     138824
MA      83726
TX      54600
99      24654
GV      21283
ON       5548
QB       3551
DP       2904
AB        175
NS        112
BC         99
NB         94
FO         55
MB         29
PE         18
SK          8
MX          7
NT          3
YT          2
Name: count, dtype: int64

Not sure why NY is in there, since it's a valid state code, but the 99 is a bit of a weird one, so could need investigation.