In [1]:
import pandas as pd
import numpy as np

# Cleaning data
Some types of data cleaning:
- Renaming columns or indexes
- Removing irrelevant columns
- Splitting or combining columns
- Removing non-data, repeated, or rows with missing data
- Replacing `NaN` data with values, or interpolating
- Standardising strings or fixing typos
- Removing whitespace
- Correcting datatypes
- Identifying and removing/isolating outliers

## How much?
- `shape[0]` and `count` can be used to determine how many `NaN` values there are, as `shape` includes those values and `count` does not.
- `s.isnull().sum()` also provides this when called on a series.
- `df.isnull().sum()` tells you how many `NaN` values are in each column
- `df.info()` will show some overall summary data, and will show counts if the size is below `pd.options.display.max_info_columns` or if passed `show_counts=True`

## pandas vs numpy
`isnull()` and `isna()` are the same in pandas, but are *different* from the numpy `np.isnan` method - use the pandas methods.

Also pandas uses `~` to do boolean inversion on series and dataframes, which is good to know.

In [None]:
df = pd.read_csv(
    "../data/nyc-parking-violations-2020.csv",
    usecols=[
        "Plate ID",
        "Registration State",
        "Vehicle Make",
        "Vehicle Color",
        "Violation Time",
        "Street Name",
    ],
)
org_rows = len(df)
df = df.dropna()
non_nan_rows = len(df)

print(f"{org_rows:,} original and {non_nan_rows:,} after dropping NaN values.")
print(
    f"At $100 a ticket this is ${(org_rows - non_nan_rows) * 100:,} of possibly contested revenue."
)

12,495,734 original and 12,048,375 after dropping NaN values.
At $100 a ticket this is $44,735,900 of possibly contested revenue.


In [None]:
# update this process to only happen if the licence plate, state, car make, or street name are missing
df = pd.read_csv(
    "../data/nyc-parking-violations-2020.csv",
    usecols=[
        "Plate ID",
        "Registration State",
        "Vehicle Make",
        "Vehicle Color",
        "Violation Time",
        "Street Name",
    ],
)
org_rows = len(df)
df = df[
    df["Plate ID"].notnull()
    & df["Registration State"].notnull()
    & df["Vehicle Make"].notnull()
    & df["Street Name"].notnull()
]
# apparently len(df.index) is a fair bit faster than just len() and also .shape or .size
non_nan_rows = len(df.index)

print(f"{org_rows:,} original and {non_nan_rows:,} after dropping NaN values.")
print(
    f"At $100 a ticket this is ${(org_rows - non_nan_rows) * 100:,} of possibly contested revenue."
)

# a nicer way of doing this for the subset is this:
# df = df.dropna(subset=["Plate ID", "Registration State", "Vehicle Make", "Street Name"])
# this also comes with an additional keyword argument `thresh` which means it'll only happen
# if the number of na values is greater than or equal to the threshold value:
# df = df.dropna(subset=["Plate ID", "Registration State", "Vehicle Make", "Street Name"], thresh=3)

12,495,734 original and 12,431,949 after dropping NaN values.
At $100 a ticket this is $6,378,500 of possibly contested revenue.


In [9]:
# only going to drop Plate ID, Registration State, and Street Name invalid values this time
df = pd.read_csv(
    "../data/nyc-parking-violations-2020.csv",
    usecols=[
        "Plate ID",
        "Registration State",
        "Vehicle Make",
        "Vehicle Color",
        "Violation Time",
        "Street Name",
    ],
)
org_rows = len(df.index)
df = df.dropna(subset=["Plate ID", "Registration State", "Street Name"])
non_nan_rows = len(df.index)

print(f"{org_rows:,} original and {non_nan_rows:,} after dropping NaN values.")
print(
    f"At $100 a ticket this is ${(org_rows - non_nan_rows) * 100:,} of possibly contested revenue."
)

12,495,734 original and 12,494,116 after dropping NaN values.
At $100 a ticket this is $161,800 of possibly contested revenue.


# Extension questions
1. How many rows would be dropped if we wanted at least three of Plate ID, Registration State, Vehicle Make, and Street Name to be invalid?
2. Which of the columns you've imported has the greatest number of `NaN` values? Is this a problem?
3. Null data is bad, but there is plenty of bad non-null data too. For example many cars with BLANKPLATE were ticketed. Turn these into NaN values and rerun the previous query.

In [10]:
df = pd.read_csv(
    "../data/nyc-parking-violations-2020.csv",
    usecols=[
        "Plate ID",
        "Registration State",
        "Vehicle Make",
        "Vehicle Color",
        "Violation Time",
        "Street Name",
    ],
)

In [None]:
# 1. drop rows where at least three of the values are NaN
org_rows = len(df.index)
thresh_df = df.dropna(
    subset=["Plate ID", "Registration State", "Vehicle Make", "Street Name"], thresh=3
)
non_nan_rows = len(thresh_df.index)
print(
    f"{org_rows:,} original and {non_nan_rows:,} after dropping NaN values. Delta of {org_rows - non_nan_rows:,}"
)

12,495,734 original and 12,495,481 after dropping NaN values. Delta of 253


In [None]:
df.isna().sum()
# large number of na values on Vehicle Color isn't really a problem as Plate ID, location and make are
# far more important

Plate ID                 202
Registration State         0
Vehicle Make           62420
Violation Time           278
Street Name             1417
Vehicle Color         391982
dtype: int64

In [None]:
df["Plate ID"][df["Plate ID"] == "BLANKPLATE"] = np.nan
df.isna().sum()
# close to 9000 blank plates make a bigger dent in revenue

Plate ID                9084
Registration State         0
Vehicle Make           62420
Violation Time           278
Street Name             1417
Vehicle Color         391982
dtype: int64