# Step 1: Load the 2014 Data (10k Rows)

In [10]:
import pandas as pd
import numpy as np

# Load 10k rows from 2014 file
file_2014 = "Parking_Violations_Issued_-_Fiscal_Year_2014.csv"
df = pd.read_csv(file_2014, nrows=10000, low_memory=False)  # low_memory=False avoids mixed-type warnings
print(f"Loaded {len(df)} rows.")

Loaded 10000 rows.


# Step 2: Initial Data Inspection
Let’s understand the data structure before cleaning:

In [2]:
# Quick overview
print("=== Data Shape ===")
print(df.shape)  # (rows, columns)

print("\n=== Column Names ===")
print(df.columns.tolist())  # List all columns

print("\n=== Sample Rows ===")
display(df.head(3))  # Show first 3 rows

print("\n=== Missing Values ===")
print(df.isna().sum())  # Count missing values per column

=== Data Shape ===
(10000, 51)

=== Column Names ===
['Summons Number', 'Plate ID', 'Registration State', 'Plate Type', 'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make', 'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3', 'Vehicle Expiration Date', 'Violation Location', 'Violation Precinct', 'Issuer Precinct', 'Issuer Code', 'Issuer Command', 'Issuer Squad', 'Violation Time', 'Time First Observed', 'Violation County', 'Violation In Front Of Or Opposite', 'House Number', 'Street Name', 'Intersecting Street', 'Date First Observed', 'Law Section', 'Sub Division', 'Violation Legal Code', 'Days Parking In Effect    ', 'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color', 'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number', 'Feet From Curb', 'Violation Post Code', 'Violation Description', 'No Standing or Stopping Violation', 'Hydrant Violation', 'Double Parking Violation', 'Latitude', 'Longitude', 'Community Board', 'Community Council ', 'Census 

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,Hydrant Violation,Double Parking Violation,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,1283294138,GBB9093,NY,PAS,08/04/2013,46,SUBN,AUDI,P,37250,...,,,,,,,,,,
1,1283294151,62416MB,NY,COM,08/04/2013,46,VAN,FORD,P,37290,...,,,,,,,,,,
2,1283294163,78755JZ,NY,COM,08/05/2013,46,P-U,CHEVR,P,37030,...,,,,,,,,,,



=== Missing Values ===
Summons Number                           0
Plate ID                                 1
Registration State                       0
Plate Type                               0
Issue Date                               0
Violation Code                           0
Vehicle Body Type                      263
Vehicle Make                           692
Issuing Agency                           0
Street Code1                             0
Street Code2                             0
Street Code3                             0
Vehicle Expiration Date                  0
Violation Location                     102
Violation Precinct                       0
Issuer Precinct                          0
Issuer Code                              0
Issuer Command                           0
Issuer Squad                             0
Violation Time                           4
Time First Observed                   9209
Violation County                       398
Violation In Front Of Or Oppos

# Step 3: Targeted Cleaning Tasks
We’ll focus on these key areas (adjust based on your inspection):

### 1. Standardize Column Names

In [3]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
print("Renamed columns:", df.columns.tolist())

Renamed columns: ['summons_number', 'plate_id', 'registration_state', 'plate_type', 'issue_date', 'violation_code', 'vehicle_body_type', 'vehicle_make', 'issuing_agency', 'street_code1', 'street_code2', 'street_code3', 'vehicle_expiration_date', 'violation_location', 'violation_precinct', 'issuer_precinct', 'issuer_code', 'issuer_command', 'issuer_squad', 'violation_time', 'time_first_observed', 'violation_county', 'violation_in_front_of_or_opposite', 'house_number', 'street_name', 'intersecting_street', 'date_first_observed', 'law_section', 'sub_division', 'violation_legal_code', 'days_parking_in_effect', 'from_hours_in_effect', 'to_hours_in_effect', 'vehicle_color', 'unregistered_vehicle?', 'vehicle_year', 'meter_number', 'feet_from_curb', 'violation_post_code', 'violation_description', 'no_standing_or_stopping_violation', 'hydrant_violation', 'double_parking_violation', 'latitude', 'longitude', 'community_board', 'community_council', 'census_tract', 'bin', 'bbl', 'nta']


### 2. Clean Dates (issue_date)
Convert to datetime and extract useful features:

In [4]:
df['issue_date'] = pd.to_datetime(df['issue_date'], errors='coerce')  # 'coerce' invalid dates to NaT
df['issue_year'] = df['issue_date'].dt.year
df['issue_month'] = df['issue_date'].dt.month_name()
print("\nDate cleaning done. Added year/month columns.")


Date cleaning done. Added year/month columns.


### 3. Clean violation_time (Advanced Handling)
Convert messy time formats (e.g., 1030A = 10:30 AM):

In [5]:
# Ensure violation_time is string type
df['violation_time'] = df['violation_time'].astype(str).str.strip()

# Extract components - now returns a DataFrame with named columns
time_parts = df['violation_time'].str.extract(r'(?P<hour>\d{1,2})(?P<minute>\d{2})(?P<period>[AP]?)')

# Convert to numeric (keeping NaN where invalid)
time_parts['hour'] = pd.to_numeric(time_parts['hour'], errors='coerce')
time_parts['minute'] = pd.to_numeric(time_parts['minute'], errors='coerce')

# Clean period (AM/PM) - default to AM if missing
time_parts['period'] = time_parts['period'].str.upper().replace({'': 'A', None: 'A'})

# Convert to 24-hour format
time_parts.loc[time_parts['period'] == 'P', 'hour'] += 12
time_parts['hour'] = time_parts['hour'].replace(24, 0)  # Handle midnight

# Combine with issue_date to create datetime
df['violation_datetime'] = pd.to_datetime(
    df['issue_date'].dt.date.astype(str) + ' ' + 
    time_parts['hour'].astype(str) + ':' + 
    time_parts['minute'].astype(str),
    errors='coerce'
)

# Optional: Add cleaned time components back to dataframe
df['violation_hour'] = time_parts['hour']
df['violation_minute'] = time_parts['minute']

# Show results
print("Successfully cleaned violation times:")
print(df[['violation_time', 'violation_hour', 'violation_minute', 'violation_datetime']].head())

  df['violation_datetime'] = pd.to_datetime(


Successfully cleaned violation times:
  violation_time  violation_hour  violation_minute  violation_datetime
0          0752A             7.0              52.0 2013-08-04 07:52:00
1          1240P             0.0              40.0 2013-08-04 00:40:00
2          1243P             0.0              43.0 2013-08-05 00:43:00
3          0232P            14.0              32.0 2013-08-05 14:32:00
4          1239P             0.0              39.0 2013-08-08 00:39:00


### 4. Clean registration_state
Keep only valid US state codes (2-letter abbreviations):

In [6]:
valid_states = ['AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL','IN','IA',
                'KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ',
                'NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT',
                'VA','WA','WV','WI','WY','DC','PR','VI']

df['registration_state'] = df['registration_state'].str.upper().str.strip()
df['registration_state'] = df['registration_state'].where(
    df['registration_state'].isin(valid_states), 'OTHER'
)
print("\nStates cleaned. Invalid values marked as 'OTHER'.")


States cleaned. Invalid values marked as 'OTHER'.


### 5. Handle Missing Data
Drop columns with >70% missing values:

In [7]:
missing_pct = df.isna().mean()
cols_to_drop = missing_pct[missing_pct > 0.7].index.tolist()
df = df.drop(columns=cols_to_drop)
print(f"Dropped columns with >70% missing: {cols_to_drop}")

Dropped columns with >70% missing: ['time_first_observed', 'intersecting_street', 'violation_legal_code', 'violation_post_code', 'violation_description', 'no_standing_or_stopping_violation', 'hydrant_violation', 'double_parking_violation', 'latitude', 'longitude', 'community_board', 'community_council', 'census_tract', 'bin', 'bbl', 'nta']


# Step 4: Save Cleaned Data

In [8]:
output_file = "cleaned_2014_sample.csv"
df.to_csv(output_file, index=False)
print(f"\nSaved cleaned data to {output_file}")


Saved cleaned data to cleaned_2014_sample.csv


# Step 5: Verify Results
Check the cleaned data:

In [9]:
print("\n=== Final Data Summary ===")
print(f"Columns: {df.shape[1]}, Rows: {df.shape[0]}")
print("\nData Types:")
print(df.dtypes)
print("\nMissing Values:")
print(df.isna().sum())


=== Final Data Summary ===
Columns: 40, Rows: 10000

Data Types:
summons_number                                int64
plate_id                                     object
registration_state                           object
plate_type                                   object
issue_date                           datetime64[ns]
violation_code                                int64
vehicle_body_type                            object
vehicle_make                                 object
issuing_agency                               object
street_code1                                  int64
street_code2                                  int64
street_code3                                  int64
vehicle_expiration_date                       int64
violation_location                          float64
violation_precinct                            int64
issuer_precinct                               int64
issuer_code                                   int64
issuer_command                               objec

# Advanced Cleaning

### ✅ 1. Import file

In [19]:
import pandas as pd
import numpy as np

df = pd.read_csv("cleaned_2014_sample.csv")

### ✅ 2. Fix Date and Time Columns

In [20]:
# Convert to datetime
df['issue_date'] = pd.to_datetime(df['issue_date'], errors='coerce')
df['violation_datetime'] = pd.to_datetime(df['violation_datetime'], errors='coerce')

### ✅ 3. Drop or Fill Missing Values

**3.1 Critical IDs — Drop rows if these are missing**

In [21]:
df = df.dropna(subset=['plate_id', 'street_name'])

**3.2 Optional Fields — Fill with 'Unknown' or '0' or mode**

In [22]:
df['vehicle_body_type'] = df['vehicle_body_type'].fillna('UNKNOWN')
df['vehicle_make'] = df['vehicle_make'].fillna('UNKNOWN')
df['violation_location'] = df['violation_location'].fillna(0)
df['violation_county'] = df['violation_county'].fillna('UNKNOWN')
df['violation_in_front_of_or_opposite'] = df['violation_in_front_of_or_opposite'].fillna('UNKNOWN')
df['house_number'] = df['house_number'].fillna('UNKNOWN')
df['sub_division'] = df['sub_division'].fillna('UNKNOWN')
df['vehicle_color'] = df['vehicle_color'].fillna('UNKNOWN')
df['meter_number'] = df['meter_number'].fillna('UNKNOWN')

**3.3 Time Columns — Drop rows with NaT or fill with mode**

In [23]:
df['violation_time'] = df['violation_time'].fillna('00:00A')  # Placeholder
df['violation_hour'] = df['violation_hour'].fillna(df['violation_hour'].mode()[0])
df['violation_minute'] = df['violation_minute'].fillna(df['violation_minute'].mode()[0])
df['violation_datetime'] = df['violation_datetime'].fillna(df['issue_date'])  # fallback


### ✅ 4. Fix Outliers

**4.1 Vehicle Year: Remove rows with year < 1980 or > current year**

In [24]:
current_year = pd.Timestamp.now().year
df = df[(df['vehicle_year'] >= 1980) & (df['vehicle_year'] <= current_year)]

**4.2 Violation Hour: Max = 23**

In [26]:
df = df[df['violation_hour'] <= 23]

### ✅ 5. Remove Duplicates

In [27]:
df = df.drop_duplicates(subset=['summons_number'])

### ✅ 6. Optional: Save Cleaned File

In [28]:
df.to_csv("cleaned_2014_sample_final.csv", index=False)