In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import datetime as DT
import sys
from csvvalidator import *
import os

pd.set_option('display.float_format', lambda x: '%.3f' % x)



In [2]:
#Validate data
field_names = (
               'Ticket number',
               'Issue Date',
               'Latitude',
               'Longitude',
               'Issue time',
               'Violation Description'
               )
validator = CSVValidator(field_names)
# basic header and record length checks
validator.add_header_check('EX1', 'bad header')
validator.add_record_length_check('EX2', 'unexpected record length')

In [3]:
df.head()

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,Plate Expiry Date,VIN,Make,Body Style,...,Route,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude,Agency Description,Color Description,Body Style Description
0,4555162286,09/14/2022,1343,,0.0,CA,202305,,MAZD,PA,...,,54,80.56E1,WHITE ZONE,58,6458719.509,1859071.049,54 - DOT - HOLLYWOOD,GREY,PASSENGER CAR
1,1120740482,01/30/2022,1030,,,CA,202103,,RAM,TR,...,5A85,1,,,93,99999.0,99999.0,WESTERN,BLACK,TRAILER
2,1111081311,06/09/2022,808,,,CA,202101,,TOYO,SU,...,00561,55,8056E4,,93,6472422.966,1748169.243,55 - DOT - SOUTHERN,,
3,4555248633,09/06/2022,904,,0.0,CA,201910,,OTHR,PA,...,,51,5204A-,DISPLAY OF TABS,25,6454803.494,1840042.963,51 - DOT - WESTERN,TAN,PASSENGER CAR
4,4549605012,05/26/2022,1656,,0.0,CA,202211,,HOND,PA,...,0402A,54,80.56E4+,RED ZONE,93,6472951.712,1846223.696,54 - DOT - HOLLYWOOD,GREY,PASSENGER CAR


In [4]:
#violation Description
assert pd.api.types.is_string_dtype(df['Violation Description'])
print('Violation Description is OK')

#ticket number
assert pd.api.types.is_string_dtype(df['Ticket number'])
print('Ticket number is OK')

#Lat/Lon
assert pd.api.types.is_float_dtype(df['Latitude'])
print('Latitude is OK')
assert pd.api.types.is_float_dtype(df['Longitude'])
print('Longitude is OK')

#Issue Time
assert pd.api.types.is_integer_dtype(df['Issue time'])
print('Issue time is OK')

Violation Description is OK
Ticket number is OK
Latitude is OK
Longitude is OK
Issue time is OK


In [5]:
#updating formatting so that I can translate issue date to datetime
df['Issue Date'] = df[df['Issue Date'].notnull()]['Issue Date'].apply(lambda x: x.split('T')[0])
df['Issue Date'] = pd.to_datetime(df['Issue Date'], infer_datetime_format=True)

#limiting dataset so it's easy to work with
today = pd.Timestamp('today').normalize()
df.head()

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,Plate Expiry Date,VIN,Make,Body Style,...,Route,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude,Agency Description,Color Description,Body Style Description
0,4555162286,2022-09-14,1343,,0.0,CA,202305,,MAZD,PA,...,,54,80.56E1,WHITE ZONE,58,6458719.509,1859071.049,54 - DOT - HOLLYWOOD,GREY,PASSENGER CAR
1,1120740482,2022-01-30,1030,,,CA,202103,,RAM,TR,...,5A85,1,,,93,99999.0,99999.0,WESTERN,BLACK,TRAILER
2,1111081311,2022-06-09,808,,,CA,202101,,TOYO,SU,...,00561,55,8056E4,,93,6472422.966,1748169.243,55 - DOT - SOUTHERN,,
3,4555248633,2022-09-06,904,,0.0,CA,201910,,OTHR,PA,...,,51,5204A-,DISPLAY OF TABS,25,6454803.494,1840042.963,51 - DOT - WESTERN,TAN,PASSENGER CAR
4,4549605012,2022-05-26,1656,,0.0,CA,202211,,HOND,PA,...,0402A,54,80.56E4+,RED ZONE,93,6472951.712,1846223.696,54 - DOT - HOLLYWOOD,GREY,PASSENGER CAR


In [6]:
#pad anything that is less than 4 digits then isolate just the hours
df['Issue time'] = df['Issue time'].astype(str)
df['Issue time'] = df['Issue time'].apply(lambda x: x.split('.')[0])
df['Issue time'] = df[df['Issue time'].notnull()]['Issue time'].apply(lambda x: x.zfill(4))
df['Issue Hour'] = df[df['Issue time']!='0nan']['Issue time'].apply(lambda x: DT.datetime.strptime(x,'%H%M').hour)

#clean lat lon
df['Latitude'] = np.where(df['Latitude']==99999.000, np.nan, df['Latitude'])
df['Longitude'] = np.where(df['Longitude']==99999.000, np.nan, df['Longitude'])

#string for ticketnum
df['Ticket number'] = df['Ticket number'].astype(str)

In [7]:
#Updating the Lat Lon
import pyproj
pm = '+proj=lcc +lat_1=34.03333333333333 +lat_2=35.46666666666667 +lat_0=33.5 +lon_0=-118 +x_0=2000000 +y_0=500000.0000000002 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs'
x1m,y1m = df['Latitude'].values, df['Longitude'].values
x2m,y2m = pyproj.transform(pyproj.Proj(pm,preserve_units = True), pyproj.Proj("+init=epsg:4326"), x1m,y1m)
df['Latitude']=x2m
df['Longitude']=y2m

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  x2m,y2m = pyproj.transform(pyproj.Proj(pm,preserve_units = True), pyproj.Proj("+init=epsg:4326"), x1m,y1m)


In [3]:
# df.to_csv('cleaned_2022_parking_citation.csv', index=False)
df = pd.read_csv("cleaned_2022_parking_citation.csv")

  df = pd.read_csv("cleaned_2022_parking_citation.csv")


In [4]:
df.columns

Index(['Ticket number', 'Issue Date', 'Issue time', 'Meter Id', 'Marked Time',
       'RP State Plate', 'Plate Expiry Date', 'VIN', 'Make', 'Body Style',
       'Color', 'Location', 'Route', 'Agency', 'Violation code',
       'Violation Description', 'Fine amount', 'Latitude', 'Longitude',
       'Agency Description', 'Color Description', 'Body Style Description',
       'Issue Hour'],
      dtype='object')

In [6]:
df['Violation Description'].unique()

array(['WHITE ZONE', nan, 'DISPLAY OF TABS', 'RED ZONE', 'METER EXP.',
       'NO PARK/STREET CLEAN', 'DOUBLE PARKING', 'OVNIGHT PRK W/OUT PE',
       'DISPLAY OF PLATES', 'NO STOPPING/ANTI-GRIDLOCK ZONE',
       'PARKED OVER TIME LIMIT', 'NO STOP/STANDING', 'PARKED IN PARKWAY',
       'BLK BIKE PATH OR LANE', 'FIRE HYDRANT', 'YELLOW ZONE',
       'OFF STR/OVERTIME/MTR', 'PARKED ON SIDEWALK', 'BLOCKING DRIVEWAY',
       'EXCEED 72HRS-ST', 'PREFERENTIAL PARKING', 'NO STOP/STAND',
       '18 IN. CURB/1 WAY', 'NO PARKING', 'PARKED IN BUS ZONE',
       'STANDNG IN ALLEY', '18 IN. CURB/2 WAY', 'STOP/STAND PROHIBIT',
       'PRK IN ELEC VEH SPACE', 'DISABLED PARKING/NO DP ID',
       'COMM TRAILER/22 FT.', 'PRIVATE PROPERTY', 'WITHIN INTERSECTION',
       'ELECTRIC CHARGING STATION SPACES', 'COMM VEH OVER TIME LIMIT',
       'OUTSIDE LINES/METER', 'PK OVERSIZ',
       'DISABLED PARKING/CROSS HATCH', 'DP-BLKNG ACCESS RAMP',
       'PK TRAILER', 'RESTRICTED TAXI ZONE', '18 IN/CURB/COMM VEH',
 

In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd

# Load your data
data = pd.read_csv("cleaned_2022_parking_citation.csv")

# Preprocess the data (similar to what we did above)
data = data.dropna(subset=['Violation Description'])
features = ['Issue time', 'RP State Plate', 'Make', 'Body Style', 'Color Description', 'Agency Description', 'Issue Hour']
X = data[features]
X = X.fillna('Unknown')

label_encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

scaler = StandardScaler()
X[['Issue time', 'Issue Hour']] = scaler.fit_transform(X[['Issue time', 'Issue Hour']])
y = LabelEncoder().fit_transform(data['Violation Description'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost classifier
clf = xgb.XGBClassifier(objective='multi:softmax', num_class=len(set(y_train)), random_state=42, use_label_encoder=False)
clf.fit(X_train, y_train, eval_metric='mlogloss')

# Get training accuracy (you can also get testing accuracy if you wish)
train_accuracy = clf.score(X_train, y_train)
print(f"Training Accuracy: {train_accuracy}")

  data = pd.read_csv("cleaned_2022_parking_citation.csv")


Training Accuracy: 0.2481629888819911


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 1. Data Preparation
# ---------------------
data = pd.read_csv("cleaned_2022_parking_citation.csv")
data = data.dropna(subset=['Violation Description'])
features = ['Issue time', 'RP State Plate', 'Make', 'Body Style', 'Color Description', 'Agency Description', 'Issue Hour']
X = data[features]
X = X.fillna('Unknown')

label_encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

scaler = StandardScaler()
X[['Issue time', 'Issue Hour']] = scaler.fit_transform(X[['Issue time', 'Issue Hour']])
y = LabelEncoder().fit_transform(data['Violation Description'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 2. Model Definition
# ---------------------
class SimpleNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.layer3(x)
        return x

model = SimpleNN(X_train.shape[1], len(set(y_train)))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 3. Training
# ------------
epochs = 10
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# 4. Evaluation
# ---------------
with torch.no_grad():
    y_pred = model(X_train_tensor)
    _, predicted = torch.max(y_pred, 1)
    accuracy = (predicted == y_train_tensor).sum().item() / len(y_train_tensor)
    print(f"Training Accuracy: {accuracy}")



In [26]:
# import folium

# LA_COORDINATES = (34.05, -118.24)

# # create empty map zoomed in on San Francisco
# map = folium.Map(location=LA_COORDINATES, zoom_start=10) 

# # add a marker for every record in the filtered data, use a clustered view
# from folium.plugins import FastMarkerCluster
# FastMarkerCluster(data=list(zip(df[(df['Issue Date']>week_ago) & (df['Longitude'].notnull())]['Longitude'],(df[(df['Issue Date']>week_ago) & (df['Latitude'].notnull())]['Latitude'])))).add_to(map)

# folium.LayerControl().add_to(map)
    
# display(map)

In [27]:
# #plot out scatter (line) graph of number of tickets 

# df.set_index(df["Issue Date"],inplace=True)

# #Creating the plot
# data = [go.Scatter(x=df['Ticket number'].resample('D').count().truncate(before=month_ago).index, y=df['Ticket number'].resample('D').count().truncate(before=month_ago))]

# # specify the layout of our figure
# layout = dict(title = "Daily Number of Incidents",
#               xaxis= dict(title= 'Date',ticklen= 5,zeroline= False))

# # create and show our figure
# fig = dict(data = data, layout = layout)
# iplot(fig)

In [28]:
# #line chart of the counts by time during the last month
# data = [go.Scatter(x=df[['Ticket number','Issue Hour']].groupby('Issue Hour').count().index, y=df[['Ticket number','Issue Hour']].groupby('Issue Hour').count()['Ticket number'])]

# # specify the layout of our figure
# layout = dict(title = "Time of Incidents",
#               xaxis= dict(title= 'Hour',ticklen= 5,zeroline= False))

# # create and show our figure
# fig = dict(data = data, layout = layout)
# iplot(fig)

In [29]:
# #Top 10 reason codes during the last month
# data = [go.Bar(x=df.groupby('Violation Description')['Ticket number'].count().sort_values(ascending = False)[:10].index, y=df.groupby('Violation Description')['Ticket number'].count().sort_values(ascending = False)[:10])]

# # specify the layout of our figure
# layout = dict(title = "Violations by Reason",
#               xaxis= dict(ticklen= 5,zeroline= False))

# # create and show our figure
# fig = dict(data = data, layout = layout)
# iplot(fig)