In [49]:
import pandas as pd
from csvvalidator import *

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [50]:
#Validate data
field_names = (
               'Ticket number',
               'Issue Date',
               'Latitude',
               'Longitude',
               'Issue time',
               'Violation Description'
               )
validator = CSVValidator(field_names)
# basic header and record length checks
validator.add_header_check('EX1', 'bad header')
validator.add_record_length_check('EX2', 'unexpected record length')

In [51]:
df = pd.read_csv('../parking_citation_2020_2022.csv')

  df = pd.read_csv('../parking_citation_2020_2022.csv')


In [52]:
#updating formatting so that I can translate issue date to datetime
df['Issue Date'] = df[df['Issue Date'].notnull()]['Issue Date'].apply(lambda x: x.split('T')[0])
df['Issue Date'] = pd.to_datetime(df['Issue Date'], infer_datetime_format=True)

In [53]:
#pad anything that is less than 4 digits then isolate just the hours
df['Issue time'] = df['Issue time'].astype(str)
df['Issue time'] = df['Issue time'].apply(lambda x: x.split('.')[0])
df['Issue time'] = df[df['Issue time'].notnull()]['Issue time'].apply(lambda x: x.zfill(4))
df['Issue Hour'] = df[df['Issue time']!='0nan']['Issue time'].apply(lambda x: DT.datetime.strptime(x,'%H%M').hour)

#clean lat lon
df['Latitude'] = np.where(df['Latitude']==99999.000, np.nan, df['Latitude'])
df['Longitude'] = np.where(df['Longitude']==99999.000, np.nan, df['Longitude'])

#string for ticketnum
df['Ticket number'] = df['Ticket number'].astype(str)

In [54]:
#Updating the Lat Lon
import pyproj
pm = '+proj=lcc +lat_1=34.03333333333333 +lat_2=35.46666666666667 +lat_0=33.5 +lon_0=-118 +x_0=2000000 +y_0=500000.0000000002 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs'
x1m,y1m = df['Latitude'].values, df['Longitude'].values
x2m,y2m = pyproj.transform(pyproj.Proj(pm,preserve_units = True), pyproj.Proj("+init=epsg:4326"), x1m,y1m)
df['Latitude']=x2m
df['Longitude']=y2m

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  x2m,y2m = pyproj.transform(pyproj.Proj(pm,preserve_units = True), pyproj.Proj("+init=epsg:4326"), x1m,y1m)


In [55]:
# df = pd.read_csv("../cleaned_2022_parking_citation.csv")

In [56]:
df.columns

Index(['Ticket number', 'Issue Date', 'Issue time', 'Meter Id', 'Marked Time',
       'RP State Plate', 'Plate Expiry Date', 'VIN', 'Make', 'Body Style',
       'Color', 'Location', 'Route', 'Agency', 'Violation code',
       'Violation Description', 'Fine amount', 'Latitude', 'Longitude',
       'Agency Description', 'Color Description', 'Body Style Description',
       'Issue Hour'],
      dtype='object')

In [57]:
df['Body Style Description'].value_counts(), df['Body Style'].value_counts()

(PASSENGER CAR    4806029
 PICK-UP TRUCK     165537
 VAN                91295
 COMMERCIAL         76050
 TRUCK              67966
 TRAILER            38114
 MOTOR HOME         20603
 BUS                 1680
 LIMOUSINE            239
 Name: Body Style Description, dtype: int64,
 PA    4806029
 PU     165537
 VN      91295
 TK      67966
 CM      64129
        ...   
 RB          1
 02          1
 FB          1
 RX          1
 AU          1
 Name: Body Style, Length: 133, dtype: int64)

In [58]:
df.shape

(5308775, 23)

In [63]:
data_df = df[[ 'RP State Plate', 'Make', 'Body Style', 'Color Description', 'Agency Description', 'Issue Hour', 'Violation Description']]
data_df.head()

Unnamed: 0,RP State Plate,Make,Body Style,Color Description,Agency Description,Issue Hour,Violation Description
0,CA,MAZD,PA,GREY,54 - DOT - HOLLYWOOD,13.0,WHITE ZONE
1,CA,LEXS,PA,BLACK,51 - DOT - WESTERN,16.0,
2,CA,MERZ,PA,BLACK,51 - DOT - WESTERN,16.0,
3,CA,PORS,PA,BLACK,55 - DOT - SOUTHERN,2.0,STANDNG IN ALLEY
4,CA,RROV,PA,GREY,51 - DOT - WESTERN,1.0,PARKED ON SIDEWALK


In [64]:
data_df = data_df[~data_df.isna().any(axis=1)]
data_df.shape

(4859213, 7)

In [46]:
from sklearn.cluster import KMeans

# Assuming your dataframe is named 'data' and has columns 'Latitude' and 'Longitude'
coordinates = data_df[['Latitude', 'Longitude']].values

# Create a KMeans clustering model with 100 clusters
kmeans = KMeans(n_clusters=500, random_state=0).fit(coordinates)

# Add cluster labels to the original dataframe
df['Cluster'] = kmeans.labels_

ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required by KMeans.

In [47]:
df.head()

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,Plate Expiry Date,VIN,Make,Body Style,...,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude,Agency Description,Color Description,Body Style Description,Issue Hour


In [13]:
# df.to_csv('../cleaned_2022_parking_citation2.csv', index=False)


In [44]:
# The label is imbalanced
df['Violation Description'].value_counts()

NO PARK/STREET CLEAN    1291019
METER EXP.               847889
RED ZONE                 561345
PREFERENTIAL PARKING     372800
DISPLAY OF TABS          299580
                         ...   
PK ON PRIV ST                 1
METER EXPIRED                 1
22500I                        1
REPAIRING VEH/STREET          1
8009H                         1
Name: Violation Description, Length: 88, dtype: int64

In [90]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import time
from sklearn.utils import class_weight

# Calculate class weights
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)


start_time = time.time()

# Check cuda
print(torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)


# 1. Data Preparation
# ---------------------
data = pd.read_csv("../cleaned_2022_parking_citation2.csv")
features = [ 'RP State Plate', 'Make', 'Body Style', 'Color Description', 'Agency Description', 'Issue Hour', 'Cluster']
X = data[features]

# One-hot encode all features
ohe = OneHotEncoder(sparse=False, dtype=int)
X_encoded = ohe.fit_transform(X)
encoded_columns = ohe.get_feature_names_out(features)  # Get column names for the encoded columns
X = pd.DataFrame(X_encoded, columns=encoded_columns)

y = LabelEncoder().fit_transform(data['Violation Description'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate class weights
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
batch_size = 256
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 2. Model Definition
# ---------------------
class SimpleNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.layer3(x)
        return x

model = SimpleNN(X_train.shape[1], len(set(y_train))).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.002)

# 3. Training
# ------------
epochs = 10
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

    # 4. Evaluation
    # ---------------
    with torch.no_grad():
        y_pred = model(X_train_tensor)
        _, predicted = torch.max(y_pred, 1)
        accuracy = (predicted == y_train_tensor).sum().item() / len(y_train_tensor)
        print(f"Training Accuracy: {accuracy}")

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total execution time: {elapsed_time} seconds")

True
cuda:0




Epoch 1/10, Loss: 1.9293214082717896
Training Accuracy: 0.41698695931874435
Epoch 2/10, Loss: 1.6889859437942505
Training Accuracy: 0.42009175362271434
Epoch 3/10, Loss: 2.048356294631958
Training Accuracy: 0.42744758104085573
Epoch 4/10, Loss: 1.808008074760437
Training Accuracy: 0.4295431127539171
Epoch 5/10, Loss: 1.8518352508544922
Training Accuracy: 0.42972574771297417
Epoch 6/10, Loss: 1.8611969947814941
Training Accuracy: 0.43094013389595803
Epoch 7/10, Loss: 1.907226800918579
Training Accuracy: 0.43274058748487126
Epoch 8/10, Loss: 2.0424554347991943
Training Accuracy: 0.43207001733669165
Epoch 9/10, Loss: 1.5820761919021606
Training Accuracy: 0.43386570060951013
Epoch 10/10, Loss: 1.8171765804290771
Training Accuracy: 0.43538402407510385
Total execution time: 165.24951720237732 seconds


In [50]:
# import xgboost as xgb
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler, LabelEncoder
# import pandas as pd

# # Load your data
# data = pd.read_csv("cleaned_2022_parking_citation.csv")

# # Preprocess the data (similar to what we did above)
# data = data.dropna(subset=['Violation Description'])
# features = ['RP State Plate', 'Make', 'Body Style', 'Color Description', 'Agency Description', 'Issue Hour']
# X = data[features]
# X = X.fillna('Unknown')

# label_encoders = {}
# for col in X.select_dtypes(include=['object']).columns:
#     le = LabelEncoder()
#     X[col] = le.fit_transform(X[col])
#     label_encoders[col] = le

# scaler = StandardScaler()
# X[['Issue time', 'Issue Hour']] = scaler.fit_transform(X[['Issue time', 'Issue Hour']])
# y = LabelEncoder().fit_transform(data['Violation Description'])

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize and train the XGBoost classifier
# clf = xgb.XGBClassifier(objective='multi:softmax', num_class=len(set(y_train)), random_state=42, use_label_encoder=False)
# clf.fit(X_train, y_train, eval_metric='mlogloss')

# # Get training accuracy (you can also get testing accuracy if you wish)
# train_accuracy = clf.score(X_train, y_train)
# print(f"Training Accuracy: {train_accuracy}")