In [10]:
import pandas as pd
from csvvalidator import *
import datetime as DT
import numpy as np

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [6]:
#Validate data
field_names = (
               'Ticket number',
               'Issue Date',
               'Latitude',
               'Longitude',
               'Issue time',
               'Violation Description'
               )
validator = CSVValidator(field_names)
# basic header and record length checks
validator.add_header_check('EX1', 'bad header')
validator.add_record_length_check('EX2', 'unexpected record length')

In [7]:
df = pd.read_csv('../parking_citation_2020_2022.csv')

  df = pd.read_csv('../parking_citation_2020_2022.csv')


In [8]:
#updating formatting so that I can translate issue date to datetime
df['Issue Date'] = df[df['Issue Date'].notnull()]['Issue Date'].apply(lambda x: x.split('T')[0])
df['Issue Date'] = pd.to_datetime(df['Issue Date'], infer_datetime_format=True)

In [11]:
#pad anything that is less than 4 digits then isolate just the hours
df['Issue time'] = df['Issue time'].astype(str)
df['Issue time'] = df['Issue time'].apply(lambda x: x.split('.')[0])
df['Issue time'] = df[df['Issue time'].notnull()]['Issue time'].apply(lambda x: x.zfill(4))
df['Issue Hour'] = df[df['Issue time']!='0nan']['Issue time'].apply(lambda x: DT.datetime.strptime(x,'%H%M').hour)

#clean lat lon
df['Latitude'] = np.where(df['Latitude']==99999.000, np.nan, df['Latitude'])
df['Longitude'] = np.where(df['Longitude']==99999.000, np.nan, df['Longitude'])

#string for ticketnum
df['Ticket number'] = df['Ticket number'].astype(str)

In [12]:
#Updating the Lat Lon
import pyproj
pm = '+proj=lcc +lat_1=34.03333333333333 +lat_2=35.46666666666667 +lat_0=33.5 +lon_0=-118 +x_0=2000000 +y_0=500000.0000000002 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs'
x1m,y1m = df['Latitude'].values, df['Longitude'].values
x2m,y2m = pyproj.transform(pyproj.Proj(pm,preserve_units = True), pyproj.Proj("+init=epsg:4326"), x1m,y1m)
df['Latitude']=x2m
df['Longitude']=y2m

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  x2m,y2m = pyproj.transform(pyproj.Proj(pm,preserve_units = True), pyproj.Proj("+init=epsg:4326"), x1m,y1m)


In [None]:
# df = pd.read_csv("../cleaned_2022_parking_citation.csv")

In [13]:
df.columns

Index(['Ticket number', 'Issue Date', 'Issue time', 'Meter Id', 'Marked Time',
       'RP State Plate', 'Plate Expiry Date', 'VIN', 'Make', 'Body Style',
       'Color', 'Location', 'Route', 'Agency', 'Violation code',
       'Violation Description', 'Fine amount', 'Latitude', 'Longitude',
       'Agency Description', 'Color Description', 'Body Style Description',
       'Issue Hour'],
      dtype='object')

In [14]:
df['Body Style Description'].value_counts(), df['Body Style'].value_counts()

(PASSENGER CAR    4806029
 PICK-UP TRUCK     165537
 VAN                91295
 COMMERCIAL         76050
 TRUCK              67966
 TRAILER            38114
 MOTOR HOME         20603
 BUS                 1680
 LIMOUSINE            239
 Name: Body Style Description, dtype: int64,
 PA    4806029
 PU     165537
 VN      91295
 TK      67966
 CM      64129
        ...   
 RB          1
 02          1
 FB          1
 RX          1
 AU          1
 Name: Body Style, Length: 133, dtype: int64)

In [15]:
df.shape

(5308775, 23)

In [16]:
data_df = df[[ 'RP State Plate', 'Make', 'Body Style', 'Color Description', 'Agency Description', 'Issue Hour', 'Latitude', 'Longitude', 'Violation Description']]
data_df.head()

Unnamed: 0,RP State Plate,Make,Body Style,Color Description,Agency Description,Issue Hour,Latitude,Longitude,Violation Description
0,CA,MAZD,PA,GREY,54 - DOT - HOLLYWOOD,13.0,-118.34,34.1,WHITE ZONE
1,CA,LEXS,PA,BLACK,51 - DOT - WESTERN,16.0,-118.471,33.984,
2,CA,MERZ,PA,BLACK,51 - DOT - WESTERN,16.0,-118.354,34.062,
3,CA,PORS,PA,BLACK,55 - DOT - SOUTHERN,2.0,-118.335,34.033,STANDNG IN ALLEY
4,CA,RROV,PA,GREY,51 - DOT - WESTERN,1.0,-118.396,34.027,PARKED ON SIDEWALK


In [17]:
data_df = data_df[~data_df.isna().any(axis=1)]
data_df.shape

(4570474, 9)

In [18]:
from sklearn.cluster import KMeans

# Assuming your dataframe is named 'data' and has columns 'Latitude' and 'Longitude'
coordinates = data_df[['Latitude', 'Longitude']].values

# Create a KMeans clustering model with 100 clusters
kmeans = KMeans(n_clusters=500, random_state=0).fit(coordinates)

# Add cluster labels to the original dataframe
data_df['Cluster'] = kmeans.labels_



KeyboardInterrupt: 

In [None]:
data_df.head()

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,Plate Expiry Date,VIN,Make,Body Style,...,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude,Agency Description,Color Description,Body Style Description,Issue Hour


In [19]:
# data_df.to_csv('../cleaned_2020_2022_parking_citation.csv', index=False)
data_df = pd.read_csv('../cleaned_2020_2022_parking_citation.csv')
data_df.head()

Unnamed: 0,RP State Plate,Make,Body Style,Color Description,Agency Description,Issue Hour,Latitude,Longitude,Violation Description,Cluster
0,CA,MAZD,PA,GREY,54 - DOT - HOLLYWOOD,13.0,-118.34,34.1,WHITE ZONE,274
1,CA,PORS,PA,BLACK,55 - DOT - SOUTHERN,2.0,-118.335,34.033,STANDNG IN ALLEY,27
2,CA,RROV,PA,GREY,51 - DOT - WESTERN,1.0,-118.396,34.027,PARKED ON SIDEWALK,231
3,CA,VOLK,PA,GREY,56 - DOT - CENTRAL,1.0,-118.213,34.043,RED ZONE,67
4,CA,HOND,PA,BLACK,53 - DOT - VALLEY,2.0,-118.446,34.229,PREFERENTIAL PARKING,399


In [20]:
# The label is imbalanced
data_df['Violation Description'].value_counts()

NO PARK/STREET CLEAN    1193214
METER EXP.               831204
RED ZONE                 525621
PREFERENTIAL PARKING     360718
DISPLAY OF TABS          276368
                         ...   
PARKNG IN TUNNEL              1
PK ON PRIV ST                 1
REPAIRING VEH/STREET          1
OVERNIGHT PARKING             1
ON RR TRACK/7.5 FT            1
Name: Violation Description, Length: 85, dtype: int64

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import precision_score, recall_score
# from sklearn.utils import class_weight
import time

start_time = time.time()

# Check cuda
print(torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)


# 1. Data Preparation
# ---------------------
data = pd.read_csv("../cleaned_2020_2022_parking_citation.csv")
features = [ 'RP State Plate', 'Make', 'Body Style', 'Color Description', 'Agency Description', 'Issue Hour', 'Cluster']
X = data[features]

# One-hot encode all features
ohe = OneHotEncoder(sparse=False, dtype=int)
X_encoded = ohe.fit_transform(X)
encoded_columns = ohe.get_feature_names_out(features)  # Get column names for the encoded columns
X = pd.DataFrame(X_encoded, columns=encoded_columns)

# Extract target variable and encode it
y = LabelEncoder().fit_transform(data['Violation Description'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Now, after the split, calculate the class weights
# class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
batch_size = 256
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 2. Model Definition
# ---------------------
class SimpleNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)  # New layer
        self.layer4 = nn.Linear(32, output_dim)  # Modified from layer3 to layer4

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = torch.relu(self.layer3(x))  # Activation for the new layer
        x = self.layer4(x)  # Final layer
        return x

model = SimpleNN(X_train.shape[1], len(set(y_train))).to(device)
# criterion = nn.CrossEntropyLoss(weight=class_weights)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Initializing lists to store precision and recall values for each epoch
epoch_precisions = []
epoch_recalls = []
epochs = 10
# 3. Training
# ------------
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

    # 4. Evaluation
    # ---------------
    with torch.no_grad():
        y_pred = model(X_train_tensor)
        _, predicted = torch.max(y_pred, 1)
        
        # Convert tensors to numpy arrays for sklearn metrics
        true_labels = y_train_tensor.cpu().numpy()
        pred_labels = predicted.cpu().numpy()
        
        # Calculate precision and recall
        precision = precision_score(true_labels, pred_labels, average='macro')
        recall = recall_score(true_labels, pred_labels, average='macro')
        epoch_precisions.append(precision)
        epoch_recalls.append(recall)

        accuracy = (predicted == y_train_tensor).sum().item() / len(y_train_tensor)
        print(f"Training Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total execution time: {elapsed_time:.2f} seconds")

True
cuda:0




Epoch 1/10, Loss: 2.056368350982666


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.4677, Precision: 0.1139, Recall: 0.0820
Epoch 2/10, Loss: 1.9851702451705933


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.4759, Precision: 0.1453, Recall: 0.0965
Epoch 3/10, Loss: 3.128213882446289


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.4795, Precision: 0.1472, Recall: 0.0981
Epoch 4/10, Loss: 2.023550271987915


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.4786, Precision: 0.1483, Recall: 0.0996
Epoch 5/10, Loss: 2.3885669708251953


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.4815, Precision: 0.1482, Recall: 0.0982
Epoch 6/10, Loss: 2.1242730617523193


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.4803, Precision: 0.1527, Recall: 0.0955
Epoch 7/10, Loss: 1.4518883228302002


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.4830, Precision: 0.1457, Recall: 0.1003
Epoch 8/10, Loss: 2.7058959007263184


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.4823, Precision: 0.1532, Recall: 0.0995
Epoch 9/10, Loss: 2.7009828090667725


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.4835, Precision: 0.1734, Recall: 0.1033
Epoch 10/10, Loss: 2.162100315093994


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.4829, Precision: 0.1758, Recall: 0.1015
Total execution time: 178.37 seconds


In [50]:
# import xgboost as xgb
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler, LabelEncoder
# import pandas as pd

# # Load your data
# data = pd.read_csv("cleaned_2022_parking_citation.csv")

# # Preprocess the data (similar to what we did above)
# data = data.dropna(subset=['Violation Description'])
# features = ['RP State Plate', 'Make', 'Body Style', 'Color Description', 'Agency Description', 'Issue Hour']
# X = data[features]
# X = X.fillna('Unknown')

# label_encoders = {}
# for col in X.select_dtypes(include=['object']).columns:
#     le = LabelEncoder()
#     X[col] = le.fit_transform(X[col])
#     label_encoders[col] = le

# scaler = StandardScaler()
# X[['Issue time', 'Issue Hour']] = scaler.fit_transform(X[['Issue time', 'Issue Hour']])
# y = LabelEncoder().fit_transform(data['Violation Description'])

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize and train the XGBoost classifier
# clf = xgb.XGBClassifier(objective='multi:softmax', num_class=len(set(y_train)), random_state=42, use_label_encoder=False)
# clf.fit(X_train, y_train, eval_metric='mlogloss')

# # Get training accuracy (you can also get testing accuracy if you wish)
# train_accuracy = clf.score(X_train, y_train)
# print(f"Training Accuracy: {train_accuracy}")