In [13]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, f1_score



In [8]:
# Load the CSV (path is relative to the notebook's working directory).
df = pd.read_csv('PWV_processed.csv')

# Only the last expression of a cell is rendered, so a `df.head(5)` placed
# before this line would be evaluated and thrown away; show the frame once.
df

Unnamed: 0,_id,case_no,status_dttm,status,code,value,description,violation_stno,violation_street,violation_city,violation_zip,ward,contact_addr1,sam_id,latitude,longitude
0,1,CE825063,2025-03-28 13:11:00,Open,3,100.0,overfilling of barrel/dumpster,3589,Washington,Jamaica Plain,2130.0,11.0,100 GRANDVIEW RD STE #207,144961.0,42.303162,-71.110952
1,2,CE825062,2025-03-28 13:08:00,Open,1,25.0,improper storage trash,59,Pleasant,Dorchester,2125.0,13.0,59 PLEASANT ST,355593.0,42.314967,-71.060221
2,3,CE825061,2025-03-28 13:03:00,Open,1,25.0,improper storage trash,50,Wenham,Jamaica Plain,2130.0,19.0,52 WENHAM ST,398565.0,42.297070,-71.114531
3,4,CE825060,2025-03-28 13:00:00,Open,1,25.0,improper storage trash,48,Wenham,Jamaica Plain,2130.0,19.0,48 WENHAM ST,147686.0,42.296960,-71.114611
4,5,CE825059,2025-03-28 12:56:00,Open,1,25.0,improper storage trash,28,Morrill,Dorchester,2125.0,13.0,"685 MAIN STREET, APT 28",97509.0,42.316300,-71.061031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
533103,533758,CE262935,2015-10-12 09:01:00,Closed,1,25.0,improper storage trash,4,Foster,Boston,2109.0,3.0,14 HANOVER AV APT 6,59715.0,42.367320,-71.054500
533104,533759,CE262934,2015-10-12 08:54:00,Closed,1,25.0,improper storage trash,178,Salem,Boston,2113.0,3.0,12 LEDGEWOOD RD,121822.0,42.366520,-71.054780
533105,533760,CE262933,2015-10-12 08:48:00,Closed,1,25.0,improper storage trash,9,Hull,Boston,2113.0,3.0,72 MARGINAL ST,75942.0,42.366640,-71.055090
533106,533761,CE262932,2015-10-12 08:45:00,Closed,1,25.0,improper storage trash,10,Hull,Boston,2113.0,3.0,300 NORTH ST,75898.0,42.366480,-71.055180


In [31]:

# Build the modelling dataset: keep six violation types, impute missing
# values, encode the labels, and make a stratified 80/20 train/test split.

# 1. Define the six target classes
keep = [
    "overgrown weeds on property",
    "failure to remove snow from sidewalk",
    "illegal dumping",
    "occupying city property without permit",
    "illegal parking property owner",
    "unregistered motor vehicles"
]

# Column groups are derived from the full frame's dtypes.
numeric_cols = df.select_dtypes(np.number).columns
non_numeric_cols = df.select_dtypes(np.object_).columns

# 2. Filter your DataFrame (copy so the assignments below never touch `df`)
df_filt = df[df['description'].isin(keep)].copy()

# `fillna` returns a new object; the result has to be assigned back,
# otherwise the imputation is silently discarded.
df_filt[numeric_cols] = df_filt[numeric_cols].fillna(0)

for col in non_numeric_cols:
    # Fill first, then convert, so "" becomes a regular category level.
    df_filt[col] = df_filt[col].fillna("").astype('category')

# 3. Select features (example: numeric columns)
# `description` holds strings, so it is never part of `numeric_cols`; no
# explicit drop is needed before selecting the numeric columns.
# NOTE(review): the numeric columns appear to include row identifiers
# (`Unnamed: 0`, `_id`) and the fine amount (`value`), which looks like it is
# determined by the violation type. These probably leak the target -- confirm
# and consider excluding them before trusting any scores.
X = df_filt[numeric_cols]
y = df_filt['description']

# 4. Encode labels
le = LabelEncoder()
y_enc = le.fit_transform(y)

# 5. Train/test split (stratified so each class keeps its share in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, stratify=y_enc, test_size=0.2, random_state=42
)


In [32]:
# Sanity-check the split: feature-matrix shapes first, then label counts.
print(*(features.shape for features in (X_train, X_test)))
print(*(len(labels) for labels in (y_train, y_test)))

(91115, 8) (22779, 8)
91115 22779




**Why XGBoost?**

*XGBoost* handles mixed numeric data very well, captures nonlinear feature interactions, and often achieves strong accuracy out-of-the-box on tabular problems.

In [33]:
# Train a gradient-boosted tree classifier and evaluate it on the hold-out set.
# `use_label_encoder` is deliberately not passed: the installed XGBoost ignores
# it and prints a 'Parameters: { "use_label_encoder" } are not used.' warning.
# The labels are already integer-encoded by `le`.
xgb = XGBClassifier(
    eval_metric='mlogloss',
    enable_categorical=True,   # allow pandas categorical columns
    random_state=42
)
xgb.fit(X_train, y_train)

# predict & report
y_pred_xgb = xgb.predict(X_test)

# NOTE(review): a perfect score on every class usually points to target
# leakage in the features (e.g. row ids or the fine amount) -- verify the
# feature matrix before relying on these numbers.
print("=== XGBoost Classification Report ===")
print(classification_report(y_test, y_pred_xgb, target_names=le.classes_))

# Weighted F1: per-class F1 averaged with class support as weights.
w_f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')
print(f"Weighted F1 (XGBoost): {w_f1_xgb:.4f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== XGBoost Classification Report ===
                                        precision    recall  f1-score   support

  failure to remove snow from sidewalk       1.00      1.00      1.00      6622
                       illegal dumping       1.00      1.00      1.00      2944
        illegal parking property owner       1.00      1.00      1.00      1369
occupying city property without permit       1.00      1.00      1.00      1395
           overgrown weeds on property       1.00      1.00      1.00      9511
           unregistered motor vehicles       1.00      1.00      1.00       938

                              accuracy                           1.00     22779
                             macro avg       1.00      1.00      1.00     22779
                          weighted avg       1.00      1.00      1.00     22779

Weighted F1 (XGBoost): 1.0000


**Why Naïve Bayes?**

*Naïve Bayes* is extremely fast, simple to train, and serves as a solid baseline—despite its “feature-independence” assumption, it can perform surprisingly well when classes are reasonably separable.

In [34]:
# Fit a Gaussian Naive Bayes baseline; `fit` returns the estimator itself,
# so construction and training can be chained.
nb = GaussianNB().fit(X_train, y_train)

# Predict on the hold-out set.
y_pred_nb = nb.predict(X_test)

# Per-class precision / recall / F1, labelled with the original class names.
nb_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_nb,
    target_names=le.classes_,
)
print("=== Naïve Bayes Classification Report ===")
print(nb_report)

# Support-weighted F1 as a single summary number.
w_f1_nb = f1_score(y_true=y_test, y_pred=y_pred_nb, average='weighted')
print(f"Weighted F1 (Naïve Bayes): {w_f1_nb:.4f}")



=== Naïve Bayes Classification Report ===
                                        precision    recall  f1-score   support

  failure to remove snow from sidewalk       0.93      0.88      0.91      6622
                       illegal dumping       0.98      1.00      0.99      2944
        illegal parking property owner       0.42      0.78      0.55      1369
occupying city property without permit       1.00      0.88      0.94      1395
           overgrown weeds on property       0.97      0.99      0.98      9511
           unregistered motor vehicles       0.32      0.01      0.01       938

                              accuracy                           0.90     22779
                             macro avg       0.77      0.76      0.73     22779
                          weighted avg       0.90      0.90      0.89     22779

Weighted F1 (Naïve Bayes): 0.8932
