In [1]:
import requests
import pandas as pd
import numpy as np
import re
import string
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [2]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\surya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\surya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\surya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
API_URL = "https://data.cityofnewyork.us/resource/erm2-nwe9.json"
LIMIT = 10000  # safe size for NLP

params = {
    "$limit": LIMIT,
    "$select": "complaint_type,descriptor,agency"
}

response = requests.get(API_URL, params=params)
data = response.json()

df = pd.DataFrame(data)
df.head()


Unnamed: 0,complaint_type,descriptor,agency
0,LinkNYC,Damaged/Defective,DOITT
1,Sanitation Condition,12 Dead Animals,DSNY
2,Other Enforcement,E6 Commercial Waste Disposal,DSNY
3,Highway Condition,Crash Cushion Defect,DOT
4,Other Enforcement,E3B Sidewalk Obstruction,DSNY


In [4]:
df = df.dropna(subset=["complaint_type", "descriptor"])
df = df.reset_index(drop=True)

print("Total records:", len(df))
print(df["complaint_type"].value_counts().head())


Total records: 9585
complaint_type
Street Condition                       2502
Graffiti                               1204
Encampment                             1182
Request Large Bulky Item Collection     726
Derelict Vehicles                       386
Name: count, dtype: int64


In [5]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.strip()
    
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    
    return " ".join(words)

df["clean_text"] = df["descriptor"].apply(clean_text)
df[["descriptor", "clean_text"]].head()


Unnamed: 0,descriptor,clean_text
0,Damaged/Defective,damageddefective
1,12 Dead Animals,dead animal
2,E6 Commercial Waste Disposal,e commercial waste disposal
3,Crash Cushion Defect,crash cushion defect
4,E3B Sidewalk Obstruction,eb sidewalk obstruction


In [8]:
# Remove rare complaint types (less than 2 samples)
class_counts = df["complaint_type"].value_counts()
valid_classes = class_counts[class_counts >= 2].index

df = df[df["complaint_type"].isin(valid_classes)].reset_index(drop=True)

print("Remaining records:", len(df))
print("Remaining classes:", df["complaint_type"].nunique())


Remaining records: 9565
Remaining classes: 110


In [9]:
X = df["clean_text"]
y = df["complaint_type"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [10]:
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [11]:
model = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)

model.fit(X_train_tfidf, y_train)


In [12]:
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.9304756926293779

Classification Report:

                                         precision    recall  f1-score   support

                      Abandoned Vehicle       1.00      1.00      1.00         8
                         Adopt-A-Basket       1.00      1.00      1.00        10
            Animal Facility - No Permit       0.00      0.00      0.00         1
                           Animal-Abuse       0.00      0.00      0.00         1
                       BEST/Site Safety       0.00      0.00      0.00         1
                       Blocked Driveway       1.00      1.00      1.00        12
                       Bridge Condition       1.00      0.14      0.25         7
                   Broken Parking Meter       1.00      0.97      0.98        31
                           Building/Use       0.00      0.00      0.00         1
             Bus Stop Shelter Complaint       0.00      0.00      0.00         1
                           Damaged Tree       0.93    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
import shutil
import os

TARGET_DIR = r"E:\citysense360\models"
os.makedirs(TARGET_DIR, exist_ok=True)

shutil.move("complaint_classifier.pkl", os.path.join(TARGET_DIR, "complaint_classifier.pkl"))
shutil.move("tfidf_vectorizer.pkl", os.path.join(TARGET_DIR,"tfidf_vectorizer.pkl"))


Model and vectorizer saved successfully


In [14]:
def predict_complaint(text):
    clean = clean_text(text)
    vec = tfidf.transform([clean])
    return model.predict(vec)[0]

sample_text = "Street light is not working for the past two days"
print("Predicted Category:", predict_complaint(sample_text))


Predicted Category: Street Light Condition
