In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [2]:
crime_data=pd.read_csv("/Users/yani/Desktop/ML_Project_Yani/Crime_Data.csv")

In [5]:
crime_data.columns

Index(['RecordID', 'Offense', 'IncidentID', 'BlockNumber', 'StreetName',
       'Agency', 'DateReported', 'HourReported', 'ReportingOfficer'],
      dtype='object')

Developing a Category Map to Assign Offense to Overarching Topic

In [18]:
category_map = {
    'Assault & Violent Crime': ['assault', 'battery', 'homicide', 'murder', 'rape', 'sex offense', 'threats', 'robbery','harassment','stalking','kidnap','prowler','extortion'],
    'Theft & Larceny': ['larceny', 'shoplifting', 'stolen', 'theft', 'embezzlement','burglary'],
    'Property Damage': ['vandalism', 'graffiti', 'damage', 'arson'],
    'Narcotics/Drug Offenses': ['narcotics', 'drug', 'marijuana', 'possession', 'distribute','driving under the influence','towed vehicle'],
    'Motor Vehicle/Traffic': ['traffic', 'dui', 'dwi', 'reckless', 'unlicensed', 'accident','hit and run'],
    'Weapon Offenses': ['weapon', 'firearm', 'gun', 'shots fired', 'illegal hunting'],
    'Public Order/Disorderly': ['disorderly', 'public intoxication', 'trespassing', 'loitering', 'noise','domestic disturbance','disturbarance - non domestic','trespass','prostitution'],
    'Fraud & Financial': ['fraud', 'forgery', 'counterfeit', 'scam', 'financial','false report of crime'],
    'Mental Health/Welfare': ['mental', 'tdo', 'eco', 'suicide', 'assist citizen','missing person','runaway','crisis assessment','juvenile investigation']
}

In [19]:
def assign_category(offense_type):
    """
    Checks the offense type against defined keywords and assigns a category.
    Returns 'Other' if no match is found.
    """
    # Convert to lowercase for case-insensitive matching
    offense_lower = offense_type.lower()
    
    # Iterate through categories and their keywords
    for category, keywords in category_map.items():
        if any(keyword in offense_lower for keyword in keywords):
            return category
    
    return 'Other'

In [20]:
crime_data['Offense_cat'] = crime_data['Offense'].apply(assign_category)

print(crime_data[['RecordID', 'Offense', 'Offense_cat']])

       RecordID                          Offense              Offense_cat
0             1  Assist Citizen - Mental/TDO/ECO    Mental Health/Welfare
1             2      Shots Fired/Illegal Hunting          Weapon Offenses
2             3    Sex Offense - Forcible Sodomy  Assault & Violent Crime
3             4                        Vandalism          Property Damage
4             5            Larceny - Shoplifitng          Theft & Larceny
...         ...                              ...                      ...
25507     25508                      Hit and Run    Motor Vehicle/Traffic
25508     25509                        Narcotics  Narcotics/Drug Offenses
25509     25510      Shots Fired/Illegal Hunting          Weapon Offenses
25510     25511  Assist Citizen - Mental/TDO/ECO    Mental Health/Welfare
25511     25512             Robbery - Strong Arm  Assault & Violent Crime

[25512 rows x 3 columns]


In [21]:
other_offenses=crime_data[crime_data["Offense_cat"]=="Other"]

In [22]:
other_offenses.count()

RecordID            5541
Offense             5541
IncidentID          5541
BlockNumber         5173
StreetName          5541
Agency              5541
DateReported        5541
HourReported        5541
ReportingOfficer    5538
Offense_cat         5541
dtype: int64

In [23]:
unique_other_offenses = other_offenses['Offense'].unique()
print(unique_other_offenses)

['Lost/FoundProperty' 'Warrant Service' 'Animal Complaint'
 'Suspicious Activity' 'Embezzelment' 'Misc - Criminal Call'
 'Assist Agency - Arrest/warrant' 'Death Investigation - DOA'
 'Pornography' 'Drunkeness DIP' 'Comm Relations Initiative - CRI'
 'Assist Agency - Backup/Assist' 'Crime Prevent Initiative - CPI'
 'Disturbance - Non Domestic' 'Family Offense - Non-Violent'
 'Animal - Bite' 'Phone Calls - Threat or Obscene' 'Assist Agency - Other'
 'Misc - Non-Criminal Call' 'Unauthorized Use of Motor Veh'
 'Suspicious Vehicle' 'Suspicious Person' 'Problem Solving Project - PSP'
 'Phone Calls - Annoying' 'Bomb Threat' 'Fireworks Violation'
 'Child Ab - Contr to Delinquency' 'Impersonating a Police Officer'
 'Littering/Illegal Dumping' 'Computer Crime' 'Alarm - True'
 'Fire Code Violation' '911 Investigation' 'Liquor Law Violation'
 'Tactical Crime Initiative - TCI' 'Stop w/o Search/Frisk'
 'COVID-19 Tracking']


In [24]:
crime_data.to_csv('crime_data.csv', index=False)

In [25]:
import os
print(os.getcwd())

/Users/yani/Desktop/ML_Project_Yani
