# NYC Crime Data Cleaning

In [1]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

In [2]:
path = 'C:/Users/Zaca/Documents/Datasets/nyc/'

In [3]:
# To save memory:
# I have already pre-preared a text file containing the columns I think are most relevant.
selected_cols = pd.read_csv(path + 'crime_selected_cols.txt')
selected_cols

Unnamed: 0,name,description,rename
0,CMPLNT_FR_DT,Exact date of occurrence for the reported event,date
1,CMPLNT_FR_TM,Exact time of occurrence for the reported event,time
2,ADDR_PCT_CD,The precinct in which the incident occurred,precinct
3,KY_CD,Three digit offense classification code,class_code
4,OFNS_DESC,Description of offense corresponding with key ...,description
5,LAW_CAT_CD,Level of offense: felony misdemeanor violation,level
6,BORO_NM,The name of the borough in which the incident ...,borough
7,PREM_TYP_DESC,Specific description of premises (grocery stor...,premises
8,Lat_Lon,Geospatial Location Point (latitude and Longit...,geo


In [4]:
crime = pd.read_csv(path + 'nypd_historic.csv', usecols=selected_cols['name'])

In [5]:
# change column names
crime.columns = selected_cols['rename']

In [6]:
# re-checking the size of our dataset
crime.shape

(6847944, 9)

In [7]:
# look at dtypes
crime.dtypes

rename
date            object
time            object
precinct       float64
class_code       int64
description     object
level           object
borough         object
premises        object
geo             object
dtype: object

In [8]:
# change to appropriate dtypes
crime.date = pd.to_datetime(crime.date, errors='coerce')

In [9]:
crime.time = pd.to_datetime(crime.time, errors='coerce').dt.hour

In [10]:
# I have tons of data, might as well just drop nas.
crime.isna().sum()
crime.dropna(inplace=True)

In [11]:
crime.time = crime.time.astype('int64')

In [12]:
# clean / filter data by complete years
crime = crime[(crime.date > '01-01-2007') & (crime.date < '01-01-2019')]

In [13]:
# transform precinct column to int
crime['precinct'] = crime.precinct.astype('int64')

In [14]:
# lets clean up the categorical data
# get top crimes
crime['description'].value_counts()[:25].sum()/crime.shape[0]

0.9845345980221958

In [15]:
crime.description.value_counts()[:60]

PETIT LARCENY                           984505
HARRASSMENT 2                           733522
ASSAULT 3 & RELATED OFFENSES            623550
CRIMINAL MISCHIEF & RELATED OF          594719
GRAND LARCENY                           505759
DANGEROUS DRUGS                         369783
OFF. AGNST PUB ORD SENSBLTY &           310818
FELONY ASSAULT                          227046
ROBBERY                                 216401
BURGLARY                                203927
MISCELLANEOUS PENAL LAW                 148146
DANGEROUS WEAPONS                       138989
OFFENSES AGAINST PUBLIC ADMINI          118389
GRAND LARCENY OF MOTOR VEHICLE          103253
INTOXICATED & IMPAIRED DRIVING           82341
VEHICLE AND TRAFFIC LAWS                 72680
CRIMINAL TRESPASS                        69732
SEX CRIMES                               66807
THEFT-FRAUD                              61160
FORGERY                                  58875
FRAUDS                                   37152
POSSESSION OF

In [16]:
crime.to_csv(path + 'nypd_historic_07-18.csv')