In [1]:
# Dependencies
import pandas as pd
from pathlib import Path

In [4]:
# Read the source CSV straight into a DataFrame (decoded with the
# 'unicode_escape' codec) and preview the first five rows
crime_file = pd.read_csv(
    Path("Denver_Crime_Data_Source_File.csv"),
    encoding='unicode_escape',
)
crime_file.head()

Unnamed: 0,incident_id,offense_id,offense_code,offense_code_extension,offense_type_id,offense_category_id,first_occurrence_date,last_occurrence_date,reported_date,incident_address,geo_x,geo_y,geo_lon,geo_lat,district_id,precinct_id,neighborhood_id,is_crime,is_traffic,victim_count
0,202268791,202268791299900,2999,0,criminal-mischief-other,public-disorder,2/10/2022 2:50:00 AM,,2/10/2022 3:16:00 AM,1107 N SANTA FE DR,3140929.0,1692612.0,-104.99891,39.733957,1,123,lincoln-park,1,0,1
1,2021387586,2021387586299900,2999,0,criminal-mischief-other,public-disorder,7/7/2021 9:02:00 PM,,7/8/2021 12:55:00 AM,815 16TH ST,3142470.0,1697098.0,-104.993342,39.746248,6,611,cbd,1,0,1
2,2020641486,2020641486299900,2999,0,criminal-mischief-other,public-disorder,10/29/2020 1:30:00 AM,,10/29/2020 4:31:00 AM,4745 N FEDERAL BLVD,3133352.0,1710396.0,-105.02552,39.782888,1,111,berkeley,1,0,1
3,2018612468,2018612468299900,2999,0,criminal-mischief-other,public-disorder,9/6/2018 5:00:00 PM,9/6/2018 11:00:00 PM,9/7/2018 9:58:00 AM,65 S FEDERAL BLVD,3133534.0,1685797.0,-105.02533,39.715357,4,411,barnum,1,0,1
4,2020293614,2020293614299900,2999,0,criminal-mischief-other,public-disorder,5/8/2020 5:00:00 AM,5/8/2020 6:30:00 PM,5/13/2020 10:00:00 AM,12295 E ALBROOK DR,3184065.0,1710782.0,-104.845074,39.783082,5,521,montbello,1,0,1


In [13]:
# Keep only the occurrence timestamp and the offense category columns
reduced_crime_data_df = crime_file.loc[
    :, ['first_occurrence_date', 'offense_category_id']
]

# Print the column summary, then show the first five rows
reduced_crime_data_df.info()
reduced_crime_data_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386865 entries, 0 to 386864
Data columns (total 2 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   first_occurrence_date  386865 non-null  object
 1   offense_category_id    386865 non-null  object
dtypes: object(2)
memory usage: 5.9+ MB


Unnamed: 0,first_occurrence_date,offense_category_id
0,2/10/2022 2:50:00 AM,public-disorder
1,7/7/2021 9:02:00 PM,public-disorder
2,10/29/2020 1:30:00 AM,public-disorder
3,9/6/2018 5:00:00 PM,public-disorder
4,5/8/2020 5:00:00 AM,public-disorder


In [15]:
# Build the cleaned frame in one chain so the cell is idempotent on re-run:
#   1. Rename columns for readability.
#   2. Label missing offense categories as 'missing'. Only the Offense column
#      is filled: writing the string 'missing' into Date would make
#      pd.to_datetime raise on the first blank date. Blank dates are left as
#      nulls and come out of the conversion as NaT.
#   3. Parse the occurrence timestamp and keep only the calendar date
#      (.dt.date yields Python date objects, so the column dtype stays object).
cleaned_crime_data_df = (
    reduced_crime_data_df
    .rename(columns={
        'offense_category_id': 'Offense',
        'first_occurrence_date': 'Date',
    })
    .assign(
        Offense=lambda df: df['Offense'].fillna('missing'),
        Date=lambda df: pd.to_datetime(df['Date']).dt.date,
    )
)

# Display DataFrame info and first 5 rows
cleaned_crime_data_df.info()
cleaned_crime_data_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386865 entries, 0 to 386864
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Date     386865 non-null  object
 1   Offense  386865 non-null  object
dtypes: object(2)
memory usage: 5.9+ MB


Unnamed: 0,Date,Offense
0,2022-02-10,public-disorder
1,2021-07-07,public-disorder
2,2020-10-29,public-disorder
3,2018-09-06,public-disorder
4,2020-05-08,public-disorder


In [8]:
# List the distinct offense categories. The cleaning step names this column
# 'Offense' (renamed from 'offense_category_id'), so it is looked up under
# that name; 'Offense Category' does not exist and raises KeyError.
cleaned_crime_data_df['Offense'].unique()

array(['public-disorder', 'drug-alcohol', 'sexual-assault',
       'other-crimes-against-persons', 'all-other-crimes',
       'white-collar-crime', 'murder', 'robbery', 'aggravated-assault',
       'arson', 'burglary', 'larceny', 'theft-from-motor-vehicle',
       'auto-theft'], dtype=object)

In [16]:
# Order rows by Date, earliest first; ignore_index=True renumbers the
# index 0..n-1 in the same step instead of a separate reset_index call
sorted_crime_data_df = cleaned_crime_data_df.sort_values('Date', ignore_index=True)

# head(-1) returns every row except the last one
sorted_crime_data_df.head(-1)

Unnamed: 0,Date,Offense
0,2018-01-02,larceny
1,2018-01-02,public-disorder
2,2018-01-02,larceny
3,2018-01-02,drug-alcohol
4,2018-01-02,all-other-crimes
...,...,...
386859,2023-10-03,public-disorder
386860,2023-10-03,public-disorder
386861,2023-10-03,aggravated-assault
386862,2023-10-03,aggravated-assault


In [17]:
# Write the date-sorted data to CSV, keeping the header row and
# leaving out the DataFrame index column
sorted_crime_data_df.to_csv(
    'denver_crime_data_export.csv',
    header=True,
    index=False,
)