## Re-Engineering Raw Crime Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
# set random seed
SEED = 321
np.random.seed(SEED)

In [3]:
# Set options
pd.set_option('display.max_columns',100)
# Customize figure style for stakeholder-facing visualizations
plt.style.use(('ggplot','fivethirtyeight'))
sns.set_context('notebook', font_scale=1.2)
plt.rcParams['figure.figsize'] = (12,4)
plt.rcParams['savefig.transparent'] = False
plt.rcParams['savefig.bbox'] = 'tight'


In [4]:
df = pd.read_csv("Data/Part_1_Crime_Data.csv",
                 low_memory=False)
## Converting crimedatetime and coercing errors
df['CrimeDateTime'] = pd.to_datetime(df['CrimeDateTime'], errors='coerce')
df = df.dropna(subset=['CrimeDateTime'])
## setting and sorting the index
df = df.set_index("CrimeDateTime")
df = df.sort_index()
df.head(3)


Unnamed: 0_level_0,RAPE,LARCENY,AGG. ASSAULT,LARCENY FROM AUTO,BURGLARY,COMMON ASSAULT,AUTO THEFT,ROBBERY,ROBBERY - COMMERCIAL,ARSON,ROBBERY - CARJACKING,HOMICIDE,SHOOTING
CrimeDateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-01 00:00:00+00:00,5.0,39,28,27,21,41,15,5,2,1.0,,,
2011-01-02 00:00:00+00:00,2.0,20,6,20,19,14,5,10,1,0.0,,,
2011-01-03 00:00:00+00:00,1.0,19,13,16,21,15,7,9,1,0.0,,,


In [5]:
## baltimore is in eastern time zone,but can just use tz-naive index by choosing None
df = df.tz_convert(None)
df.head(3)


Unnamed: 0_level_0,RAPE,LARCENY,AGG. ASSAULT,LARCENY FROM AUTO,BURGLARY,COMMON ASSAULT,AUTO THEFT,ROBBERY,ROBBERY - COMMERCIAL,ARSON,ROBBERY - CARJACKING,HOMICIDE,SHOOTING
CrimeDateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-01,5.0,39,28,27,21,41,15,5,2,1.0,,,
2011-01-02,2.0,20,6,20,19,14,5,10,1,0.0,,,
2011-01-03,1.0,19,13,16,21,15,7,9,1,0.0,,,


In [6]:
## Drop extra columns not needed
drop_cols = ['RowID','CCNO','Post','GeoLocation', 'Location','Premise']
df = df.drop(columns=drop_cols)
df.info()


KeyError: "['RowID', 'CCNO', 'Post', 'GeoLocation', 'Location', 'Premise'] not found in axis"