In [None]:
#### Import the libraries needed
import pickle
import dill
import json
import glob
import os
from pathlib import Path
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy import stats

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## **Load Data**

In [None]:
# Load jan2010_may2018.csv, jun2018_dec2018.csv and jan2019_jul2023.csv.
# 'Beat' is forced to str in every file; 'Block Range' is forced to str in the first two.
df_jan2010_may2018 = pd.read_csv(
    'data/jan2010_may2018.csv',
    dtype={'Beat': str, 'Block Range': str},
)
df_jun2018_dec2018 = pd.read_csv(
    'data/jun2018_dec2018.csv',
    dtype={'Beat': str, 'Block Range': str},
)
df_jan2019_jul2023 = pd.read_csv(
    'data/jan2019_jul2023.csv',
    dtype={'Beat': str},
)

## **Clean df_jan2010_may2018**

In [None]:
# Preview the first rows of the raw Jan 2010 - May 2018 extract
df_jan2010_may2018.head()

Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,Block Range,Street Name,Type,Suffix,Offenses,BlockRange,StreetName,# offenses,# Of Offenses,# Offenses,# Of
0,01/16/2017,5,Theft,10H10,Commercial Parking Lot or Garage,6000-6099,CANAL,ST,-,1.0,,,,,,
1,03/31/2017,18,Rape,10H10,Residence or House,200-299,N LENOX ST,ST,-,1.0,,,,,,
2,09/03/2017,19,Burglary,10H10,Apartment,1900-1999,RUNNELS,-,-,1.0,,,,,,
3,09/03/2017,21,Aggravated Assault,10H10,"Road, Street, or Sidewalk",2000-2099,RUNNELS,-,-,1.0,,,,,,
4,09/04/2017,13,Aggravated Assault,10H10,"Road, Street, or Sidewalk",400-499,LOCKWOOD DR,-,-,1.0,,,,,,


#### Restore Point

In [None]:
# Keep an independent copy of the raw frame so it can be restored without re-reading the CSV
temp = df_jan2010_may2018.copy()

#### Copy Values from One Column to Another Column
    * 'BlockRange' --> 'Block Range'
    * 'StreetName' --> 'Street Name'
    * '# offenses' --> 'Offenses'
    * '# Of Offenses' --> 'Offenses'
    * '# Offenses' --> 'Offenses'
    * '# Of' --> 'Offenses'

In [None]:
# Check column header: keep the raw column index in a variable and display it
cols_jan2010_may2018 = df_jan2010_may2018.columns
cols_jan2010_may2018

Index(['Date', 'Hour', 'Offense Type', 'Beat', 'Premise', 'Block Range',
       'Street Name', 'Type', 'Suffix', 'Offenses', 'BlockRange', 'StreetName',
       '# offenses', '# Of Offenses', '# Offenses', '# Of'],
      dtype='object')

In [None]:
# Copy values in 'BlockRange' to 'Block Range'
# Only rows where 'Block Range' is missing are filled; rows where both columns are
# missing stay NaN. fillna aligns on the index, so this replaces the per-row Python
# loop (one label lookup per row) with a single vectorized operation.
df_jan2010_may2018['Block Range'] = (
    df_jan2010_may2018['Block Range'].fillna(df_jan2010_may2018['BlockRange'])
)

In [None]:
# Copy values in 'StreetName' to 'Street Name'
# Only rows where 'Street Name' is missing are filled; rows where both columns are
# missing stay NaN. The vectorized fillna replaces the per-row Python loop.
# `temp` is deliberately not reassigned here: it holds the restore-point copy of the raw frame.
df_jan2010_may2018['Street Name'] = (
    df_jan2010_may2018['Street Name'].fillna(df_jan2010_may2018['StreetName'])
)

In [None]:
# Copy values in '# offenses' to 'Offenses'
# Only rows where 'Offenses' is missing are filled; rows where both columns are
# missing stay NaN. The vectorized fillna replaces the per-row Python loop.
# `temp` is deliberately not reassigned here: it holds the restore-point copy of the raw frame.
df_jan2010_may2018['Offenses'] = (
    df_jan2010_may2018['Offenses'].fillna(df_jan2010_may2018['# offenses'])
)

In [None]:
# Copy values in '# Of Offenses' to 'Offenses'
# Only rows where 'Offenses' is missing are filled; rows where both columns are
# missing stay NaN. The vectorized fillna replaces the per-row Python loop.
df_jan2010_may2018['Offenses'] = (
    df_jan2010_may2018['Offenses'].fillna(df_jan2010_may2018['# Of Offenses'])
)

In [None]:
# Copy values in '# Offenses' to 'Offenses'
# Reads from '# Offenses' (capital O). The lower-case '# offenses' column is a different
# source column with its own copy step, so reading it again here would leave the
# '# Offenses' values unmerged.
# Only rows where 'Offenses' is missing are filled; rows where both are missing stay NaN.
df_jan2010_may2018['Offenses'] = (
    df_jan2010_may2018['Offenses'].fillna(df_jan2010_may2018['# Offenses'])
)

In [None]:
# Copy values in '# Of' to 'Offenses'
# Only rows where 'Offenses' is missing are filled; rows where both columns are
# missing stay NaN. The vectorized fillna replaces the per-row Python loop.
df_jan2010_may2018['Offenses'] = (
    df_jan2010_may2018['Offenses'].fillna(df_jan2010_may2018['# Of'])
)

In [None]:
# Check df_jan2010_may2018: dtypes and non-null counts per column after the copy steps
df_jan2010_may2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1053346 entries, 0 to 1053345
Data columns (total 16 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   Date           1053341 non-null  object 
 1   Hour           1053345 non-null  object 
 2   Offense Type   1053345 non-null  object 
 3   Beat           1053329 non-null  object 
 4   Premise        1019749 non-null  object 
 5   Block Range    1053188 non-null  object 
 6   Street Name    1053344 non-null  object 
 7   Type           1053345 non-null  object 
 8   Suffix         1053345 non-null  object 
 9   Offenses       1043026 non-null  float64
 10  BlockRange     334684 non-null   object 
 11  StreetName     334684 non-null   object 
 12  # offenses     334684 non-null   float64
 13  # Of Offenses  574365 non-null   float64
 14  # Offenses     10319 non-null    float64
 15  # Of           7613 non-null     float64
dtypes: float64(5), object(11)
memory usage: 128.6+ MB


#### Remove Duplicates and Drop Unnecessary Columns
    * 'BlockRange', 'StreetName', '# offenses', '# Of Offenses', '# Offenses', '# Of' --> Will be dropped

In [None]:
# Check duplicates: count rows that are identical to another row in every column
print(f"df_jan2010_may2018 contains {df_jan2010_may2018.duplicated().sum()} duplicates")
# Check all duplicates: keep=False flags every member of a duplicate group, so the
# repeated rows can be inspected side by side once sorted by 'Date'
duplicates = df_jan2010_may2018.duplicated(keep=False)
df_jan2010_may2018[duplicates].sort_values('Date')

df_jan2010_may2018 contains 3560 duplicates


Unnamed: 0,Date,Hour,Offense Type,Beat,Premise,Block Range,Street Name,Type,Suffix,Offenses,BlockRange,StreetName,# offenses,# Of Offenses,# Offenses,# Of
730092,01/21/2018,23,Rape,19G50,Apartment,9800-9899,FORUM PARK,DR,-,1.0,,,,,,
467651,01/21/2018,23,Rape,19G50,Apartment,9800-9899,FORUM PARK,DR,-,1.0,,,,,,
905,06/20/2017,18,Rape,12D20,"Road, Street, or Sidewalk",10600-10699,SABO RD,RD,-,1.0,,,,,,
635236,06/20/2017,18,Rape,12D20,"Road, Street, or Sidewalk",10600-10699,SABO RD,RD,-,1.0,,,,,,
556335,07/04/2017,20,Robbery,1A30,Drug Store or Medical Supply,1000-1099,RICHMOND,AVE,-,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282331,2016-12-19 00:00:00,13,Theft,21I10,Airport Terminal,2800-2899,TERMINAL,RD,N,1.0,2800-2899,TERMINAL,1.0,,,
364321,2017-02-26 00:00:00,17,Theft,20G60,Apartment,14800-14899,GRISBY,-,-,1.0,14800-14899,GRISBY,1.0,,,
721922,2017-02-26 00:00:00,17,Theft,20G60,Apartment,14800-14899,GRISBY,-,-,1.0,14800-14899,GRISBY,1.0,,,
108008,2017-03-03 00:00:00,6,Theft,1A40,Apartment,2400-2499,WESTHEIMER,RD,-,1.0,2400-2499,WESTHEIMER,1.0,,,


In [None]:
# Remove exact duplicate rows, then drop the legacy columns
# ('BlockRange', 'StreetName', '# offenses', '# Of Offenses', '# Offenses', '# Of').
# The two steps are chained so the column drop is applied to the de-duplicated frame
# rather than to the original df_jan2010_may2018, which would discard the de-duplication.
drop_cols = ['BlockRange', 'StreetName', '# offenses', '# Of Offenses', '# Offenses', '# Of']
cleaned_df_jan2010_may2018 = (
    df_jan2010_may2018
    .drop_duplicates()
    .drop(drop_cols, axis=1)
)

In [None]:
# Check column header: confirm only the ten consolidated columns remain
cleaned_df_jan2010_may2018.columns

Index(['Date', 'Hour', 'Offense Type', 'Beat', 'Premise', 'Block Range',
       'Street Name', 'Type', 'Suffix', 'Offenses'],
      dtype='object')

#### Extract Rows between 2010-01-01 and 2018-05-31

In [None]:
# Convert 'Date' to datetime, then check the date range
# NOTE(review): the raw column mixes 'MM/DD/YYYY' and 'YYYY-MM-DD HH:MM:SS' strings
# (both appear in the duplicates preview) — confirm pd.to_datetime parses both as
# intended on the installed pandas version.
cleaned_df_jan2010_may2018['Date'] = pd.to_datetime(cleaned_df_jan2010_may2018['Date'])
print(f"Start Date: {cleaned_df_jan2010_may2018['Date'].min()}")
print(f"End Date: {cleaned_df_jan2010_may2018['Date'].max()}")

Start Date: 1914-09-08 00:00:00
End Date: 2033-04-21 00:00:00


In [None]:
# Keep only rows dated 2010-01-01 through 2018-05-31 (both bounds inclusive);
# rows with a missing date never satisfy the comparison and are dropped.
cleaned_df_jan2010_may2018 = cleaned_df_jan2010_may2018.loc[
    cleaned_df_jan2010_may2018['Date'].between('2010-01-01', '2018-05-31')
]

#### Rename Columns
* ['Date', 'Hour', 'Offense Type', 'Beat', 'Premise', 'Block Range', 'Street Name', 'Type', 'Suffix', 'Offenses']
-->
['Occurrence Date', 'Occurrence Hour', 'NIBRS Description', 'Beat', 'Premise', 'Block Range', 'Street Name', 'Street Type', 'Suffix', 'Offense Count']


In [None]:
# Rename the legacy headers to the names used by the later extracts.
# An explicit old -> new mapping is used instead of assigning a positional list to
# .columns, so a change in column order upstream cannot silently mislabel a column.
# 'Beat', 'Premise', 'Block Range', 'Street Name' and 'Suffix' keep their names.
cleaned_df_jan2010_may2018 = cleaned_df_jan2010_may2018.rename(
    columns={
        'Date': 'Occurrence Date',
        'Hour': 'Occurrence Hour',
        'Offense Type': 'NIBRS Description',
        'Type': 'Street Type',
        'Offenses': 'Offense Count',
    }
)


In [None]:
# Check shape: (rows, columns) remaining after the date filter
cleaned_df_jan2010_may2018.shape

(1050760, 10)

#### Change Column Order
* ['Occurrence Date', 'Occurrence Hour', 'NIBRS Description', 'Beat', 'Premise', 'Block Range', 'Street Name', 'Street Type', 'Suffix', 'Offense Count'] --> ['Occurrence Date', 'Occurrence Hour', 'NIBRS Description', 'Offense Count', 'Beat', 'Premise', 'Block Range', 'Street Name', 'Street Type', 'Suffix']

In [None]:
# Reorder columns so 'Offense Count' sits right after 'NIBRS Description'
cleaned_df_jan2010_may2018 = cleaned_df_jan2010_may2018.loc[
    :,
    ['Occurrence Date', 'Occurrence Hour', 'NIBRS Description', 'Offense Count', 'Beat',
     'Premise', 'Block Range', 'Street Name', 'Street Type', 'Suffix'],
]

In [None]:
# Order rows chronologically by 'Occurrence Date' (original index labels are kept)
cleaned_df_jan2010_may2018 = cleaned_df_jan2010_may2018.sort_values('Occurrence Date')

In [None]:
# Dtypes and non-null counts of the cleaned Jan 2010 - May 2018 frame
cleaned_df_jan2010_may2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1050760 entries, 387594 to 831912
Data columns (total 10 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   Occurrence Date    1050760 non-null  datetime64[ns]
 1   Occurrence Hour    1050760 non-null  object        
 2   NIBRS Description  1050760 non-null  object        
 3   Offense Count      1040450 non-null  float64       
 4   Beat               1050744 non-null  object        
 5   Premise            1017228 non-null  object        
 6   Block Range        1050604 non-null  object        
 7   Street Name        1050759 non-null  object        
 8   Street Type        1050760 non-null  object        
 9   Suffix             1050760 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 88.2+ MB


In [None]:
# Preview the earliest rows after sorting by 'Occurrence Date'
cleaned_df_jan2010_may2018.head()

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix
387594,2010-01-01,8,Theft,1.0,3B10,20R,4900-4999,POINCIANA,DR,-
383540,2010-01-01,18,Theft,1.0,5F20,20D,8700-8799,HAMMERLY,-,-
563390,2010-01-01,0,Burglary,1.0,1A10,05O,400-499,MAIN,ST,-
387303,2010-01-01,0,Theft,1.0,7C10,20R,1900-1999,LOCKWOOD,DR,-
303450,2010-01-01,10,Theft,1.0,18F20,18A,3300-3399,MCCUE,RD,-


#### Check Null

In [None]:
# Count missing values per column
cleaned_df_jan2010_may2018.isnull().sum()

Occurrence Date          0
Occurrence Hour          0
NIBRS Description        0
Offense Count        10310
Beat                    16
Premise              33532
Block Range            156
Street Name              1
Street Type              0
Suffix                   0
dtype: int64

In [None]:
# Check the column header: final names and order that will be written to disk
cleaned_df_jan2010_may2018.columns

Index(['Occurrence Date', 'Occurrence Hour', 'NIBRS Description',
       'Offense Count', 'Beat', 'Premise', 'Block Range', 'Street Name',
       'Street Type', 'Suffix'],
      dtype='object')

#### Save cleaned_df_jan2010_may2018 to 'cleaned_jan2010_may2018.csv'

In [None]:
# Save cleaned_df_jan2010_may2018 to cleaned_jan2010_may2018.csv
# index=False: the row index is not written to the file
cleaned_df_jan2010_may2018.to_csv('data/cleaned_jan2010_may2018.csv', index=False)

## **Clean df_jun2018_dec2018**

In [None]:
# Preview the first rows of the raw Jun - Dec 2018 extract
df_jun2018_dec2018.head()

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix
0,2018-09-01,0,"Credit card, ATM fraud",1,5F30,Auto Dealership New/Used,12800-12899,HEMPSTEAD,RD,
1,2018-09-01,0,Intimidation,1,8C30,"Residence, Home (Includes Apartment)",8100-8199,SNOWDEN,,
2,2018-09-01,0,Weapon law violations,1,14D10,"Parking Lot, Garage",6600-6699,MADRID,ST,
3,2018-09-01,0,Trespass of real property,1,10H40,"Service, Gas Station",2100-2199,FANNIN,ST,
4,2018-09-01,0,Driving under the influence,1,9C20,"Highway, Road, Street, Alley",6100-6199,EAST,FWY,


In [None]:
# Expected column headers for the Jun - Dec 2018 extract (10 names).
# Every name needs its own comma: adjacent string literals are concatenated by Python,
# which would merge two headers into a single bogus entry.
col_names_jun2018_dec2018 = ['Occurrence Date', 'Occurrence Hour', 'NIBRS Description', 'Offense Count', 'Beat', 'Premise',
                 'Block Range', 'Street Name', 'Street Type', 'Suffix']

In [None]:
# Keep the raw column index in a variable and display it
cols_jun2018_dec2018 = df_jun2018_dec2018.columns
cols_jun2018_dec2018

Index(['Occurrence Date', 'Occurrence Hour', 'NIBRS Description',
       'Offense Count', 'Beat', 'Premise', 'Block Range', 'Street Name',
       'Street Type', 'Suffix'],
      dtype='object')

In [None]:
# Check df_jun2018_dec2018: dtypes and non-null counts per column
df_jun2018_dec2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145713 entries, 0 to 145712
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Occurrence Date    145713 non-null  object
 1   Occurrence Hour    145713 non-null  int64 
 2   NIBRS Description  145713 non-null  object
 3   Offense Count      145713 non-null  int64 
 4   Beat               145564 non-null  object
 5   Premise            145713 non-null  object
 6   Block Range        144823 non-null  object
 7   Street Name        145713 non-null  object
 8   Street Type        133976 non-null  object
 9   Suffix             20532 non-null   object
dtypes: int64(2), object(8)
memory usage: 11.1+ MB


#### Remove Duplicates

In [None]:
# Check duplicates: count rows that are identical to another row in every column
print(f"df_jun2018_dec2018 contains {df_jun2018_dec2018.duplicated().sum()} duplicates")
# Check all duplicates: keep=False flags every member of a duplicate group, so the
# repeated rows can be inspected side by side once sorted by 'Occurrence Date'
duplicates = df_jun2018_dec2018.duplicated(keep=False)
df_jun2018_dec2018[duplicates].sort_values('Occurrence Date')

df_jun2018_dec2018 contains 658 duplicates


Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix
81743,2018-06-01,11,All other larceny,1,10H40,School-Elementary/Secondary,1800-1899,STUART,,
81744,2018-06-01,11,All other larceny,1,10H40,School-Elementary/Secondary,1800-1899,STUART,,
82277,2018-06-02,0,Theft from motor vehicle,1,15E40,"Parking Lot, Garage",8800-8899,LAKES AT 610,DR,
82286,2018-06-02,0,Theft from motor vehicle,1,15E40,"Parking Lot, Garage",8800-8899,LAKES AT 610,DR,
82436,2018-06-02,9,"Burglary, Breaking and Entering",1,11H10,"Commercial, Office Building",2100-2199,WAYSIDE,DR,S
...,...,...,...,...,...,...,...,...,...,...
80682,2018-12-30,20,Theft from motor vehicle,1,5F30,Restaurant,12900-12999,NORTHWEST,FWY,
80943,2018-12-31,8,All other offenses,1,14D30,"Government, Public Building",8300-8399,MYKAWA,RD,
80946,2018-12-31,8,All other offenses,1,14D30,"Government, Public Building",8300-8399,MYKAWA,RD,
81154,2018-12-31,15,Theft from motor vehicle,1,13D40,"Parking Lot, Garage",8400-8499,SAM HOUSTON,PKWY,S


In [None]:
# Remove duplicates: rows identical in every column are collapsed to a single row;
# the result is a new frame and df_jun2018_dec2018 itself is left unchanged
cleaned_df_jun2018_dec2018 = df_jun2018_dec2018.drop_duplicates()

In [None]:
# Check the date range
# 'Occurrence Date' has not been converted to datetime in this frame, so min()/max()
# compare the raw values as text.
# NOTE(review): that equals chronological order only if every value is an ISO
# 'YYYY-MM-DD' string — confirm against the raw file.
print(f"Start Date: {cleaned_df_jun2018_dec2018['Occurrence Date'].min()}")
print(f"End Date: {cleaned_df_jun2018_dec2018['Occurrence Date'].max()}")

Start Date: 2018-06-01
End Date: 2018-12-31


In [None]:
# Check shape: (rows, columns) after removing duplicates
cleaned_df_jun2018_dec2018.shape

(145055, 10)

In [None]:
# Dtypes and non-null counts of the de-duplicated Jun - Dec 2018 frame
cleaned_df_jun2018_dec2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145055 entries, 0 to 145712
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Occurrence Date    145055 non-null  object
 1   Occurrence Hour    145055 non-null  int64 
 2   NIBRS Description  145055 non-null  object
 3   Offense Count      145055 non-null  int64 
 4   Beat               144906 non-null  object
 5   Premise            145055 non-null  object
 6   Block Range        144165 non-null  object
 7   Street Name        145055 non-null  object
 8   Street Type        133355 non-null  object
 9   Suffix             20455 non-null   object
dtypes: int64(2), object(8)
memory usage: 12.2+ MB


In [None]:
# Order rows by 'Occurrence Date' (original index labels are kept)
cleaned_df_jun2018_dec2018 = cleaned_df_jun2018_dec2018.sort_values('Occurrence Date')

In [None]:
# Preview the earliest rows after sorting by 'Occurrence Date'
cleaned_df_jun2018_dec2018.head()

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix
82109,2018-06-01,20,Simple assault,1,20G40,"Residence, Home (Includes Apartment)",11700-11799,WESTHEIMER,RD,
81979,2018-06-01,17,All other offenses,1,8C20,"Highway, Road, Street, Alley",8000-8099,DARIEN,ST,
81980,2018-06-01,17,Simple assault,1,20G80,"Residence, Home (Includes Apartment)",15100-15199,RICHMOND,AVE,
81981,2018-06-01,17,"Destruction, damage, vandalism",1,19G40,"Residence, Home (Includes Apartment)",12000-12099,BISSONNET,ST,
81982,2018-06-01,17,Shoplifting,1,18F20,Shopping Mall,5000-5099,WESTHEIMER,RD,


#### Check Null

In [None]:
# Count missing values per column
cleaned_df_jun2018_dec2018.isnull().sum()

Occurrence Date           0
Occurrence Hour           0
NIBRS Description         0
Offense Count             0
Beat                    149
Premise                   0
Block Range             890
Street Name               0
Street Type           11700
Suffix               124600
dtype: int64

In [None]:
# Final column names and order that will be written to disk
cleaned_df_jun2018_dec2018.columns

Index(['Occurrence Date', 'Occurrence Hour', 'NIBRS Description',
       'Offense Count', 'Beat', 'Premise', 'Block Range', 'Street Name',
       'Street Type', 'Suffix'],
      dtype='object')

#### Save cleaned_df_jun2018_dec2018 to 'cleaned_jun2018_dec2018.csv'

In [None]:
# Save cleaned_df_jun2018_dec2018 to cleaned_jun2018_dec2018.csv
# index=False: the row index is not written to the file
cleaned_df_jun2018_dec2018.to_csv('data/cleaned_jun2018_dec2018.csv', index=False)

## **Clean df_jan2019_jul2023**

In [None]:
# Preview the first rows of the raw Jan 2019 - Jul 2023 extract
df_jan2019_jul2023.head()

Unnamed: 0,Incident,Occurrence\nDate,Occurrence\nHour,NIBRS\nClass,NIBRSDescription,Offense\nCount,Beat,Premise,Block Range,StreetName,...,ZIP Code,RMSOccurrenceDate,RMSOccurrenceHour,NIBRSClass,OffenseCount,StreetNo,StreetType,ZIPCode,MapLongitude,MapLatitude
0,5619,2019-01-01,0.0,290,"Destruction, damage, vandalism",1.0,9C30,"Residence, Home (Includes Apartment)",9622.0,SAN CARLOS,...,77013,,,,,,,,,
1,17319,2019-01-01,0.0,35A,"Drug, narcotic violations",1.0,7C10,"Highway, Road, Street, Alley",,EAST,...,77020,,,,,,,,,
2,18119,2019-01-01,0.0,290,"Destruction, damage, vandalism",1.0,16E40,"Residence, Home (Includes Apartment)",16718.0,LONE QUAIL,...,77489,,,,,,,,,
3,19019,2019-01-01,0.0,520,Weapon law violations,1.0,,"Residence, Home (Includes Apartment)",1909.0,MELBOURNE,...,77026-0000,,,,,,,,,
4,20519,2019-01-01,0.0,13A,Aggravated Assault,1.0,15E30,"Residence, Home (Includes Apartment)",4034.0,OSBY,...,77025,,,,,,,,,


In [None]:
# Keep the raw column index (captured before any renaming) in a variable and display it
cols_jan2019_jul2023 = df_jan2019_jul2023.columns
cols_jan2019_jul2023

Index(['Incident', 'Occurrence\nDate', 'Occurrence\nHour', 'NIBRS\nClass',
       'NIBRSDescription', 'Offense\nCount', 'Beat', 'Premise', 'Block Range',
       'StreetName', 'Street\nType', 'Suffix', 'City', 'ZIP Code',
       'RMSOccurrenceDate', 'RMSOccurrenceHour', 'NIBRSClass', 'OffenseCount',
       'StreetNo', 'StreetType', 'ZIPCode', 'MapLongitude', 'MapLatitude'],
      dtype='object')

#### Restore Point

In [None]:
# Keep an independent copy of the raw frame so it can be restored without re-reading the CSV
temp = df_jan2019_jul2023.copy()

#### Rename Columns
    * 'Occurrence\nDate': 'Occurrence Date', 'Occurrence\nHour': 'Occurrence Hour','NIBRS\nClass': 'NIBRS Class', 'NIBRSDescription': 'NIBRS Description', 'Offense\nCount': 'Offense Count', 'StreetName': 'Street Name', 'StreetNo': 'Street No', 'Street\nType': 'Street Type'

In [None]:
# Normalise headers: replace embedded newlines with spaces and add spaces to the
# run-together names, so the spaced names become the canonical ones
cols_to_rename = {
    'Occurrence\nDate': 'Occurrence Date',
    'Occurrence\nHour': 'Occurrence Hour',
    'NIBRS\nClass': 'NIBRS Class',
    'NIBRSDescription': 'NIBRS Description',
    'Offense\nCount': 'Offense Count',
    'StreetName': 'Street Name',
    'StreetNo': 'Street No',
    'Street\nType': 'Street Type',
}
df_jan2019_jul2023 = df_jan2019_jul2023.rename(columns=cols_to_rename)

In [None]:
# Preview the frame with the renamed headers
df_jan2019_jul2023.head()

Unnamed: 0,Incident,Occurrence Date,Occurrence Hour,NIBRS Class,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,...,ZIP Code,RMSOccurrenceDate,RMSOccurrenceHour,NIBRSClass,OffenseCount,Street No,StreetType,ZIPCode,MapLongitude,MapLatitude
0,5619,2019-01-01,0.0,290,"Destruction, damage, vandalism",1.0,9C30,"Residence, Home (Includes Apartment)",9622.0,SAN CARLOS,...,77013,,,,,,,,,
1,17319,2019-01-01,0.0,35A,"Drug, narcotic violations",1.0,7C10,"Highway, Road, Street, Alley",,EAST,...,77020,,,,,,,,,
2,18119,2019-01-01,0.0,290,"Destruction, damage, vandalism",1.0,16E40,"Residence, Home (Includes Apartment)",16718.0,LONE QUAIL,...,77489,,,,,,,,,
3,19019,2019-01-01,0.0,520,Weapon law violations,1.0,,"Residence, Home (Includes Apartment)",1909.0,MELBOURNE,...,77026-0000,,,,,,,,,
4,20519,2019-01-01,0.0,13A,Aggravated Assault,1.0,15E30,"Residence, Home (Includes Apartment)",4034.0,OSBY,...,77025,,,,,,,,,


#### Copy Values from One Column to Another Column                  
    * 'RMSOccurrenceDate' --> 'Occurrence Date'
    * 'RMSOccurrenceHour' --> 'Occurrence Hour'
    * 'NIBRSClass' --> 'NIBRS Class'
    * 'OffenseCount' --> 'Offense Count'
    * 'ZIPCode' --> 'ZIP Code'
    * 'StreetType' --> 'Street Type'

In [None]:
# Copy values in 'RMSOccurrenceDate' to 'Occurrence Date'
# Only rows where 'Occurrence Date' is missing are filled; rows where both columns are
# missing stay NaN. fillna aligns on the index, so this replaces the per-row Python
# loop (one label lookup per row) with a single vectorized operation.
df_jan2019_jul2023['Occurrence Date'] = (
    df_jan2019_jul2023['Occurrence Date'].fillna(df_jan2019_jul2023['RMSOccurrenceDate'])
)

In [None]:
# Copy values in 'RMSOccurrenceHour' to 'Occurrence Hour'
# Only rows where 'Occurrence Hour' is missing are filled; rows where both columns are
# missing stay NaN. The vectorized fillna replaces the per-row Python loop.
df_jan2019_jul2023['Occurrence Hour'] = (
    df_jan2019_jul2023['Occurrence Hour'].fillna(df_jan2019_jul2023['RMSOccurrenceHour'])
)

In [None]:
# Copy values in 'NIBRSClass' to 'NIBRS Class'
# Only rows where 'NIBRS Class' is missing are filled; rows where both columns are
# missing stay NaN. The vectorized fillna replaces the per-row Python loop.
df_jan2019_jul2023['NIBRS Class'] = (
    df_jan2019_jul2023['NIBRS Class'].fillna(df_jan2019_jul2023['NIBRSClass'])
)

In [None]:
# Copy values in 'OffenseCount' to 'Offense Count'
# Only rows where 'Offense Count' is missing are filled; rows where both columns are
# missing stay NaN. The vectorized fillna replaces the per-row Python loop.
df_jan2019_jul2023['Offense Count'] = (
    df_jan2019_jul2023['Offense Count'].fillna(df_jan2019_jul2023['OffenseCount'])
)

In [None]:
# Copy values in 'ZIPCode' to 'ZIP Code'
# Only rows where 'ZIP Code' is missing are filled; rows where both columns are
# missing stay NaN. The vectorized fillna replaces the per-row Python loop.
df_jan2019_jul2023['ZIP Code'] = (
    df_jan2019_jul2023['ZIP Code'].fillna(df_jan2019_jul2023['ZIPCode'])
)

In [None]:
# Copy values in 'StreetType' to 'Street Type'
# Only rows where 'Street Type' is missing are filled; rows where both columns are
# missing stay NaN. The vectorized fillna replaces the per-row Python loop.
df_jan2019_jul2023['Street Type'] = (
    df_jan2019_jul2023['Street Type'].fillna(df_jan2019_jul2023['StreetType'])
)

In [None]:
# Check df: preview the frame after the copy steps
df_jan2019_jul2023.head()

Unnamed: 0,Incident,Occurrence Date,Occurrence Hour,NIBRS Class,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,...,ZIP Code,RMSOccurrenceDate,RMSOccurrenceHour,NIBRSClass,OffenseCount,Street No,StreetType,ZIPCode,MapLongitude,MapLatitude
0,5619,2019-01-01,0.0,290,"Destruction, damage, vandalism",1.0,9C30,"Residence, Home (Includes Apartment)",9622.0,SAN CARLOS,...,77013,,,,,,,,,
1,17319,2019-01-01,0.0,35A,"Drug, narcotic violations",1.0,7C10,"Highway, Road, Street, Alley",,EAST,...,77020,,,,,,,,,
2,18119,2019-01-01,0.0,290,"Destruction, damage, vandalism",1.0,16E40,"Residence, Home (Includes Apartment)",16718.0,LONE QUAIL,...,77489,,,,,,,,,
3,19019,2019-01-01,0.0,520,Weapon law violations,1.0,,"Residence, Home (Includes Apartment)",1909.0,MELBOURNE,...,77026-0000,,,,,,,,,
4,20519,2019-01-01,0.0,13A,Aggravated Assault,1.0,15E30,"Residence, Home (Includes Apartment)",4034.0,OSBY,...,77025,,,,,,,,,


In [None]:
# Check df_jan2019_jul2023: dtypes and non-null counts per column after the copy steps
df_jan2019_jul2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111335 entries, 0 to 1111334
Data columns (total 23 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Incident           1111335 non-null  int64  
 1   Occurrence Date    1111335 non-null  object 
 2   Occurrence Hour    1111335 non-null  float64
 3   NIBRS Class        1111335 non-null  object 
 4   NIBRS Description  1111335 non-null  object 
 5   Offense Count      1111335 non-null  float64
 6   Beat               1110550 non-null  object 
 7   Premise            1111334 non-null  object 
 8   Block Range        465475 non-null   object 
 9   Street Name        1111335 non-null  object 
 10  Street Type        1025784 non-null  object 
 11  Suffix             162878 non-null   object 
 12  City               1111335 non-null  object 
 13  ZIP Code           1097813 non-null  object 
 14  RMSOccurrenceDate  644390 non-null   object 
 15  RMSOccurrenceHour  644390 non-nu

#### Remove Duplicates and Drop Unnecessary Columns
    * 'RMSOccurrenceDate', 'RMSOccurrenceHour', 'NIBRSClass', 'OffenseCount', 'StreetType', 'ZIPCode'

In [None]:
# Check duplicates: count rows that are identical to another row in every column
print(f"df_jan2019_jul2023 contains {df_jan2019_jul2023.duplicated().sum()} duplicates")
# Check all duplicates: keep=False flags every member of a duplicate group, so the
# repeated rows can be inspected side by side once sorted by 'Occurrence Date'
duplicates = df_jan2019_jul2023.duplicated(keep=False)
df_jan2019_jul2023[duplicates].sort_values('Occurrence Date')

df_jan2019_jul2023 contains 0 duplicates


Unnamed: 0,Incident,Occurrence Date,Occurrence Hour,NIBRS Class,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,...,ZIP Code,RMSOccurrenceDate,RMSOccurrenceHour,NIBRSClass,OffenseCount,Street No,StreetType,ZIPCode,MapLongitude,MapLatitude


In [None]:
# List the current columns before dropping the merged source columns
df_jan2019_jul2023.columns

Index(['Incident', 'Occurrence Date', 'Occurrence Hour', 'NIBRS Class',
       'NIBRS Description', 'Offense Count', 'Beat', 'Premise', 'Block Range',
       'Street Name', 'Street Type', 'Suffix', 'City', 'ZIP Code',
       'RMSOccurrenceDate', 'RMSOccurrenceHour', 'NIBRSClass', 'OffenseCount',
       'Street No', 'StreetType', 'ZIPCode', 'MapLongitude', 'MapLatitude'],
      dtype='object')

In [None]:
# Drop the source columns whose values were copied into their spaced counterparts:
# 'RMSOccurrenceDate', 'RMSOccurrenceHour', 'NIBRSClass', 'OffenseCount', 'StreetType', 'ZIPCode'
drop_cols = [
    'RMSOccurrenceDate',
    'RMSOccurrenceHour',
    'NIBRSClass',
    'OffenseCount',
    'StreetType',
    'ZIPCode',
]
cleaned_df_jan2019_jul2023 = df_jan2019_jul2023.drop(columns=drop_cols)

In [None]:
# # Filter Houston Datapoints
# cleaned_df_jan2019_jul2023 = cleaned_df_jan2019_jul2023[cleaned_df_jan2019_jul2023['City'] == 'HOUSTON']

In [None]:
# # Drop 'City'
# cleaned_df_jan2019_jul2023.drop('City', axis=1, inplace=True)

In [None]:
# Check the date range
# 'Occurrence Date' has not been converted to datetime in this frame, so min()/max()
# compare the raw values as text.
# NOTE(review): that equals chronological order only if every value is an ISO
# 'YYYY-MM-DD' string — confirm against the raw file.
print(f"Start Date: {cleaned_df_jan2019_jul2023['Occurrence Date'].min()}")
print(f"End Date: {cleaned_df_jan2019_jul2023['Occurrence Date'].max()}")

Start Date: 2019-01-01
End Date: 2023-07-31


In [None]:
# Check shape: (rows, columns) after dropping the merged source columns
cleaned_df_jan2019_jul2023.shape

(1111335, 17)

In [None]:
# Dtypes and non-null counts of the cleaned Jan 2019 - Jul 2023 frame
cleaned_df_jan2019_jul2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111335 entries, 0 to 1111334
Data columns (total 17 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Incident           1111335 non-null  int64  
 1   Occurrence Date    1111335 non-null  object 
 2   Occurrence Hour    1111335 non-null  float64
 3   NIBRS Class        1111335 non-null  object 
 4   NIBRS Description  1111335 non-null  object 
 5   Offense Count      1111335 non-null  float64
 6   Beat               1110550 non-null  object 
 7   Premise            1111334 non-null  object 
 8   Block Range        465475 non-null   object 
 9   Street Name        1111335 non-null  object 
 10  Street Type        1025784 non-null  object 
 11  Suffix             162878 non-null   object 
 12  City               1111335 non-null  object 
 13  ZIP Code           1097813 non-null  object 
 14  Street No          642415 non-null   object 
 15  MapLongitude       392147 non-nu

In [None]:
# Preview the first rows of the cleaned Jan 2019 - Jul 2023 frame
cleaned_df_jan2019_jul2023.head()

Unnamed: 0,Incident,Occurrence Date,Occurrence Hour,NIBRS Class,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix,City,ZIP Code,Street No,MapLongitude,MapLatitude
0,5619,2019-01-01,0.0,290,"Destruction, damage, vandalism",1.0,9C30,"Residence, Home (Includes Apartment)",9622.0,SAN CARLOS,,,HOUSTON,77013,,,
1,17319,2019-01-01,0.0,35A,"Drug, narcotic violations",1.0,7C10,"Highway, Road, Street, Alley",,EAST,FWY,,HOUSTON,77020,,,
2,18119,2019-01-01,0.0,290,"Destruction, damage, vandalism",1.0,16E40,"Residence, Home (Includes Apartment)",16718.0,LONE QUAIL,CT,,HOUSTON,77489,,,
3,19019,2019-01-01,0.0,520,Weapon law violations,1.0,,"Residence, Home (Includes Apartment)",1909.0,MELBOURNE,,,HOUSTON,77026-0000,,,
4,20519,2019-01-01,0.0,13A,Aggravated Assault,1.0,15E30,"Residence, Home (Includes Apartment)",4034.0,OSBY,DR,,HOUSTON,77025,,,


#### Check Null

In [None]:
# Count missing values per column
cleaned_df_jan2019_jul2023.isnull().sum()

Incident                  0
Occurrence Date           0
Occurrence Hour           0
NIBRS Class               0
NIBRS Description         0
Offense Count             0
Beat                    785
Premise                   1
Block Range          645860
Street Name               0
Street Type           85551
Suffix               948457
City                      0
ZIP Code              13522
Street No            468920
MapLongitude         719188
MapLatitude          719188
dtype: int64

In [None]:
# Final column names and order that will be written to disk
cleaned_df_jan2019_jul2023.columns

Index(['Incident', 'Occurrence Date', 'Occurrence Hour', 'NIBRS Class',
       'NIBRS Description', 'Offense Count', 'Beat', 'Premise', 'Block Range',
       'Street Name', 'Street Type', 'Suffix', 'City', 'ZIP Code', 'Street No',
       'MapLongitude', 'MapLatitude'],
      dtype='object')

#### Save cleaned_df_jan2019_jul2023 to cleaned_jan2019_jul2023.csv

In [None]:
# Save cleaned_df_jan2019_jul2023 to cleaned_jan2019_jul2023.csv
# index=False: the row index is not written to the file
cleaned_df_jan2019_jul2023.to_csv('data/cleaned_jan2019_jul2023.csv', index=False)

## **Verify Dataframes**

In [None]:
# Re-read cleaned_jan2010_may2018.csv, cleaned_jun2018_dec2018.csv and
# cleaned_jan2019_jul2023.csv to verify the files just written.
# 'Occurrence Date' is parsed as datetime, 'Beat' is kept as str and 'Offense Count'
# is read as float in all three; 'ZIP Code' is additionally kept as str in the third.
df1 = pd.read_csv(
    'data/cleaned_jan2010_may2018.csv',
    parse_dates=['Occurrence Date'],
    dtype={'Beat': str, 'Offense Count': float},
)
df2 = pd.read_csv(
    'data/cleaned_jun2018_dec2018.csv',
    parse_dates=['Occurrence Date'],
    dtype={'Beat': str, 'Offense Count': float},
)
df3 = pd.read_csv(
    'data/cleaned_jan2019_jul2023.csv',
    parse_dates=['Occurrence Date'],
    dtype={'Beat': str, 'ZIP Code': str, 'Offense Count': float},
)

In [None]:
# Spot-check df1 after the CSV round trip.
# NOTE(review): only the last expression of a cell is displayed, so the sorted head() on
# the first line is computed and then discarded; the visible output is the 1966-01-01
# lookup, which returns no rows.
df1.sort_values('Occurrence Date').head()
df1[df1['Occurrence Date'] == '1966-01-01']

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix


In [None]:
# Row counts of the three re-loaded frames, followed by their combined total
numdf1, numdf2, numdf3 = (len(frame) for frame in (df1, df2, df3))

print(numdf1, numdf2, numdf3, sum((numdf1, numdf2, numdf3)))

1050760 145055 1111335 2307150


## **Save Dataframe to Pickle**

In [None]:
# Save dataframes in pickle format
# These are the in-memory cleaned frames, not the df1/df2/df3 copies re-read from CSV.
cleaned_df_jan2010_may2018.to_pickle('data/cleaned_jan2010_may2018.pkl')
cleaned_df_jun2018_dec2018.to_pickle('data/cleaned_jun2018_dec2018.pkl')
cleaned_df_jan2019_jul2023.to_pickle('data/cleaned_jan2019_jul2023.pkl')

## **Read Pickle Files**

In [None]:
# Optional restore step, left disabled: uncomment to load the three cleaned frames
# back from the pickle files written in the previous section.
# NOTE(review): pickle.load can execute arbitrary code while loading, so only use it
# on pickle files you produced yourself.

# with open('data/cleaned_jan2010_may2018.pkl', 'rb') as f:
#   cleaned_df_jan2010_may2018 = pickle.load(f)

# with open('data/cleaned_jun2018_dec2018.pkl', 'rb') as f:
#   cleaned_df_jun2018_dec2018 = pickle.load(f)

# with open('data/cleaned_jan2019_jul2023.pkl', 'rb') as f:
#   cleaned_df_jan2019_jul2023 = pickle.load(f)

In [None]:
# (rows, columns) of the cleaned Jan 2010 - May 2018 frame held in memory
cleaned_df_jan2010_may2018.shape

(1050760, 10)

In [None]:
# Preview the first rows of the cleaned Jan 2010 - May 2018 frame
cleaned_df_jan2010_may2018.head()

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix
387594,2010-01-01,8,Theft,1.0,3B10,20R,4900-4999,POINCIANA,DR,-
383540,2010-01-01,18,Theft,1.0,5F20,20D,8700-8799,HAMMERLY,-,-
563390,2010-01-01,0,Burglary,1.0,1A10,05O,400-499,MAIN,ST,-
387303,2010-01-01,0,Theft,1.0,7C10,20R,1900-1999,LOCKWOOD,DR,-
303450,2010-01-01,10,Theft,1.0,18F20,18A,3300-3399,MCCUE,RD,-


In [None]:
# Column dtypes and non-null counts; compare these with the other two frames before merging
cleaned_df_jan2010_may2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1050760 entries, 387594 to 831912
Data columns (total 10 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   Occurrence Date    1050760 non-null  datetime64[ns]
 1   Occurrence Hour    1050760 non-null  object        
 2   NIBRS Description  1050760 non-null  object        
 3   Offense Count      1040450 non-null  float64       
 4   Beat               1050744 non-null  object        
 5   Premise            1017228 non-null  object        
 6   Block Range        1050604 non-null  object        
 7   Street Name        1050759 non-null  object        
 8   Street Type        1050760 non-null  object        
 9   Suffix             1050760 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 88.2+ MB


In [None]:
# (rows, columns) of the cleaned Jun 2018 - Dec 2018 frame held in memory
cleaned_df_jun2018_dec2018.shape

(145055, 10)

In [None]:
# Preview the first rows of the cleaned Jun 2018 - Dec 2018 frame
cleaned_df_jun2018_dec2018.head()

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix
82109,2018-06-01,20,Simple assault,1,20G40,"Residence, Home (Includes Apartment)",11700-11799,WESTHEIMER,RD,
81979,2018-06-01,17,All other offenses,1,8C20,"Highway, Road, Street, Alley",8000-8099,DARIEN,ST,
81980,2018-06-01,17,Simple assault,1,20G80,"Residence, Home (Includes Apartment)",15100-15199,RICHMOND,AVE,
81981,2018-06-01,17,"Destruction, damage, vandalism",1,19G40,"Residence, Home (Includes Apartment)",12000-12099,BISSONNET,ST,
81982,2018-06-01,17,Shoplifting,1,18F20,Shopping Mall,5000-5099,WESTHEIMER,RD,


In [None]:
# Column dtypes and non-null counts; compare these with the other two frames before merging
cleaned_df_jun2018_dec2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145055 entries, 82109 to 80977
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Occurrence Date    145055 non-null  object
 1   Occurrence Hour    145055 non-null  int64 
 2   NIBRS Description  145055 non-null  object
 3   Offense Count      145055 non-null  int64 
 4   Beat               144906 non-null  object
 5   Premise            145055 non-null  object
 6   Block Range        144165 non-null  object
 7   Street Name        145055 non-null  object
 8   Street Type        133355 non-null  object
 9   Suffix             20455 non-null   object
dtypes: int64(2), object(8)
memory usage: 12.2+ MB


In [None]:
# (rows, columns) of the cleaned Jan 2019 - Jul 2023 frame held in memory
cleaned_df_jan2019_jul2023.shape

(1111335, 17)

In [None]:
# Preview the first rows of the cleaned Jan 2019 - Jul 2023 frame
cleaned_df_jan2019_jul2023.head()

Unnamed: 0,Incident,Occurrence Date,Occurrence Hour,NIBRS Class,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix,City,ZIP Code,Street No,MapLongitude,MapLatitude
0,5619,2019-01-01,0.0,290,"Destruction, damage, vandalism",1.0,9C30,"Residence, Home (Includes Apartment)",9622.0,SAN CARLOS,,,HOUSTON,77013,,,
1,17319,2019-01-01,0.0,35A,"Drug, narcotic violations",1.0,7C10,"Highway, Road, Street, Alley",,EAST,FWY,,HOUSTON,77020,,,
2,18119,2019-01-01,0.0,290,"Destruction, damage, vandalism",1.0,16E40,"Residence, Home (Includes Apartment)",16718.0,LONE QUAIL,CT,,HOUSTON,77489,,,
3,19019,2019-01-01,0.0,520,Weapon law violations,1.0,,"Residence, Home (Includes Apartment)",1909.0,MELBOURNE,,,HOUSTON,77026-0000,,,
4,20519,2019-01-01,0.0,13A,Aggravated Assault,1.0,15E30,"Residence, Home (Includes Apartment)",4034.0,OSBY,DR,,HOUSTON,77025,,,


In [None]:
# Column dtypes and non-null counts; compare these with the other two frames before merging
cleaned_df_jan2019_jul2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111335 entries, 0 to 1111334
Data columns (total 17 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Incident           1111335 non-null  int64  
 1   Occurrence Date    1111335 non-null  object 
 2   Occurrence Hour    1111335 non-null  float64
 3   NIBRS Class        1111335 non-null  object 
 4   NIBRS Description  1111335 non-null  object 
 5   Offense Count      1111335 non-null  float64
 6   Beat               1110550 non-null  object 
 7   Premise            1111334 non-null  object 
 8   Block Range        465475 non-null   object 
 9   Street Name        1111335 non-null  object 
 10  Street Type        1025784 non-null  object 
 11  Suffix             162878 non-null   object 
 12  City               1111335 non-null  object 
 13  ZIP Code           1097813 non-null  object 
 14  Street No          642415 non-null   object 
 15  MapLongitude       392147 non-nu

## **Merge cleaned_df_jan2010_may2018, cleaned_df_jun2018_dec2018, and cleaned_df_jan2019_jul2023**

In [None]:
# Stack the three cleaned in-memory frames row-wise into a single dataframe.
# - 'Occurrence Date' is converted to datetime in each frame first, so the combined
#   column has one consistent dtype instead of a mix of Timestamps and other objects.
# - ignore_index=True gives the result a fresh 0..n-1 index rather than carrying over
#   the row labels of the inputs, which would otherwise repeat in the merged frame.
# Columns present in only one input are filled with NaN for rows from the other inputs.
merged_jan2010_Jul2023 = pd.concat(
    [
        frame.assign(**{'Occurrence Date': pd.to_datetime(frame['Occurrence Date'])})
        for frame in (
            cleaned_df_jan2010_may2018,
            cleaned_df_jun2018_dec2018,
            cleaned_df_jan2019_jul2023,
        )
    ],
    ignore_index=True,
)


In [None]:
# Preview the first rows of the merged dataframe (these come from the first frame passed to concat)
merged_jan2010_Jul2023.head()

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix,Incident,NIBRS Class,City,ZIP Code,Street No,MapLongitude,MapLatitude
387594,2010-01-01 00:00:00,8,Theft,1.0,3B10,20R,4900-4999,POINCIANA,DR,-,,,,,,,
383540,2010-01-01 00:00:00,18,Theft,1.0,5F20,20D,8700-8799,HAMMERLY,-,-,,,,,,,
563390,2010-01-01 00:00:00,0,Burglary,1.0,1A10,05O,400-499,MAIN,ST,-,,,,,,,
387303,2010-01-01 00:00:00,0,Theft,1.0,7C10,20R,1900-1999,LOCKWOOD,DR,-,,,,,,,
303450,2010-01-01 00:00:00,10,Theft,1.0,18F20,18A,3300-3399,MCCUE,RD,-,,,,,,,


In [None]:
# Preview the last rows of the merged dataframe (these come from the last frame passed to concat)
merged_jan2010_Jul2023.tail()

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix,Incident,NIBRS Class,City,ZIP Code,Street No,MapLongitude,MapLatitude
1111330,2023-07-15,12.0,Theft from motor vehicle,1.0,,"Highway, Road, Street, Alley",,VICTOR,ST,,100500323.0,23F,HOUSTON,,700,,
1111331,2023-07-15,12.0,"Destruction, damage, vandalism",1.0,,"Highway, Road, Street, Alley",,VICTOR,ST,,100500323.0,290,HOUSTON,,700,,
1111332,2023-07-16,16.0,Trespass of real property,1.0,,"Residence, Home (Includes Apartment)",,CULLEN,BLVD,,101013423.0,90J,CU,,7300,-95.365814,29.683107
1111333,2023-07-28,7.0,Drug equipment violations,1.0,,"Highway, Road, Street, Alley",,TIDWELL,RD,,107036723.0,35B,HOUSTON,,4840,,
1111334,2023-07-29,3.0,Aggravated Assault,1.0,,"Drug Store, Doctors Office, Hospital",,SAINT JOSEPHS,PKWY,,107531223.0,13A,HOUSTON,77002.0,1401,,


In [None]:
# (rows, columns) of the merged dataframe; the row count should equal the sum of the three inputs
merged_jan2010_Jul2023.shape

(2307150, 17)

In [None]:
# Column dtypes of the merged dataframe
merged_jan2010_Jul2023.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2307150 entries, 387594 to 1111334
Data columns (total 17 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Occurrence Date    object 
 1   Occurrence Hour    object 
 2   NIBRS Description  object 
 3   Offense Count      float64
 4   Beat               object 
 5   Premise            object 
 6   Block Range        object 
 7   Street Name        object 
 8   Street Type        object 
 9   Suffix             object 
 10  Incident           float64
 11  NIBRS Class        object 
 12  City               object 
 13  ZIP Code           object 
 14  Street No          object 
 15  MapLongitude       float64
 16  MapLatitude        float64
dtypes: float64(4), object(13)
memory usage: 316.8+ MB


In [None]:
# Number of missing values per column in the merged dataframe
merged_jan2010_Jul2023.isna().sum()

Occurrence Date            0
Occurrence Hour            0
NIBRS Description          0
Offense Count          10310
Beat                     950
Premise                33533
Block Range           646906
Street Name                1
Street Type            97251
Suffix               1073057
Incident             1195815
NIBRS Class          1195815
City                 1195815
ZIP Code             1209337
Street No            1664735
MapLongitude         1915003
MapLatitude          1915003
dtype: int64

In [None]:
# Summary statistics for the numeric columns of the merged dataframe
merged_jan2010_Jul2023.describe()

Unnamed: 0,Offense Count,Incident,MapLongitude,MapLatitude
count,2296840.0,1111335.0,392147.0,392147.0
mean,1.026311,83185210.0,-95.416751,29.755169
std,0.2115527,50026110.0,0.111731,0.093016
min,0.0,520.0,-95.997926,29.3858
25%,1.0,40859220.0,-95.504777,29.688916
50%,1.0,81176520.0,-95.411035,29.740215
75%,1.0,124180300.0,-95.342364,29.812019
max,65.0,996249100.0,-94.870391,30.219917


In [None]:
# Summary (count / unique / top / freq) restricted to the object-dtype columns
merged_jan2010_Jul2023.describe(include='object')

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Beat,Premise,Block Range,Street Name,Street Type,Suffix,NIBRS Class,City,ZIP Code,Street No
count,2307150,2307150,2307150,2306200,2273617,1660244,2307149,2209899,1234093,1111335,1111335,1097813,642415
unique,4960,83,77,368,428,14578,35934,56,5,62,308,479,15154
top,2022-04-01,12,Theft,12D10,"Residence, Home (Includes Apartment)",900-999,WESTHEIMER,ST,-,23F,HOUSTON,77036,2800
freq,900,133998,573377,47541,460679,16556,63948,509835,908229,127621,1106033,37286,2273


## **Save Final Merged Dataframe to pkl and CSV**

In [None]:
# Write the merged dataframe to disk as a pickle file
pd.to_pickle(merged_jan2010_Jul2023, 'data/df_merged_jan2010_Jul2023.pkl')

In [None]:
# Export the merged dataframe as CSV; index=False keeps the row index out of the file
merged_jan2010_Jul2023.to_csv(
    path_or_buf='data/merged_jan2010_Jul2023.csv',
    index=False,
)

## **Check merged_jan2010_Jul2023.csv**

In [None]:
# Reload the merged CSV written above to check that it round-trips.
# The dtype map lists only columns that are part of the merged frame; the former
# 'numeric': int entry named a column that does not exist and has been removed.
# NOTE(review): 'Occurrence Hour' has no explicit dtype here and comes back as object;
# consider cleaning it upstream or passing an explicit dtype / low_memory=False.
final_df = pd.read_csv(
    'data/merged_jan2010_Jul2023.csv',
    parse_dates=['Occurrence Date'],
    dtype={'Beat': str, 'ZIP Code': str, 'Offense Count': float},
)

In [None]:
# Preview the first rows of the frame reloaded from the merged CSV
final_df.head()

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix,Incident,NIBRS Class,City,ZIP Code,Street No,MapLongitude,MapLatitude
0,2010-01-01,8.0,Theft,1.0,3B10,20R,4900-4999,POINCIANA,DR,-,,,,,,,
1,2010-01-01,18.0,Theft,1.0,5F20,20D,8700-8799,HAMMERLY,-,-,,,,,,,
2,2010-01-01,0.0,Burglary,1.0,1A10,05O,400-499,MAIN,ST,-,,,,,,,
3,2010-01-01,0.0,Theft,1.0,7C10,20R,1900-1999,LOCKWOOD,DR,-,,,,,,,
4,2010-01-01,10.0,Theft,1.0,18F20,18A,3300-3399,MCCUE,RD,-,,,,,,,


In [None]:
# Preview the last rows of the frame reloaded from the merged CSV
final_df.tail()

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix,Incident,NIBRS Class,City,ZIP Code,Street No,MapLongitude,MapLatitude
2307145,2023-07-15,12.0,Theft from motor vehicle,1.0,,"Highway, Road, Street, Alley",,VICTOR,ST,,100500323.0,23F,HOUSTON,,700,,
2307146,2023-07-15,12.0,"Destruction, damage, vandalism",1.0,,"Highway, Road, Street, Alley",,VICTOR,ST,,100500323.0,290,HOUSTON,,700,,
2307147,2023-07-16,16.0,Trespass of real property,1.0,,"Residence, Home (Includes Apartment)",,CULLEN,BLVD,,101013423.0,90J,CU,,7300,-95.365814,29.683107
2307148,2023-07-28,7.0,Drug equipment violations,1.0,,"Highway, Road, Street, Alley",,TIDWELL,RD,,107036723.0,35B,HOUSTON,,4840,,
2307149,2023-07-29,3.0,Aggravated Assault,1.0,,"Drug Store, Doctors Office, Hospital",,SAINT JOSEPHS,PKWY,,107531223.0,13A,HOUSTON,77002.0,1401,,


In [None]:
# Column dtypes after the CSV round trip
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2307150 entries, 0 to 2307149
Data columns (total 17 columns):
 #   Column             Dtype         
---  ------             -----         
 0   Occurrence Date    datetime64[ns]
 1   Occurrence Hour    object        
 2   NIBRS Description  object        
 3   Offense Count      float64       
 4   Beat               object        
 5   Premise            object        
 6   Block Range        object        
 7   Street Name        object        
 8   Street Type        object        
 9   Suffix             object        
 10  Incident           float64       
 11  NIBRS Class        object        
 12  City               object        
 13  ZIP Code           object        
 14  Street No          object        
 15  MapLongitude       float64       
 16  MapLatitude        float64       
dtypes: datetime64[ns](1), float64(4), object(12)
memory usage: 299.2+ MB


In [None]:
# Summary statistics for the numeric columns of the reloaded frame
final_df.describe()

Unnamed: 0,Offense Count,Incident,MapLongitude,MapLatitude
count,2296840.0,1111335.0,392147.0,392147.0
mean,1.026311,83185210.0,-95.416751,29.755169
std,0.2115527,50026110.0,0.111731,0.093016
min,0.0,520.0,-95.997926,29.3858
25%,1.0,40859220.0,-95.504777,29.688916
50%,1.0,81176520.0,-95.411035,29.740215
75%,1.0,124180300.0,-95.342364,29.812019
max,65.0,996249100.0,-94.870391,30.219917


In [None]:
# Summary across every column regardless of dtype (include='all')
final_df.describe(include='all')

Unnamed: 0,Occurrence Date,Occurrence Hour,NIBRS Description,Offense Count,Beat,Premise,Block Range,Street Name,Street Type,Suffix,Incident,NIBRS Class,City,ZIP Code,Street No,MapLongitude,MapLatitude
count,2307150,2307150.0,2307150,2296840.0,2306200,2273617,1660244,2307149,2209899,1234093,1111335.0,1111335,1111335,1097813.0,642415.0,392147.0,392147.0
unique,4960,100.0,77,,368,428,14578,35934,56,5,,62,308,479.0,15154.0,,
top,2022-04-01 00:00:00,12.0,Theft,,12D10,"Residence, Home (Includes Apartment)",900-999,WESTHEIMER,ST,-,,23F,HOUSTON,77036.0,2800.0,,
freq,900,119378.0,573377,,47541,460679,16556,63948,509835,908229,,127621,1106033,37286.0,2273.0,,
first,2010-01-01 00:00:00,,,,,,,,,,,,,,,,
last,2023-07-31 00:00:00,,,,,,,,,,,,,,,,
mean,,,,1.026311,,,,,,,83185210.0,,,,,-95.416751,29.755169
std,,,,0.2115527,,,,,,,50026110.0,,,,,0.111731,0.093016
min,,,,0.0,,,,,,,520.0,,,,,-95.997926,29.3858
25%,,,,1.0,,,,,,,40859220.0,,,,,-95.504777,29.688916


In [None]:
# Number of missing values per column after the CSV round trip
final_df.isna().sum()

Occurrence Date            0
Occurrence Hour            0
NIBRS Description          0
Offense Count          10310
Beat                     950
Premise                33533
Block Range           646906
Street Name                1
Street Type            97251
Suffix               1073057
Incident             1195815
NIBRS Class          1195815
City                 1195815
ZIP Code             1209337
Street No            1664735
MapLongitude         1915003
MapLatitude          1915003
dtype: int64