In [1]:
import pandas as pd

# Load the raw complaint ("crimes") and arrest extracts from the data directory.
crimes = pd.read_csv("data/crimes.csv")
arrests = pd.read_csv("data/arrests.csv")

# Column labels present in both frames are the candidate join keys.
shared_columns = crimes.columns.intersection(arrests.columns)
common_columns_list = shared_columns.tolist()
print(common_columns_list)



['JURISDICTION_CODE', 'KY_CD', 'LAW_CAT_CD', 'OFNS_DESC', 'PD_CD', 'PD_DESC', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'New Georeferenced Column']


In [2]:
# The full datasets are very large, so keep only one offense category
# from each frame before doing any further work.
offense_category = 'OFFENSES AGAINST PUBLIC ADMINI'

# Complaints in the chosen category.
is_admin_crime = crimes['OFNS_DESC'].eq(offense_category)
crimes_offenses_admin = crimes.loc[is_admin_crime]
print(crimes_offenses_admin.head())

# Arrests in the same category.
is_admin_arrest = arrests['OFNS_DESC'].eq(offense_category)
arrests_offenses_admin = arrests.loc[is_admin_arrest]
print(arrests_offenses_admin.head())


     Unnamed: 0 CMPLNT_NUM  ADDR_PCT_CD    BORO_NM CMPLNT_FR_DT CMPLNT_FR_TM  \
89        42045  287132279           14  MANHATTAN   05/19/2024     19:50:00   
178       66225  281176073           17  MANHATTAN   01/26/2024     16:40:00   
180       69158  284128850          114     QUEENS   03/21/2024     12:20:00   
192       54537  288620433           28  MANHATTAN   06/16/2024     22:41:00   
458       39718  289042360           19  MANHATTAN   06/18/2024     23:00:00   

    CMPLNT_TO_DT CMPLNT_TO_TM CRM_ATPT_CPTD_CD HADEVELOPT  ...  \
89    05/19/2024     20:00:00        COMPLETED     (null)  ...   
178   01/26/2024     16:45:00        COMPLETED     (null)  ...   
180   03/21/2024     12:24:00        COMPLETED     (null)  ...   
192   06/16/2024     23:10:00        COMPLETED     (null)  ...   
458   06/19/2024     00:00:00        COMPLETED     (null)  ...   

     TRANSIT_DISTRICT  VIC_AGE_GROUP VIC_RACE  VIC_SEX X_COORD_CD Y_COORD_CD  \
89                NaN        UNKNOWN  UNKN

In [23]:
# Attach complaint records to each arrest. A row pair matches only when the
# offense description (PD_DESC) AND the perpetrator/suspect age group, race and
# sex all agree; how="left" keeps arrests that have no matching complaint.
# NOTE(review): these keys describe a category of person/offense rather than a
# single incident, so one arrest can match many complaints - confirm that this
# linkage is what the analysis intends.
arrest_match_keys = ['PD_DESC', 'AGE_GROUP', 'PERP_RACE', 'PERP_SEX']
crime_match_keys = ['PD_DESC', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX']

offenses_admin_arrests_crimes = arrests_offenses_admin.merge(
    crimes_offenses_admin,
    how="left",
    left_on=arrest_match_keys,
    right_on=crime_match_keys,
    suffixes=["_ar", "_cr"],
)

# Keep a single row per arrest (the first match encountered).
# drop_duplicates returns a NEW frame, so the result has to be assigned back;
# otherwise the variable would still hold every duplicated row.
offenses_admin_arrests_crimes = offenses_admin_arrests_crimes.drop_duplicates(subset="ARREST_KEY")

# Last expression of the cell: display the de-duplicated result.
offenses_admin_arrests_crimes

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD_ar,PD_DESC,KY_CD_ar,OFNS_DESC_ar,LAW_CODE,LAW_CAT_CD_ar,ARREST_BORO,ARREST_PRECINCT,...,TRANSIT_DISTRICT,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD_cr,Y_COORD_CD_cr,Latitude_cr,Longitude_cr,Lat_Lon,New Georeferenced Column_cr
0,286561632,05/08/2024,744.0,BAIL JUMPING 3,359.0,OFFENSES AGAINST PUBLIC ADMINI,PL 2155500,M,Q,103,...,,UNKNOWN,UNKNOWN,E,1039647.0,195586.0,40.703342,-73.800203,"(40.70334171028012, -73.80020318978802)",POINT (-73.80020318978802 40.70334171028012)
12,282015653,02/10/2024,748.0,"CONTEMPT,CRIMINAL",359.0,OFFENSES AGAINST PUBLIC ADMINI,PL 2155003,M,B,44,...,,65+,ASIAN / PACIFIC ISLANDER,F,1027140.0,226922.0,40.789422,-73.845111,"(40.789422, -73.845111)",POINT (-73.845111 40.789422)
16,280084154,01/07/2024,748.0,"CONTEMPT,CRIMINAL",359.0,OFFENSES AGAINST PUBLIC ADMINI,PL 2155003,M,K,63,...,,18-24,UNKNOWN,F,960782.0,166273.0,40.623027,-74.084537,"(40.623027, -74.084537)",POINT (-74.084537 40.623027)
35,280604387,01/16/2024,759.0,"PUBLIC ADMINISTATION,UNCLASS M",359.0,OFFENSES AGAINST PUBLIC ADMINI,PL 1950500,M,B,47,...,,UNKNOWN,UNKNOWN,E,1004170.0,172827.0,40.641027,-73.928223,"(40.641027, -73.928223)",POINT (-73.928223 40.641027)
172,280051375,01/06/2024,759.0,"PUBLIC ADMINISTATION,UNCLASS M",359.0,OFFENSES AGAINST PUBLIC ADMINI,PL 1950500,M,M,1,...,,UNKNOWN,UNKNOWN,E,995232.0,177023.0,40.652557,-73.960422,"(40.65255706948917, -73.96042210748048)",POINT (-73.96042210748048 40.65255706948917)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201905,289273485,06/28/2024,750.0,RESISTING ARREST,359.0,OFFENSES AGAINST PUBLIC ADMINI,PL 2053000,M,M,14,...,,UNKNOWN,UNKNOWN,E,990312.0,210966.0,40.745727,-73.978123,"(40.74572740472728, -73.97812268196289)",POINT (-73.97812268196289 40.74572740472728)
201907,288988004,06/23/2024,759.0,"PUBLIC ADMINISTATION,UNCLASS M",359.0,OFFENSES AGAINST PUBLIC ADMINI,PL 1950500,M,K,77,...,,<18,ASIAN / PACIFIC ISLANDER,M,1054984.0,193094.0,40.696395,-73.744911,"(40.696395, -73.744911)",POINT (-73.744911 40.696395)
201914,289349537,06/30/2024,750.0,RESISTING ARREST,359.0,OFFENSES AGAINST PUBLIC ADMINI,PL 2053000,M,Q,103,...,,25-44,BLACK,M,1006964.0,245292.0,40.839916,-73.917910,"(40.839916, -73.91791)",POINT (-73.91791 40.839916)
201939,288147809,06/07/2024,744.0,BAIL JUMPING 3,359.0,OFFENSES AGAINST PUBLIC ADMINI,PL 2155500,M,Q,102,...,4.0,UNKNOWN,UNKNOWN,E,998736.0,227214.0,40.790314,-73.947686,"(40.7903141514773, -73.9476860633721)",POINT (-73.9476860633721 40.7903141514773)
