In [7]:
import pandas as pd

# Load the datasets (update paths to local files)
arrests_df = pd.read_csv('arrests.csv')
crimes_df = pd.read_csv('crimes.csv')

# Standardize column names for the merge
arrests_df.rename(columns={'ARREST_BORO': 'BORO_NM', 'ARREST_DATE': 'CMPLNT_FR_DT'}, inplace=True)

# Map abbreviations to full borough names
borough_mapping = {
    'K': 'BROOKLYN',
    'Q': 'QUEENS',
    'S': 'STATEN ISLAND',
    'B': 'BRONX',
    'M': 'MANHATTAN'
}

# Apply the mapping to the Arrests DataFrame
arrests_df['BORO_NM'] = arrests_df['BORO_NM'].map(borough_mapping)

# Convert dates to datetime format, with error handling for out-of-bounds dates
arrests_df['CMPLNT_FR_DT'] = pd.to_datetime(arrests_df['CMPLNT_FR_DT'], format='%m/%d/%Y')
crimes_df['CMPLNT_FR_DT'] = pd.to_datetime(crimes_df['CMPLNT_FR_DT'], format='%m/%d/%Y', errors='coerce')

# Drop rows with invalid or missing dates
crimes_df_clean = crimes_df.dropna(subset=['CMPLNT_FR_DT'])

# Round Latitude and Longitude to 4 decimal places
crimes_df_clean = crimes_df_clean.copy()  # Create a copy to avoid SettingWithCopyWarning
crimes_df_clean.loc[:, 'Latitude'] = crimes_df_clean['Latitude'].round(4)
crimes_df_clean.loc[:, 'Longitude'] = crimes_df_clean['Longitude'].round(4)

# Convert only the date part for matching (ignore time)
arrests_df['CMPLNT_FR_DT'] = arrests_df['CMPLNT_FR_DT'].dt.date
crimes_df_clean['CMPLNT_FR_DT'] = crimes_df_clean['CMPLNT_FR_DT'].dt.date

# Check unique values in the joining fields for both DataFrames
print("Unique BORO_NM in Arrests:")
print(arrests_df['BORO_NM'].unique())

print("\nUnique BORO_NM in Crimes:")
print(crimes_df_clean['BORO_NM'].unique())

# Merge the datasets on borough, date, and location (latitude, longitude)
merged_df = pd.merge(
    arrests_df,
    crimes_df_clean,
    on=['BORO_NM', 'CMPLNT_FR_DT', 'Latitude', 'Longitude'],
    how='inner'  # Inner join to get only matching records
)

# Display the rows of the merged data
print("\nMerged DataFrame:")
print(merged_df)

# Check if the merged DataFrame is empty
if merged_df.empty:
    print("The merged DataFrame is empty. Please check the unique values printed above for discrepancies.")
else:
    print("Merge was successful, data is available.")

Unique BORO_NM in Arrests:
['BROOKLYN' 'QUEENS' 'STATEN ISLAND' 'BRONX' 'MANHATTAN']

Unique BORO_NM in Crimes:
['BROOKLYN' 'MANHATTAN' 'BRONX' 'QUEENS' 'STATEN ISLAND' '(null)']

Merged DataFrame:
   ARREST_KEY CMPLNT_FR_DT  PD_CD_x                    PD_DESC_x  KY_CD_x  \
0   287966413   2024-06-04    515.0  CONTROLLED SUBSTANCE,SALE 3    117.0   
1   288845944   2024-06-20    515.0  CONTROLLED SUBSTANCE,SALE 3    117.0   

       OFNS_DESC_x    LAW_CODE LAW_CAT_CD_x   BORO_NM  ARREST_PRECINCT  ...  \
0  DANGEROUS DRUGS  PL 2203901            F  BROOKLYN               73  ...   
1  DANGEROUS DRUGS  PL 2203901            F  BROOKLYN               73  ...   

   SUSP_RACE SUSP_SEX TRANSIT_DISTRICT VIC_AGE_GROUP  VIC_RACE  VIC_SEX  \
0      BLACK        M              NaN       UNKNOWN   UNKNOWN        E   
1      BLACK        M              NaN       UNKNOWN   UNKNOWN        E   

   X_COORD_CD_y  Y_COORD_CD_y              Lat_Lon  New Georeferenced Column_y  
0     1011131.0      1827