In [1]:
import pandas as pd

# Step 1: Initial cleaning and removing columns with too many nulls
df = pd.read_csv("sqf-2023.csv", na_values=["(null)"], low_memory=False)

# Remove columns with more than 40% missing values
threshold = 0.4 * len(df)
columns_to_remove = df.columns[df.isnull().sum() > threshold]
df_cleaned = df.loc[:, df.isnull().sum() <= threshold]

# Save initial cleaned data
df_cleaned.to_csv("sqf-2023-cleaned.csv", index=False)
print(f"Columns removed ({len(columns_to_remove)}): {columns_to_remove.tolist()}")
print(f"Remaining columns: {len(df_cleaned.columns)}")

# Step 2: Fill missing values
df_cleaned = pd.read_csv("sqf-2023-cleaned.csv", na_values=["(null)", ""], low_memory=False)

for col in df_cleaned.columns:
    if pd.api.types.is_numeric_dtype(df_cleaned[col]):
        mean_value = df_cleaned[col].mean()
        df_cleaned[col].fillna(mean_value, inplace=True)
        df_cleaned[col] = df_cleaned[col].round(2)
    else:
        mode = df_cleaned[col].mode()
        if not mode.empty:
            df_cleaned[col].fillna(mode[0], inplace=True)

# Binary encoding for specific columns
yes_no_columns = [
    'SUPERVISING_ACTION_CORRESPONDING_ACTIVITY_LOG_ENTRY_REVIEWED', 'OFFICER_EXPLAINED_STOP_FLAG',
    'OTHER_PERSON_STOPPED_FLAG', 'SUSPECT_ARRESTED_FLAG', 'SUMMONS_ISSUED_FLAG',
    'OFFICER_IN_UNIFORM_FLAG', 'FRISKED_FLAG', 'SEARCHED_FLAG', 'ASK_FOR_CONSENT_FLG',
    'CONSENT_GIVEN_FLG', 'OTHER_CONTRABAND_FLAG', 'WEAPON_FOUND_FLAG',
    'PHYSICAL_FORCE_VERBAL_INSTRUCTION_FLAG'
]
for col in yes_no_columns:
    if col in df_cleaned.columns:
        df_cleaned[col] = df_cleaned[col].map({'Y': 1, 'N': 0})

# Save updated data
df_cleaned.to_csv("sqf-2023-final-cleaned.csv", index=False)
print(f"Total remaining missing values: {df_cleaned.isnull().sum().sum()}")

# Step 3: Feature engineering
# Categorize time of day
def categorize_time(time_str):
    if time_str == "00:00:00":
        return 'Night'
    hour = int(time_str.split(':')[0])
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    return 'Night'

df_cleaned['TIME_OF_DAY'] = df_cleaned['STOP_FRISK_TIME'].apply(categorize_time)

# Categorize stop duration
def categorize_duration(duration):
    if duration <= 5:
        return 'Short'
    elif 6 <= duration <= 15:
        return 'Medium'
    return 'Long'

df_cleaned['STOP_DURATION_CATEGORY'] = df_cleaned['STOP_DURATION_MINUTES'].apply(categorize_duration)

# Calculate BMI
def feet_to_meters(feet):
    return feet * 0.3048

def lbs_to_kg(lbs):
    return lbs * 0.453592

df_cleaned['SUSPECT_WEIGHT_KG'] = df_cleaned['SUSPECT_WEIGHT'].apply(lbs_to_kg)
df_cleaned['SUSPECT_HEIGHT_M'] = df_cleaned['SUSPECT_HEIGHT'].apply(feet_to_meters)
df_cleaned['SUSPECT_BMI'] = (df_cleaned['SUSPECT_WEIGHT_KG'] / df_cleaned['SUSPECT_HEIGHT_M']**2).round(2)

def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 25:
        return 'Healthy Weight'
    elif 25 <= bmi < 30:
        return 'Overweight'
    elif 30 <= bmi < 35:
        return 'Class 1 Obesity'
    elif 35 <= bmi < 40:
        return 'Class 2 Obesity'
    return 'Class 3 Obesity'

df_cleaned['BMI_CATEGORY'] = df_cleaned['SUSPECT_BMI'].apply(categorize_bmi)

# Demeanor scoring
demeanor_mapping = {
    'CALM': 1, 'COOPERATIVE': 1, 'EVASIVE': -1,
    'UPSET': -1, 'YELLING': -2, 'VIGILANT': 0
}
df_cleaned['DEMEANOR_SCORE'] = df_cleaned['DEMEANOR_OF_PERSON_STOPPED'].map(demeanor_mapping).fillna(0)

# Compliance score
df_cleaned['COMPLIANCE_SCORE'] = (
    df_cleaned['DEMEANOR_SCORE'] +
    df_cleaned['CONSENT_GIVEN_FLG'] -
    df_cleaned['OTHER_CONTRABAND_FLAG']
)

# Combine frisk and arrest flags
df_cleaned['FRISKED_AND_ARRESTED'] = (
    df_cleaned['FRISKED_FLAG'] & df_cleaned['SUSPECT_ARRESTED_FLAG']
).astype(int)

def combine_flags_and(frisked, arrested):
    if frisked and arrested:
        return 'Frisked and Arrested'
    elif frisked:
        return 'Frisked Only'
    elif arrested:
        return 'Arrested Only'
    return 'Neither'

df_cleaned['FRISKED_AND_ARRESTED_CAT'] = df_cleaned.apply(
    lambda row: combine_flags_and(row['FRISKED_FLAG'], row['SUSPECT_ARRESTED_FLAG']), axis=1
)

# Save final cleaned data with features
df_cleaned.to_csv("sqf-2023-cleaned-with-features.csv", index=False)
print("Final cleaned dataset saved.")
print(df_cleaned[['TIME_OF_DAY', 'STOP_DURATION_CATEGORY', 'SUSPECT_BMI', 
                  'BMI_CATEGORY', 'DEMEANOR_SCORE', 'COMPLIANCE_SCORE',
                  'FRISKED_AND_ARRESTED', 'FRISKED_AND_ARRESTED_CAT']].head(10))


Columns removed (36): ['JURISDICTION_CODE', 'JURISDICTION_DESCRIPTION', 'OFFICER_NOT_EXPLAINED_STOP_DESCRIPTION', 'SUSPECT_ARREST_OFFENSE', 'SUMMONS_OFFENSE_DESCRIPTION', 'ID_CARD_IDENTIFIES_OFFICER_FLAG', 'SHIELD_IDENTIFIES_OFFICER_FLAG', 'VERBAL_IDENTIFIES_OFFICER_FLAG', 'FIREARM_FLAG', 'KNIFE_CUTTER_FLAG', 'OTHER_WEAPON_FLAG', 'PHYSICAL_FORCE_CEW_FLAG', 'PHYSICAL_FORCE_DRAW_POINT_FIREARM_FLAG', 'PHYSICAL_FORCE_HANDCUFF_SUSPECT_FLAG', 'PHYSICAL_FORCE_OC_SPRAY_USED_FLAG', 'PHYSICAL_FORCE_OTHER_FLAG', 'PHYSICAL_FORCE_RESTRAINT_USED_FLAG', 'PHYSICAL_FORCE_WEAPON_IMPACT_FLAG', 'BACKROUND_CIRCUMSTANCES_VIOLENT_CRIME_FLAG', 'BACKROUND_CIRCUMSTANCES_SUSPECT_KNOWN_TO_CARRY_WEAPON_FLAG', 'SUSPECTS_ACTIONS_CASING_FLAG', 'SUSPECTS_ACTIONS_CONCEALED_POSSESSION_WEAPON_FLAG', 'SUSPECTS_ACTIONS_DECRIPTION_FLAG', 'SUSPECTS_ACTIONS_DRUG_TRANSACTIONS_FLAG', 'SUSPECTS_ACTIONS_IDENTIFY_CRIME_PATTERN_FLAG', 'SUSPECTS_ACTIONS_LOOKOUT_FLAG', 'SUSPECTS_ACTIONS_OTHER_FLAG', 'SUSPECTS_ACTIONS_PROXIMITY_TO_SCE

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[col].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[col].fillna(mode[0], inplace=True)


Total remaining missing values: 0
Final cleaned dataset saved.
  TIME_OF_DAY STOP_DURATION_CATEGORY  SUSPECT_BMI     BMI_CATEGORY  \
0       Night                 Medium        19.68   Healthy Weight   
1       Night                  Short        22.54   Healthy Weight   
2     Morning                  Short        22.48   Healthy Weight   
3       Night                  Short        37.30  Class 2 Obesity   
4     Morning                 Medium        25.77       Overweight   
5     Evening                   Long        19.03   Healthy Weight   
6       Night                  Short        28.46       Overweight   
7       Night                  Short        20.76   Healthy Weight   
8       Night                   Long        27.63       Overweight   
9     Evening                 Medium        37.54  Class 2 Obesity   

   DEMEANOR_SCORE  COMPLIANCE_SCORE  FRISKED_AND_ARRESTED  \
0             0.0               0.0                     0   
1             0.0               0.0         

In [2]:
#modeling base

X = df_cleaned[[
    'AGE_GROUP_18-30', 
    'AGE_GROUP_31-45',
    'AGE_GROUP_46-60', 
    'AGE_GROUP_60+', 
    'SUSPECT_SEX_FEMALE',
    'SUSPECT_SEX_MALE',
    'SUSPECT_RACE_DESCRIPTION_AMERICAN INDIAN/ALASKAN NATIVE',
    'SUSPECT_RACE_DESCRIPTION_ASIAN / PACIFIC ISLANDER',
    'SUSPECT_RACE_DESCRIPTION_BLACK',
    'SUSPECT_RACE_DESCRIPTION_BLACK HISPANIC',
    'SUSPECT_RACE_DESCRIPTION_MIDDLE EASTERN/SOUTHWEST ASIAN',
    'SUSPECT_RACE_DESCRIPTION_WHITE',
    'SUSPECT_RACE_DESCRIPTION_WHITE HISPANIC',
    'TIME_OF_DAY_Evening',
    'TIME_OF_DAY_Morning', 
    'TIME_OF_DAY_Night',
    'STOP_DURATION_MINUTES',
    'STOP_LOCATION_BORO_NAME_BRONX',
    'STOP_LOCATION_BORO_NAME_BROOKLYN',
    'STOP_LOCATION_BORO_NAME_MANHATTAN',
    'STOP_LOCATION_BORO_NAME_QUEENS',
    'STOP_LOCATION_BORO_NAME_STATEN ISLAND',
    'OFFICER_EXPLAINED_STOP_FLAG',
    'OFFICER_IN_UNIFORM_FLAG',
    'OTHER_PERSON_STOPPED_FLAG',
    'DEMEANOR_SCORE',
    'COMPLIANCE_SCORE',
    'SUSPECTED_CRIME_DESCRIPTION_ASSAULT',
    'SUSPECTED_CRIME_DESCRIPTION_AUTO STRIPPIG',
    'SUSPECTED_CRIME_DESCRIPTION_BURGLARY',
    'SUSPECTED_CRIME_DESCRIPTION_CPSP',
    'SUSPECTED_CRIME_DESCRIPTION_CPW',
    'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL MISCHIEF',
    'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL POSSESSION OF CONTROLLED SUBSTANCE',
    'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL POSSESSION OF FORGED INSTRUMENT',
    'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL SALE OF CONTROLLED SUBSTANCE',
    'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL SALE OF MARIHUANA',
    'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL TRESPASS',
    'SUSPECTED_CRIME_DESCRIPTION_FORCIBLE TOUCHING',
    'SUSPECTED_CRIME_DESCRIPTION_GRAND LARCENY',
    'SUSPECTED_CRIME_DESCRIPTION_GRAND LARCENY AUTO',
    'SUSPECTED_CRIME_DESCRIPTION_MAKING GRAFFITI',
    'SUSPECTED_CRIME_DESCRIPTION_MENACING',
    'SUSPECTED_CRIME_DESCRIPTION_MURDER',
    'SUSPECTED_CRIME_DESCRIPTION_OTHER',
    'SUSPECTED_CRIME_DESCRIPTION_PETIT LARCENY',
    'SUSPECTED_CRIME_DESCRIPTION_RAPE',
    'SUSPECTED_CRIME_DESCRIPTION_RECKLESS ENDANGERMENT',
    'SUSPECTED_CRIME_DESCRIPTION_ROBBERY',
    'SUSPECTED_CRIME_DESCRIPTION_TERRORISM',
    'SUSPECTED_CRIME_DESCRIPTION_THEFT OF SERVICES',
    'SUSPECTED_CRIME_DESCRIPTION_UNAUTHORIZED USE OF A VEHICLE'
]]


y = df_cleaned['FRISKED_AND_ARRESTED']


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


KeyError: "['AGE_GROUP_18-30', 'AGE_GROUP_31-45', 'AGE_GROUP_46-60', 'AGE_GROUP_60+', 'SUSPECT_SEX_FEMALE', 'SUSPECT_SEX_MALE', 'SUSPECT_RACE_DESCRIPTION_AMERICAN INDIAN/ALASKAN NATIVE', 'SUSPECT_RACE_DESCRIPTION_ASIAN / PACIFIC ISLANDER', 'SUSPECT_RACE_DESCRIPTION_BLACK', 'SUSPECT_RACE_DESCRIPTION_BLACK HISPANIC', 'SUSPECT_RACE_DESCRIPTION_MIDDLE EASTERN/SOUTHWEST ASIAN', 'SUSPECT_RACE_DESCRIPTION_WHITE', 'SUSPECT_RACE_DESCRIPTION_WHITE HISPANIC', 'TIME_OF_DAY_Evening', 'TIME_OF_DAY_Morning', 'TIME_OF_DAY_Night', 'STOP_LOCATION_BORO_NAME_BRONX', 'STOP_LOCATION_BORO_NAME_BROOKLYN', 'STOP_LOCATION_BORO_NAME_MANHATTAN', 'STOP_LOCATION_BORO_NAME_QUEENS', 'STOP_LOCATION_BORO_NAME_STATEN ISLAND', 'SUSPECTED_CRIME_DESCRIPTION_ASSAULT', 'SUSPECTED_CRIME_DESCRIPTION_AUTO STRIPPIG', 'SUSPECTED_CRIME_DESCRIPTION_BURGLARY', 'SUSPECTED_CRIME_DESCRIPTION_CPSP', 'SUSPECTED_CRIME_DESCRIPTION_CPW', 'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL MISCHIEF', 'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL POSSESSION OF CONTROLLED SUBSTANCE', 'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL POSSESSION OF FORGED INSTRUMENT', 'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL SALE OF CONTROLLED SUBSTANCE', 'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL SALE OF MARIHUANA', 'SUSPECTED_CRIME_DESCRIPTION_CRIMINAL TRESPASS', 'SUSPECTED_CRIME_DESCRIPTION_FORCIBLE TOUCHING', 'SUSPECTED_CRIME_DESCRIPTION_GRAND LARCENY', 'SUSPECTED_CRIME_DESCRIPTION_GRAND LARCENY AUTO', 'SUSPECTED_CRIME_DESCRIPTION_MAKING GRAFFITI', 'SUSPECTED_CRIME_DESCRIPTION_MENACING', 'SUSPECTED_CRIME_DESCRIPTION_MURDER', 'SUSPECTED_CRIME_DESCRIPTION_OTHER', 'SUSPECTED_CRIME_DESCRIPTION_PETIT LARCENY', 'SUSPECTED_CRIME_DESCRIPTION_RAPE', 'SUSPECTED_CRIME_DESCRIPTION_RECKLESS ENDANGERMENT', 'SUSPECTED_CRIME_DESCRIPTION_ROBBERY', 'SUSPECTED_CRIME_DESCRIPTION_TERRORISM', 'SUSPECTED_CRIME_DESCRIPTION_THEFT OF SERVICES', 'SUSPECTED_CRIME_DESCRIPTION_UNAUTHORIZED USE OF A VEHICLE'] not in index"