In [79]:
import pandas as pd

# Load the raw crime export.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists; consider a path relative to the notebook.
data = pd.read_csv('C:/Users/Sebastian/Desktop/Hack404/backend/exploration/major_crime_data.csv')

# FOI: the only columns the rest of the notebook keeps, in this order.
FOI = [
    'OCC_DOW', 'OCC_HOUR',
    'UCR_CODE', 'UCR_EXT', 'MCI_CATEGORY',
    'HOOD_158', 'NEIGHBOURHOOD_158',
    'LONG_WGS84', 'LAT_WGS84',
]

# Narrow to those columns and pin their dtypes in one chained step.
# HOOD_158 is cast to str so that it can be compared with the 'NSA' marker below.
data = data.loc[:, FOI].astype({
    'HOOD_158': str,
    'OCC_HOUR': int,
    'UCR_CODE': int,
    'UCR_EXT': int,
    'LONG_WGS84': float,
    'LAT_WGS84': float,
})

# Discard every record whose neighbourhood id is the literal 'NSA'.
data = data.loc[data['HOOD_158'].ne('NSA')]
data.head()

Unnamed: 0,OCC_DOW,OCC_HOUR,UCR_CODE,UCR_EXT,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,LONG_WGS84,LAT_WGS84
0,Thursday,0,1420,110,Assault,120,Clairlea-Birchmount (120),-79.286725,43.7001
1,Wednesday,3,1410,100,Assault,81,Trinity-Bellwoods (81),-79.410994,43.649686
2,Wednesday,0,1420,100,Assault,55,Thorncliffe Park (55),-79.346615,43.703234
3,Wednesday,1,1430,100,Assault,143,West Rouge (143),-79.132915,43.780413
4,Wednesday,1,1430,100,Assault,143,West Rouge (143),-79.132915,43.780413


In [80]:
# Keep only the rows whose (longitude, latitude) pair occurs exactly once in `data`.
# NOTE(review): keep=False marks *every* occurrence of a repeated coordinate pair
# as a duplicate, so coordinates shared by several records are removed entirely
# instead of being reduced to one representative row. Confirm this is intended;
# keep='first' would retain one row per distinct coordinate.
repeated_coord = data.duplicated(subset=['LONG_WGS84', 'LAT_WGS84'], keep=False)
unique_coords = data[~repeated_coord]
unique_coords.head()

Unnamed: 0,OCC_DOW,OCC_HOUR,UCR_CODE,UCR_EXT,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,LONG_WGS84,LAT_WGS84
1787,Thursday,12,2120,200,Break and Enter,39,Bedford Park-Nortown (39),-79.41469,43.749232
2104,Friday,16,2120,200,Break and Enter,107,Oakwood Village (107),-79.431891,43.68357
2200,Tuesday,21,2120,200,Break and Enter,16,Stonegate-Queensway (16),-79.503583,43.637643
2210,Wednesday,14,2120,200,Break and Enter,11,Eringate-Centennial-West Deane (11),-79.572871,43.664041
2603,Saturday,19,2120,200,Break and Enter,120,Clairlea-Birchmount (120),-79.283781,43.705473


In [None]:
import numpy as np
# Longitudes of the rows in unique_coords, sorted ascending (sort_values' default
# order); each value keeps its original index label.
SORTED_LONGS = unique_coords['LONG_WGS84'].sort_values()

def get_closest(df: pd.DataFrame, long):
    """Return the entry of ``df`` whose value lies nearest to ``long``.

    "Nearest" means the smallest absolute difference; when several entries are
    equally near, the one that comes first positionally is returned (this is
    how ``np.argmin`` breaks ties).

    NOTE(review): the parameter is annotated as a DataFrame, but the calls in
    this notebook pass a one-dimensional Series of coordinates, in which case
    a single scalar value is returned.
    """
    gaps = np.absolute(df.values - long)
    return df.iloc[np.argmin(gaps)]

def get_closest_intersection(long, lat):
    """Return the rows of ``data`` recorded at the coordinate nearest to (long, lat).

    The lookup is done in two stages rather than by true 2-D distance:
    first the nearest longitude is picked from ``SORTED_LONGS``, then, among
    the rows of ``data`` that have exactly that longitude, the nearest
    latitude is picked. Every row of ``data`` matching both values is returned.
    """
    closest_long = get_closest(SORTED_LONGS, long)

    # Rows sharing the chosen longitude; reused for the latitude search and the final filter.
    on_meridian = data['LONG_WGS84'] == closest_long
    closest_lat = get_closest(data.loc[on_meridian, 'LAT_WGS84'], lat)

    return data[on_meridian & (data['LAT_WGS84'] == closest_lat)]

def get_hood_records(row):
    """Return every row of ``data`` in the same neighbourhood as ``row``.

    ``row`` is expected to be a frame-like selection (such as the result of
    ``get_closest_intersection``); the neighbourhood id is read from the first
    entry of its ``HOOD_158`` column, so an empty selection raises.
    """
    target_hood = row['HOOD_158'].iloc[0]
    same_hood = data['HOOD_158'] == target_hood
    return data[same_hood]

# row = get_closest_intersection(-79.4, 43.2)
# hood_records = get_hood_records(row)

# Per-MCI_CATEGORY weights consumed by calc_hood_risk: each record contributes
# the weight of its category to the neighbourhood's weighted score. A category
# that is missing from this mapping falls back to 0.1 inside calc_hood_risk.
category_weights = {
    'Assault': 0.3,
    'Break and Enter': 0.1,
    'Auto Theft': 0.05,
    'Robbery': 0.4,
    'Theft Over': 0.15
}

# Per-UCR_CODE weights consumed by calc_hood_risk, which multiplies each
# record's weight by 1.5 and falls back to 0.05 for any code not listed here.
# NOTE(review): the per-code descriptions and occurrence counts in the comments
# below are not verified by anything in this notebook, and several descriptions
# repeat across different codes. The value_counts() output at the end of the
# notebook pairs UCR_CODE 2135 with MCI_CATEGORY 'Auto Theft', which contradicts
# the "Break and Enter" label given to 2135 here. Confirm every label (and the
# section groupings) against the dataset's UCR code table before relying on them.
ucr_weights = {
    # Assault codes (1400-1499) - High kidnapping risk
    1410: 0.25,  # Assault with weapon (3,121 occurrences)
    1420: 0.3,   # Assault causing bodily harm (49,154 occurrences) - Most common assault
    1430: 0.35,  # Aggravated assault (157,632 occurrences) - Most common crime overall
    1440: 0.4,   # Sexual assault (20 occurrences) - Very high risk
    1450: 0.45,  # Sexual assault with weapon (4,561 occurrences) - Very high risk
    1455: 0.5,   # Sexual assault causing bodily harm (230 occurrences) - Very high risk
    1457: 0.55,  # Aggravated sexual assault (1,558 occurrences) - Extremely high risk
    1460: 0.6,   # Sexual assault with weapon causing bodily harm (7,482 occurrences) - Extremely high risk
    1461: 0.65,  # Aggravated sexual assault with weapon (846 occurrences) - Extremely high risk
    1462: 0.7,   # Sexual assault causing bodily harm with weapon (31 occurrences) - Extremely high risk
    1470: 0.75,  # Aggravated sexual assault causing bodily harm (149 occurrences) - Extremely high risk
    1475: 0.8,   # Sexual assault with weapon causing bodily harm (2 occurrences) - Extremely high risk
    1480: 0.85,  # Aggravated sexual assault with weapon causing bodily harm (4,451 occurrences) - Extremely high risk
    
    # Break and Enter codes (2100-2199) - Lower kidnapping risk
    2120: 0.1,   # Break and Enter (78,584 occurrences) - Most common property crime
    2121: 0.12,  # Break and Enter with weapon (22 occurrences) - Slightly higher risk
    2125: 0.15,  # Break and Enter causing bodily harm (13 occurrences) - Higher risk
    2130: 0.08,  # Break and Enter (10,461 occurrences) - Standard B&E
    2132: 0.1,   # Break and Enter with weapon (3,597 occurrences) - Slightly higher risk
    2133: 0.12,  # Break and Enter causing bodily harm (749 occurrences) - Higher risk
    2135: 0.08,  # Break and Enter (70,163 occurrences) - Standard B&E  (NOTE(review): notebook output lists 2135 under 'Auto Theft')
    
    # Auto Theft codes (1600-1699) - Very low kidnapping risk
    1610: 0.05,  # Auto Theft (37,786 occurrences) - Most common auto theft
    1611: 0.06,  # Auto Theft with weapon (2 occurrences) - Slightly higher risk
}

def calc_hood_risk(hood_record):
    """Compute a risk score (at most 100, rounded to 2 decimals) for a set of crime records.

    ``hood_record`` must provide the columns ``MCI_CATEGORY``, ``UCR_CODE``,
    ``OCC_HOUR`` and ``OCC_DOW``. The score is assembled as follows:

    1. Severity: every record adds its ``category_weights`` entry (0.1 if the
       category is not listed) plus 1.5 times its ``ucr_weights`` entry (0.05
       if the code is not listed).
    2. The severity sum is averaged over the number of records and multiplied
       by 40.
    3. That value is multiplied by a volume factor ``1 + records / 50``, which
       is capped at 3.0.
    4. Two small time-of-occurrence terms are added: the share of records at
       hours 22-23 or 0-5 times 0.3, and the share of records on Friday,
       Saturday or Sunday times 0.2 (so together they add at most 0.5).

    The total is clamped to 100. An input with no records scores 0.
    """
    total_crimes = len(hood_record)
    has_records = total_crimes > 0

    # Severity accumulated first per MCI category, then per UCR code.
    weighted_score = 0.0
    category_tally = hood_record['MCI_CATEGORY'].value_counts()
    for category, count in category_tally.items():
        weighted_score += category_weights.get(category, 0.1) * count

    code_tally = hood_record['UCR_CODE'].value_counts()
    for ucr_code, count in code_tally.items():
        # The UCR contribution is scaled by 1.5 relative to the category contribution.
        weighted_score += ucr_weights.get(ucr_code, 0.05) * count * 1.5

    # Volume multiplier: grows with the record count and is capped at 3.0.
    crime_rate_factor = min(3.0, 1 + (total_crimes / 50))

    # Share of records in the late-night / early-morning hours.
    is_night = hood_record['OCC_HOUR'].isin([22, 23, 0, 1, 2, 3, 4, 5])
    night_crimes = int(is_night.sum())
    if has_records:
        night_factor = (night_crimes / total_crimes) * 0.3
    else:
        night_factor = 0

    # Share of records on Friday, Saturday or Sunday.
    is_weekend = hood_record['OCC_DOW'].isin(['Friday', 'Saturday', 'Sunday'])
    weekend_crimes = int(is_weekend.sum())
    if has_records:
        weekend_factor = (weekend_crimes / total_crimes) * 0.2
    else:
        weekend_factor = 0

    # Average severity per record (max(..., 1) avoids dividing by zero), times 40.
    base_score = (weighted_score / max(total_crimes, 1)) * 40
    rate_adjusted_score = base_score * crime_rate_factor

    final_score = min(100, rate_adjusted_score + night_factor + weekend_factor)
    return round(final_score, 2)

import json

# Distinct neighbourhood ids, in sorted order of their string form.
hoods = sorted(data['HOOD_158'].unique())

# Raw score per neighbourhood, tracking the extremes as we go. The trackers
# start at 0 and 100, so max_risk never drops below 0 and min_risk never
# rises above 100.
risks = {}
max_risk, min_risk = 0, 100
for hood in hoods:
    in_hood = data['HOOD_158'] == hood
    risk = calc_hood_risk(data[in_hood])
    risks[hood] = risk
    max_risk = max(risk, max_risk)
    min_risk = min(risk, min_risk)

# Linearly rescale the raw scores so that min_risk maps to 5 and max_risk to 99,
# rounding to whole numbers. Only existing keys are reassigned, so the dict can
# be updated while it is being iterated.
# NOTE(review): if every neighbourhood has the same raw score, min_risk equals
# max_risk and the interpolation range is degenerate; confirm the desired output.
for key, value in risks.items():
    risks[key] = round(np.interp(value, [min_risk, max_risk], [5, 99]))

# Persist the rescaled scores, keyed by neighbourhood id.
with open('neighbourhood_risks.json', 'w') as f:
    json.dump(risks, f, indent=2)






In [101]:
# One [latitude, longitude, neighbourhood id] triple per row of unique_coords,
# built column-wise instead of materialising each row.
# Relies on `json` having been imported by an earlier cell.
coord_hoods = [
    [lat, lon, hood]
    for lat, lon, hood in zip(
        unique_coords['LAT_WGS84'],
        unique_coords['LONG_WGS84'],
        unique_coords['HOOD_158'],
    )
]

with open('coords_hood_map.json', 'w') as f:
    json.dump(coord_hoods, f, indent=2)

In [98]:
# Count how often each distinct full row (all columns taken together) occurs in unique_coords.
unique_coords.value_counts()

OCC_DOW     OCC_HOUR  UCR_CODE  UCR_EXT  MCI_CATEGORY     HOOD_158  NEIGHBOURHOOD_158                     LONG_WGS84  LAT_WGS84
Wednesday   23        2135      210      Auto Theft       133       Centennial Scarborough (133)          -79.143100  43.795010    1
Friday      0         1430      100      Assault          100       Yonge-Eglinton (100)                  -79.405342  43.703254    1
                                                          111       Rockcliffe-Smythe (111)               -79.498951  43.669992    1
                                                          160       Mimico-Queensway (160)                -79.491535  43.612657    1
                      2120      200      Break and Enter  007       Willowridge-Martingrove-Richview (7)  -79.541382  43.682903    1
                                                                                                                                  ..
            1         2135      210      Auto Theft       158       Isling