In [1]:
import pandas as pd
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
total_crimes_count = pd.read_csv('..//data/processed_data/crime_total_count.csv')
total_crimes_count

Unnamed: 0.1,Unnamed: 0,HOOD_ID,AREA_NAME,Year,Total_Crimes
0,0,1,West Humber-Clairville,2014,1248.0
1,1,1,West Humber-Clairville,2015,1114.0
2,2,1,West Humber-Clairville,2016,1131.0
3,3,1,West Humber-Clairville,2017,1172.0
4,4,1,West Humber-Clairville,2018,1545.0
...,...,...,...,...,...
1733,1733,174,South Eglinton-Davisville,2020,312.0
1734,1734,174,South Eglinton-Davisville,2021,236.0
1735,1735,174,South Eglinton-Davisville,2022,233.0
1736,1736,174,South Eglinton-Davisville,2023,251.0


In [3]:
total_crimes_count.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1738 entries, 0 to 1737
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1738 non-null   int64  
 1   HOOD_ID       1738 non-null   int64  
 2   AREA_NAME     1738 non-null   object 
 3   Year          1738 non-null   int64  
 4   Total_Crimes  1738 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 68.0+ KB


In [173]:
#total_crimes_count['Year'] = pd.to_datetime(total_crimes_count['Year'], format='%Y')


In [174]:
#total_crimes_count = total_crimes_count.drop('AREA_NAME', axis=1)
#total_crimes_count = total_crimes_count.set_index('Year')

In [4]:
total_crimes_count.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1738 entries, 0 to 1737
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1738 non-null   int64  
 1   HOOD_ID       1738 non-null   int64  
 2   AREA_NAME     1738 non-null   object 
 3   Year          1738 non-null   int64  
 4   Total_Crimes  1738 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 68.0+ KB


In [5]:
all_predictions = pd.DataFrame()
hood_ids = total_crimes_count['HOOD_ID'].unique()

hood_ids

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        15,  16,  18,  19,  20,  21,  22,  23,  24,  25,  27,  28,  29,
        30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
        43,  44,  46,  47,  48,  49,  50,  52,  53,  54,  55,  56,  57,
        58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,
        71,  72,  73,  74,  78,  79,  80,  81,  83,  84,  85,  86,  87,
        88,  89,  90,  91,  92,  94,  95,  96,  97,  98,  99, 100, 101,
       102, 103, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
       116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 128, 129, 130,
       133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145, 146,
       147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
       160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
       173, 174])

In [None]:
# testing with best order
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100

all_predictions = pd.DataFrame()
total_maes = []
total_rmses = []
total_mapes = []

for hood_id in hood_ids:
    neighborhood_data = total_crimes_count[total_crimes_count['HOOD_ID'] == hood_id].copy()
    neighborhood_data = neighborhood_data.set_index('Year')
    neighborhood_data.index = neighborhood_data.index.astype(int)
    total_crimes_series = neighborhood_data['Total_Crimes']
    
    train_data = total_crimes_series.loc[total_crimes_series.index <= 2023]
    test_data = total_crimes_series.loc[total_crimes_series.index > 2023]
    
    auto_model = pm.auto_arima(train_data,
                           start_p=1, start_q=1,
                           max_p=12, max_q=12,
                           m=1, 
                           seasonal=False,
                           d=None, 
                           trace=True,
                           error_action='ignore',
                           suppress_warnings=True,
                           stepwise=True,
                           trend = 't')
    best_order = auto_model.order

    model = ARIMA(train_data, order=best_order)
    fitted_model = model.fit()
    start_year = test_data.index[0]
    end_year = test_data.index[-1]
    predictions = fitted_model.predict(start=start_year, end=end_year)

    mae = mean_absolute_error(test_data, predictions)
    rmse = np.sqrt(mean_squared_error(test_data, predictions))
    mape = mean_absolute_percentage_error(test_data, predictions)

    total_maes.append(mae)
    total_rmses.append(rmse)
    total_mapes.append(mape)

    evaluation_df = predictions.to_frame(name='Predicted_Crimes')
    evaluation_df['HOOD_ID'] = hood_id
    evaluation_df['Actual_Crimes'] = test_data 
    evaluation_df['MAE'] = mae
    evaluation_df['RMSE'] = rmse
    evaluation_df['MAPE'] = mape

    all_predictions = pd.concat([all_predictions, evaluation_df])
    
if total_maes:
   total_mae = np.mean(total_maes)
   total_rmse = np.mean(total_rmses)
   total_mape = np.mean(total_mapes)
   print(all_predictions)
   print(f"\nMean Absolute Error (MAE): {total_mae:.2f}")
   print(f"Root Mean Squared Error (RMSE): {total_rmse:.2f}")
   print(f"Mean Absolute Percentage Error (MAPE): {total_mape:.2f}%")
else:
    print("\nNo models were successfully fitted and evaluated.")

In [7]:
all_predictions.to_csv('ARIMA_prediction_total_crimes_per_year_train.csv')
all_predictions

Unnamed: 0,Predicted_Crimes,HOOD_ID,Actual_Crimes,MAE,RMSE,MAPE
2024,1437.073260,1,1569.0,131.926740,131.926740,8.408333
2024,549.133147,2,619.0,69.866853,69.866853,11.287052
2024,171.668778,3,192.0,20.331222,20.331222,10.589178
2024,189.532583,4,222.0,32.467417,32.467417,14.624963
2024,183.000000,5,225.0,42.000000,42.000000,18.666667
...,...,...,...,...,...,...
2024,1041.148361,170,1071.0,29.851639,29.851639,2.787268
2024,435.740312,171,454.0,18.259688,18.259688,4.021958
2024,260.178643,172,278.0,17.821357,17.821357,6.410560
2024,181.449784,173,218.0,36.550216,36.550216,16.766154


In [8]:
total_mae, total_rmse, total_mape

(53.48286811813191, 53.48286811813191, 14.709451429556845)

In [None]:
# prediction for 2025:

predictions_2025 = pd.DataFrame()
for hood_id in hood_ids:
    neighborhood_data = total_crimes_count[total_crimes_count['HOOD_ID'] == hood_id].copy()
    
    if 'Year' not in neighborhood_data.columns:
        total_crimes_count = total_crimes_count.reset_index()
    
    neighborhood_data['Year'] = pd.to_datetime(neighborhood_data['Year'], format='%Y')
    neighborhood_data = neighborhood_data.set_index('Year')
    neighborhood_data.index = neighborhood_data.index.astype(int)
    total_crimes_series = neighborhood_data['Total_Crimes']
    train_data = total_crimes_series
    
    auto_model = pm.auto_arima(train_data,
                           start_p=1, start_q=1,
                           max_p=12, max_q=12,
                           m=1, 
                           seasonal=False,
                           d=None, 
                           trace=True,
                           error_action='ignore',
                           suppress_warnings=True,
                           stepwise=True,
                           trend = 't')
    best_order = auto_model.order
    
    model = ARIMA(train_data, order=best_order)
    fitted_model = model.fit()
    predictions = fitted_model.forecast(steps=1)

    predictions_df = pd.DataFrame({
            'HOOD_ID': [hood_id],
            'Year': [2025],
            'Predicted_Crimes': [predictions.iloc[0]]
        })
    predictions_2025 = pd.concat([predictions_2025, predictions_df], ignore_index=True)
        
predictions_2025


In [10]:
predictions_2025

Unnamed: 0,HOOD_ID,Year,Predicted_Crimes
0,1,2025,1500.643065
1,2,2025,573.419138
2,3,2025,161.543635
3,4,2025,211.208866
4,5,2025,218.441191
...,...,...,...
153,170,2025,1050.043406
154,171,2025,444.244857
155,172,2025,271.113153
156,173,2025,209.928264


In [11]:
predictions_2025['Predicted_Crimes'] = predictions_2025['Predicted_Crimes'].astype(int)
predictions_2025.to_csv('ARIMA_prediction_crimes_2025.csv')
predictions_2025

Unnamed: 0,HOOD_ID,Year,Predicted_Crimes
0,1,2025,1500
1,2,2025,573
2,3,2025,161
3,4,2025,211
4,5,2025,218
...,...,...,...
153,170,2025,1050
154,171,2025,444
155,172,2025,271
156,173,2025,209


In [18]:
hood_name_lookup = total_crimes_count[['HOOD_ID', 'AREA_NAME']].drop_duplicates()
predictions_df_with_nood_name = pd.merge(
    predictions_2025, 
    hood_name_lookup, 
    on='HOOD_ID', 
    how='left'
)
predictions_df_with_nood_name


Unnamed: 0,HOOD_ID,Year,Predicted_Crimes,AREA_NAME
0,1,2025,1500,West Humber-Clairville
1,2,2025,573,Mount Olive-Silverstone-Jamestown
2,3,2025,161,Thistletown-Beaumond Heights
3,4,2025,211,Rexdale-Kipling
4,5,2025,218,Elms-Old Rexdale
...,...,...,...,...
153,170,2025,1050,Yonge-Bay Corridor
154,171,2025,444,Junction-Wallace Emerson
155,172,2025,271,Dovercourt Village
156,173,2025,209,North Toronto


In [21]:
predictions_df_with_nood_name = predictions_df_with_nood_name.rename(columns={'Predicted_Crimes': 'Total_Crimes'})
predictions_df_with_nood_name['Data_Type'] = 'Forecast'
total_crimes_count['Data_Type'] = 'Historical'
combined_df = pd.concat([total_crimes_count, predictions_df_with_nood_name], ignore_index=True)
combined_df

Unnamed: 0.1,Unnamed: 0,HOOD_ID,AREA_NAME,Year,Total_Crimes,Data_Type
0,0.0,1,West Humber-Clairville,2014,1248.0,Historical
1,1.0,1,West Humber-Clairville,2015,1114.0,Historical
2,2.0,1,West Humber-Clairville,2016,1131.0,Historical
3,3.0,1,West Humber-Clairville,2017,1172.0,Historical
4,4.0,1,West Humber-Clairville,2018,1545.0,Historical
...,...,...,...,...,...,...
1891,,170,Yonge-Bay Corridor,2025,1050.0,Forecast
1892,,171,Junction-Wallace Emerson,2025,444.0,Forecast
1893,,172,Dovercourt Village,2025,271.0,Forecast
1894,,173,North Toronto,2025,209.0,Forecast


In [23]:
combined_df.to_csv('crime_per_hood_with_forecast_for_2025.csv', index=False)