In [6]:
import pandas as pd
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
total_crimes_by_type = pd.read_csv('..//data/processed_data/crime_by_type.csv')
total_crimes_by_type

Unnamed: 0.1,Unnamed: 0,Crime Type,Year,Total_Crimes
0,0,ASSAULT,2014,16515.0
1,1,ASSAULT,2015,17858.0
2,2,ASSAULT,2016,18608.0
3,3,ASSAULT,2017,18906.0
4,4,ASSAULT,2018,19565.0
...,...,...,...,...
94,94,THEFTOVER,2020,1209.0
95,95,THEFTOVER,2021,1052.0
96,96,THEFTOVER,2022,1444.0
97,97,THEFTOVER,2023,1719.0


In [8]:
total_crimes_by_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    99 non-null     int64  
 1   Crime Type    99 non-null     object 
 2   Year          99 non-null     int64  
 3   Total_Crimes  99 non-null     float64
dtypes: float64(1), int64(2), object(1)
memory usage: 3.2+ KB


In [9]:
total_crimes_by_type['Crime Type'] = total_crimes_by_type['Crime Type'].astype('string')
total_crimes_by_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    99 non-null     int64  
 1   Crime Type    99 non-null     string 
 2   Year          99 non-null     int64  
 3   Total_Crimes  99 non-null     float64
dtypes: float64(1), int64(2), string(1)
memory usage: 3.2 KB


In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100

all_predictions = pd.DataFrame()
total_maes = []
total_rmses = []
total_mapes = []
crime_types = total_crimes_by_type['Crime Type'].unique()

for crime in crime_types:
    crime_type_data = total_crimes_by_type[total_crimes_by_type['Crime Type'] == crime].copy()
    crime_type_data = crime_type_data.set_index('Year')
    crime_type_data.index = crime_type_data.index.astype(int)
    total_crimes_series = crime_type_data['Total_Crimes']
    
    train_data = total_crimes_series.loc[total_crimes_series.index <= 2023]
    test_data = total_crimes_series.loc[total_crimes_series.index > 2023]

    auto_model = pm.auto_arima(train_data,
                           start_p=1, start_q=1,
                           max_p=10, max_q=10,
                           m=1, 
                           seasonal=False,
                           d=None, 
                           trace=True,
                           error_action='ignore',
                           suppress_warnings=True,
                           stepwise=True)
    best_order = auto_model.order

    model = ARIMA(train_data, order=best_order)
    fitted_model = model.fit()
    start_year = test_data.index[0]
    end_year = test_data.index[-1]

    predictions = fitted_model.predict(start=start_year, end=end_year)
    predictions_df = predictions.to_frame(name='Predicted_Crimes')
    predictions_df['Crime_Type'] = crime

    mae = mean_absolute_error(test_data, predictions)
    rmse = np.sqrt(mean_squared_error(test_data, predictions))
    mape = mean_absolute_percentage_error(test_data, predictions)

    total_maes.append(mae)
    total_rmses.append(rmse)
    total_mapes.append(mape)

    evaluation_df = predictions.to_frame(name='Predicted_Crimes')
    evaluation_df['Crime Type'] = crime
    evaluation_df['Actual_Crimes'] = test_data 
    evaluation_df['MAE'] = mae
    evaluation_df['RMSE'] = rmse
    evaluation_df['MAPE'] = mape

    all_predictions = pd.concat([all_predictions, evaluation_df])
    
    if total_maes:
        total_mae = np.mean(total_maes)
        total_rmse = np.mean(total_rmses)
        total_mape = np.mean(total_mapes)
        print(all_predictions)
        print(f"\nMean Absolute Error (MAE): {total_mae:.2f}")
        print(f"Root Mean Squared Error (RMSE): {total_rmse:.2f}")
        print(f"Mean Absolute Percentage Error (MAPE): {total_mape:.2f}%")
    else:
        print("\nNo models were successfully fitted and evaluated.")

    print(all_predictions)

    

In [11]:
total_mae, total_rmse, total_mape

(1163.8743929705875, 1163.8743929705875, 18.142534393352317)

In [None]:
# prediction for 2025:

predictions_2025 = pd.DataFrame()
for crime in crime_types:
    crime_type_data = total_crimes_by_type[total_crimes_by_type['Crime Type'] == crime].copy()
    
    if 'Year' not in crime_type_data.columns:
        total_crimes_by_type = total_crimes_by_type.reset_index()
    
    crime_type_data['Year'] = pd.to_datetime(crime_type_data['Year'], format='%Y')
    crime_type_data = crime_type_data.set_index('Year')
    crime_type_data.index = crime_type_data.index.astype(int)
    total_crimes_series = crime_type_data['Total_Crimes']
    train_data = total_crimes_series
    
    auto_model = pm.auto_arima(train_data,
                           start_p=1, start_q=1,
                           max_p=12, max_q=12,
                           m=1, 
                           seasonal=False,
                           d=None, 
                           trace=True,
                           error_action='ignore',
                           suppress_warnings=True,
                           stepwise=True,
                           trend = 't')
    best_order = auto_model.order
    
    model = ARIMA(train_data, order=best_order)
    fitted_model = model.fit()
    predictions = fitted_model.forecast(steps=1)

    predictions_df = pd.DataFrame({
            'Crime Type': [crime],
            'Year': [2025],
            'Predicted_Crimes': [predictions.iloc[0]]
        })
    predictions_2025 = pd.concat([predictions_2025, predictions_df], ignore_index=True)
        
predictions_2025


In [13]:
predictions_2025['Predicted_Crimes'] = predictions_2025['Predicted_Crimes'].astype(int)
predictions_2025

Unnamed: 0,Crime Type,Year,Predicted_Crimes
0,ASSAULT,2025,24797
1,AUTOTHEFT,2025,6115
2,BIKETHEFT,2025,2840
3,BREAKENTER,2025,6929
4,HOMICIDE,2025,74
5,ROBBERY,2025,3214
6,SHOOTING,2025,514
7,THEFTFROMMV,2025,8809
8,THEFTOVER,2025,1820


In [14]:
predictions_2025 = predictions_2025.rename(columns={'Predicted_Crimes': 'Total_Crimes'})
predictions_2025['Data_Type'] = 'Forecast'
total_crimes_by_type['Data_Type'] = 'Historical'
combined_df = pd.concat([total_crimes_by_type, predictions_2025], ignore_index=True)
combined_df 


Unnamed: 0.1,Unnamed: 0,Crime Type,Year,Total_Crimes,Data_Type
0,0.0,ASSAULT,2014,16515.0,Historical
1,1.0,ASSAULT,2015,17858.0,Historical
2,2.0,ASSAULT,2016,18608.0,Historical
3,3.0,ASSAULT,2017,18906.0,Historical
4,4.0,ASSAULT,2018,19565.0,Historical
...,...,...,...,...,...
103,,HOMICIDE,2025,74.0,Forecast
104,,ROBBERY,2025,3214.0,Forecast
105,,SHOOTING,2025,514.0,Forecast
106,,THEFTFROMMV,2025,8809.0,Forecast


In [15]:
combined_df.to_csv('crime_total_count_with_forecast_for_2025.csv', index=False)