In [63]:
import pandas as pd
import requests
import json
import datetime
import pickle
import math
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression


# define the current and previous year
current_year = datetime.datetime.now().year
last_year = current_year - 1

# function that calls api and returns the features for a given year
def callAndStore(year):
    url = f'https://services.arcgis.com/afSMGVsC7QlRK1kZ/arcgis/rest/services/Police_Incidents_{year}/FeatureServer/0/query?where=1%3D1&outFields=reportedDateTime,offense,description,centerLong,centerLat&outSR=4326&f=json'
    # make api request
    response = requests.get(url).json()

    features = [x for x in response['features']]

    return features

crime_list=[]

# for all features in the current and prior year.....
for crime in (callAndStore(current_year)+callAndStore(last_year)):
    clean_crime={
        'date':datetime.datetime.fromtimestamp(crime['attributes']['reportedDateTime']/1000).strftime("%m/%d/%Y"),
        'time':datetime.datetime.fromtimestamp(crime['attributes']['reportedDateTime']/1000).strftime("%H:%M:%S"),
        'centerLong': crime['attributes']['centerLong'],
        'centerLat':crime['attributes']['centerLat'],
        'description':crime['attributes']['description'].strip()
        }
    crime_list.append(clean_crime)

crime_severity={
"AUTOMOBILE THEFT": 4,
"THEFT-MOTR VEH PARTS": 2.5,
"OTHER THEFT": 2.5,
"THEFT FROM MOTR VEHC": 2,
"BURGLARY OF DWELLING": 5,
"BURGLARY OF BUSINESS": 4,
"ROBBERY PER AGG": 8,
"ASSLT W/DNGRS WEAPON": 6,
"ROBBERY INCLUDING AUTO THEFT": 4,
"ROBBERY OF PERSON": 4,
"BIKE THEFT": 2,
"CSC - RAPE": 9,
"SHOPLIFTING": 4,
"2ND DEG DOMES ASLT": 4,
"THEFT BY SWINDLE": 3,
"DOMESTIC ASSAULT/STRANGULATION": 4,
"ROBBERY OF BUSINESS": 4,
"ASLT-SGNFCNT BDLY HM": 4,
"ASLT4-LESS THAN SUBST HARM": 4,
"THEFT FROM PERSON SNATCH/GRAB": 4,
"ARSON": 8,
"MURDER (GENERAL)": 11,
"CSC - SODOMY": 9,
"THEFT FROM BUILDING": 4,
"3RD DEG DOMES ASLT": 6,
"CSC - PENETRATE WITH OBJECT": 9,
"ASLT-GREAT BODILY HM": 9,
"OTHER VEHICLE THEFT": 4,
"ASLT4-SUBST HARM OR WEAPON": 6,
"OBS - PETTY THEFT": 2,
"ON-LINE THEFT": 2.5,
"FAIL TO PAY - TAXI/HOTEL/REST": 2.5,
"ARSON-3RD DEGREE": 3,
"OBS-CSCR - USE EXT 1, 2 OR 3": 8,
"POCKET-PICKING": 4,
"LOOTING": 5,
"SCRAPPING-RECYCLING THEFT": 2,
"1ST DEG DOMES ASLT": 4,
"MURDER - 2ND DEGREE": 11,
"HACKING - THEFT OF SERVICE": 3,
"ARSON-1ST DEGREE": 8,
"ACCESS/ALTER SYSTEM/NETWORK": 3,
"ARSON-5TH DEGREE": 3,
"GAS STATION DRIV-OFF": 2.5,
"DO NOT USE": 0
}

In [64]:
incidents = pd.DataFrame(crime_list)

#Set parameters for machine learning algorithm
ClustersList=[50,100,200,250,500]
PriorDaysList=[5,10,14,60,90,120]
todayList=[datetime.date(2020,6,7),datetime.date(2020,11,4),datetime.date(2020,12,5),datetime.date(2020,10,19),datetime.date(2021,1,1),datetime.date(2021,2,11),datetime.date(2021,3,8),datetime.date(2020,7,23)]
ScoreArray=[[0 for x in range(len(PriorDaysList))] for y in range(len(ClustersList))]

for s1,Clusters in enumerate(ClustersList):

    # Initialize and Fit KMeans Model
    clusterer = KMeans(n_clusters=Clusters,random_state=42).fit(incidents[["centerLong","centerLat"]])

    # Run Predictions
    predictions = clusterer.predict(incidents[["centerLong","centerLat"]])

    # Add column for clusters to incidents dataframe
    incidents["cluster"] = predictions

    # Save Model using Pickle
    # pickle.dump(clusterer, open("../models/clusterer.pkl", "wb"))

    
    for s2,PriorDays in enumerate(PriorDaysList):
        # today=datetime.date.today()
        for today in todayList:
            InitDay=today-datetime.timedelta(days=PriorDays)

            #This assigns a danger value to each cluster that is not normalized
            Cluster_Danger=[[0 for x in range(Clusters)] for y in range(PriorDays)]
            #Only used for testing
            Today_Danger=[0 for x in range(Clusters)]

            for crime in crime_list:
                MDY = [int(x) for x in crime["date"].split("/")]
                date = datetime.date(MDY[2],MDY[0],MDY[1])
                if date == InitDay:
                    try:
                        Cluster_Danger[0][clusterer.predict([[crime["centerLong"],crime["centerLat"]]])[0]]+=crime_severity[crime["description"]]
                    except KeyError:
                        print("An error occured on the keys")
                        print(crime["description"])
                        print("")
                elif date > InitDay and date < today:
                    num=int(str(date-InitDay).split(",")[0].split()[0])
                    try:
                        Cluster_Danger[num][clusterer.predict([[crime["centerLong"],crime["centerLat"]]])[0]]+=crime_severity[crime["description"]]
                    except KeyError:
                        print("An error occured on the keys")
                        print(crime["description"])
                        print("")
            #Only used for testing
                elif date == today:
                    try:
                        Today_Danger[clusterer.predict([[crime["centerLong"],crime["centerLat"]]])[0]]+=crime_severity[crime["description"]]
                    except KeyError:
                        print("An error occured on the keys")
                        print(crime["description"])
                        print("")


            MaxDanger=0
            for day in Cluster_Danger:
                if MaxDanger<max(day):
                    MaxDanger=max(day)
            #This creates a normalized danger value for each cluster between 0 and 10
            Normal_Cluster_Danger=[[] for y in range(PriorDays)]

            for day in range (PriorDays):
                for cluster in Cluster_Danger[day]:
                    Normal_Cluster_Danger[day].append(cluster/MaxDanger*10)

            #Only used for testing
            Normal_Today_Danger=[]
            for cluster in Today_Danger:
                Normal_Today_Danger.append(math.ceil(cluster/MaxDanger*10))

            Training_Data=[]
            for d,day in enumerate(Normal_Cluster_Danger):
                for c,cluster in enumerate(day):
                    Training_Data.append({
                        "Day": d,
                        "Cluster": c,
                        "Danger": cluster
                    })
            Training=pd.DataFrame(Training_Data)


            #Only used for testing
            Testing_Data=[]
            for c,cluster in enumerate(Normal_Today_Danger):
                Testing_Data.append({
                    "Day": PriorDays,
                    "Cluster": c,
                    "Danger": cluster
                })
            Testing=pd.DataFrame(Testing_Data)

            #Only used for testing
            X_test = Testing[["Day", "Cluster"]].values
            y_test = Testing["Danger"].values.reshape(-1, 1)

            Predictions=[]
            for cluster in range(Clusters):
                CurrentTraining=Training.loc[Training['Cluster']==cluster]
                # print(CurrentTraining.head())
                #Setting up X and y to train our linear model
                X_train = CurrentTraining["Day"].values.reshape(-1, 1)
                y_train = CurrentTraining["Danger"].values.reshape(-1, 1)

                #Create the model
                model = LinearRegression()

                #Fit the model to the training data. 
                model.fit(X_train, y_train)

                # Use our model to predict a value
                predicted = model.predict([[PriorDays]])
                Predictions.append(min(max(math.ceil(predicted[0][0]),0),10))
            Predictions=np.array(Predictions).reshape(-1,1)
            SUM=0
            for i,y in enumerate(Predictions):
                SUM+=(y-y_test[i])**2
            ScoreArray[s1][s2]+=math.sqrt(SUM/Clusters)

In [65]:
# del Cluster_Danger, MaxDanger, clean_crime, crime, crime_list, crime_severity, date, current_year, incidents, last_year, num, predictions, MDY, cluster, day
# del Training_Data, Normal_Cluster_Danger, d, day, c, cluster

In [66]:
for s1,Clusters in enumerate(ClustersList):
    for s2, PriorDays in enumerate(PriorDaysList):
        print(f"Using {Clusters} clusters and {PriorDays} prior days, the root mean square score is  ")
        print (ScoreArray[s1][s2])
        print("")


Using 50 clusters and 5 prior days, the root mean square score is  
17.973893697838978

Using 50 clusters and 10 prior days, the root mean square score is  
14.423689795102796

Using 50 clusters and 14 prior days, the root mean square score is  
14.193223401872599

Using 50 clusters and 60 prior days, the root mean square score is  
10.24658178917826

Using 50 clusters and 90 prior days, the root mean square score is  
9.327670747044468

Using 50 clusters and 120 prior days, the root mean square score is  
8.509226846156874

Using 100 clusters and 5 prior days, the root mean square score is  
16.017943081106825

Using 100 clusters and 10 prior days, the root mean square score is  
13.105477024144514

Using 100 clusters and 14 prior days, the root mean square score is  
12.769608301677316

Using 100 clusters and 60 prior days, the root mean square score is  
9.487590544597602

Using 100 clusters and 90 prior days, the root mean square score is  
9.499808940729768

Using 100 clusters and