## Crime in LA

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
import statsmodels.api as sm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

## Data Cleaning

In [None]:
# Load the raw incident export.
crime_data = pd.read_csv('Crime_Data_from_2020_to_Present.csv')

# Drop columns that are not used by any of the models below.
crime_data = crime_data.drop(columns=['Vict Descent', 'Status', 'Status Desc'])

# Keep only rows with a value in every remaining column.
# NOTE(review): this requires all remaining columns (including 'Crm Cd 2'-'Crm Cd 4'
# and 'Cross Street') to be populated -- confirm that this much filtering is intended.
crime_data.dropna(inplace=True)

# Make sure there is a valid location listed: a coordinate of 0 is treated as missing.
crime_data = crime_data[(crime_data['LON'] != 0) & (crime_data['LAT'] != 0)]

# DROP CRIMES WITH ONLY A FEW INCIDENTS
# X / y are snapshots taken *before* the rare-crime filter; later cells rebuild both.
X = crime_data.drop(columns=['Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'Crm Cd'])  # drop all crime codes
y = crime_data['Crm Cd']  # crime code column

num_incidents = 1000  # a crime code is kept only if it has MORE than this many incidents

# One pass over the column instead of re-filtering the whole frame once per code.
incident_counts = y.value_counts()
crimes = incident_counts[incident_counts > num_incidents].index.tolist()

# Remove every other crime code from the data.
crime_data = crime_data[crime_data['Crm Cd'].isin(crimes)]

# One-hot encode the categorical columns (the source columns are replaced by dummies).
df = pd.get_dummies(crime_data, columns=['TIME OCC', 'LOCATION', 'Cross Street', 'Vict Sex'])

# Recombine the victim-sex dummies into one ordinal column: H -> 1, M -> 2, X -> 3,
# anything else -> 0. df.get(..., 0) keeps this working when a category is absent
# after filtering (indexing the dummy column directly would raise KeyError).
sex_codes = {'H': 1, 'M': 2, 'X': 3}
df['Vict Sex'] = sum(code * df.get(f'Vict Sex_{label}', 0) for label, code in sex_codes.items())

# Convert the occurrence date and split it into calendar features.
# NOTE(review): 'TIME OCC' is encoded separately above; confirm that 'DATE OCC'
# really carries a time of day, otherwise 'hour' is a constant column.
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'], format='%m/%d/%Y %I:%M:%S %p')
occurred = df['DATE OCC'].dt
df['year'] = occurred.year
df['month'] = occurred.month
df['day'] = occurred.day
df['hour'] = occurred.hour
df['day_of_week'] = occurred.dayofweek

# Drop every crime-code column (the target and its variants) and every non-numeric
# column. The numeric test is dtype-agnostic on purpose: pandas >= 2.0 emits bool
# dummies and int32 date parts, which an exact float64/uint8/int64 whitelist would
# silently discard.
columns_to_drop = [
    col for col, dtype in df.dtypes.items()
    if col.startswith('Crm Cd') or not pd.api.types.is_numeric_dtype(dtype)
]

# Drop other unwanted columns (skipping any that are already listed).
columns_to_drop.extend(col for col in ('DR_NO', 'Part 1-2') if col not in columns_to_drop)
new_df = df.drop(columns=columns_to_drop)  # numeric feature matrix used by the models

## Kmeans Class

In [None]:
# Read the table of real sheriff / police stations.
station_loc = pd.read_csv('Sheriff_and_Police_Stations.csv')

# Keep the stations whose city is Los Angeles and retain only their coordinates.
in_los_angeles = station_loc['city'].eq('Los Angeles')
station_loc1 = station_loc.loc[in_los_angeles]
real_loc = station_loc1.loc[:, ['latitude', 'longitude']]

class KMeans:
    """Basic k-means clustering class.

    Centers start at randomly chosen samples and are refined by alternating
    nearest-center assignment (measured with the ``p``-norm) and per-cluster
    averaging.
    """
    def __init__(self, n_clusters=8, max_iter=100, tol=1e-5, normalize=False, p=2):
        """Store clustering algorithm parameters.

        Parameters:
            n_clusters (int): How many clusters to compute.
            max_iter (int): The maximum number of iterations to compute.
            tol (float): The convergence tolerance on the largest center shift.
            normalize (bool): If True, rescale every center to unit Euclidean
                length after each update.
            p (int, float or np.inf): Order of the vector norm used for distances.
        """
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.p = p
        self.normalize = normalize

    @staticmethod
    def _unit_rows(centers):
        """Return ``centers`` with every row scaled to unit Euclidean length.

        All-zero rows are returned unchanged to avoid dividing by zero.
        """
        norms = np.linalg.norm(centers, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return centers / norms

    def fit(self, X, y=None):
        """Compute the cluster centers from random initial conditions.

        Parameters:
            X ((n_samples, n_features) array-like): the data to be clustered.
            y: Ignored; accepted so the signature matches the usual estimator API.

        Returns:
            KMeans: ``self``, with the result stored in ``self.centers``.

        Raises:
            ValueError: if ``n_clusters`` exceeds the number of samples
                (raised by ``np.random.choice`` with ``replace=False``).
        """
        X = np.asarray(X, dtype=float)

        # Start from n_clusters distinct samples.
        start = np.random.choice(X.shape[0], self.n_clusters, replace=False)
        self.centers = X[start]
        if self.normalize:
            self.centers = self._unit_rows(self.centers)

        for _ in range(self.max_iter):
            label = self.predict(X)

            # A cluster that lost all of its points keeps its previous center
            # (the mean of an empty slice would be NaN).
            new_c = np.array([
                X[label == z].mean(axis=0) if np.any(label == z) else self.centers[z]
                for z in range(self.n_clusters)
            ])

            # Normalize before comparing so both sides of the convergence test
            # live on the same scale.
            if self.normalize:
                new_c = self._unit_rows(new_c)

            # Largest single-center movement, measured with the same vector
            # p-norm as the distances (a matrix norm would reject most p).
            shift = np.linalg.norm(new_c - self.centers, ord=self.p, axis=1).max()
            self.centers = new_c
            if shift < self.tol:
                break

        return self

    def predict(self, X):
        """Classify each entry of X based on which cluster center it belongs to.

        Parameters:
            X ((n_samples, n_features) array-like): the data to be clustered.

        Returns:
            ((n_samples,) ndarray): Integer labels from 0 to n_clusters - 1 for
                each entry of X.
        """
        X = np.asarray(X, dtype=float)

        # (n_samples, n_clusters) matrix of p-norm distances to every center.
        my_dist = np.linalg.norm(X[:, np.newaxis] - self.centers, ord=self.p, axis=2)

        return np.argmin(my_dist, axis=1)

    def fit_predict(self, X):
        """Fit to the data and return the resulting labels.

        Parameters:
            X ((n_samples, n_features) array-like): the data to be clustered.

        Returns:
            ((n_samples,) ndarray): Cluster label of every sample.
        """
        return self.fit(X).predict(X)

    def plot(self, X, y):
        """Plot each of the data points, colored by cluster.
        Plot each of the cluster centers using a different marker.

        The module-level ``real_loc`` frame (columns 'longitude' / 'latitude')
        is overlaid in red, so it must be defined before calling this method.

        Parameters:
            X ((n_samples, n_features) array-like): the data being clustered;
                column 0 is drawn on the x axis and column 1 on the y axis.
            y ((n_samples) ndarray): the labels for each of the samples.
        """
        X = np.asarray(X)

        # Plot the points, then the fitted centers, then the real stations.
        plt.scatter(X[:, 0], X[:, 1], c=y)
        plt.scatter(self.centers[:, 0], self.centers[:, 1], marker='+', color='black', label='Optimal Location')
        plt.scatter(real_loc['longitude'], real_loc['latitude'], marker='+', color='red', label='Real Location')

        # Initialize plot attributes.
        plt.legend()
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')
        plt.title('Police Station Locations in LA County')
        plt.show()

# Cluster the incident coordinates taken from the cleaned data.
location = new_df[['LON', 'LAT']]

# Discard rows whose longitude and latitude are both 0.
has_coordinates = (location != 0).any(axis=1)
new_loc = location[has_coordinates]
new_data = new_loc.loc[:, ['LON', 'LAT']].to_numpy()

# Fit and plot one 17-cluster model per distance norm (1-, 2- and infinity-norm).
for x in [1, 2, np.inf]:
    km = KMeans(n_clusters=17, p=x)
    y = km.fit(new_data).predict(new_data)
    km.plot(new_data, y)

## Random Forest Model

In [None]:
# Define the training and test data: the features are the cleaned numeric columns,
# the target is the label-encoded crime code.
X = new_df
le = LabelEncoder()
y = le.fit_transform(df['Crm Cd'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Define a parameter grid.
rf = RandomForestClassifier()
param_grid = {
    'n_estimators': [25, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [5, 10],
}

# Perform a grid search with 3-fold cross validation.
rf_gs = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1)
rf_gs.fit(X_train, y_train)

# Display the best parameters and score.
print(f'Best Parameters: {rf_gs.best_params_}')
print(f'Best Score: {rf_gs.best_score_}')

# DEFINE RANDOM FOREST WITH OPTIMAL PARAMETERS
# NOTE(review): these hyper-parameters are hard-coded, presumably copied from an
# earlier grid-search run -- confirm they still match rf_gs.best_params_.
rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=10, max_features=None)
rf.fit(X_train, y_train)

# Get the predictions, accuracy and F1 score, and actually report them.
y_pred = rf.predict(X_test)
accuracy_s = accuracy_score(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
print(f'Accuracy: {accuracy_s}')
print(f'F1 (micro): {f1_micro}')

# Get the optimal features based on feature importance. The selector is fitted once;
# transform() reuses that fit instead of training a second forest.
sel = SelectFromModel(RandomForestClassifier(criterion='gini', max_depth=10, n_estimators=100))
sel.fit(X_train, y_train)
X_selected = sel.transform(X_train).astype(int)
selected_feat = X_train.columns[sel.get_support()]

# Use the selected features under a separate name: rebinding X here would leave the
# following cells splitting a 70% feature subset against the full-length y.
X_train_selected = X_train[selected_feat]

# Construct the random forest classifier on the reduced feature set.
rf = RandomForestClassifier(n_estimators=200, criterion='gini', max_depth=10, max_features=None)
rf.fit(X_train_selected, y_train)

# Get predictions and accuracy.
y_pred = rf.predict(X_test[selected_feat])
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy (selected features): {accuracy}')

# Display the classification report for each crime code. The names come from
# le.classes_ so that row i is the crime code encoded as label i (unique() returns
# appearance order, which does not match the encoder's sorted order). Passing
# labels= keeps the report valid when a class is missing from the test split.
label_names = list(map(str, le.classes_))
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=label_names,
    zero_division=0.0,
))

## Gradient Boosted Model

In [None]:
le = LabelEncoder()

# Rebuild the full feature matrix and the label-encoded crime codes so this cell does
# not depend on whatever an earlier cell left bound to X.
X = new_df
y = le.fit_transform(df['Crm Cd'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Initialize the classifier and its parameter grid. The names rf / rf_gs are kept
# from the random-forest cell even though the estimator is gradient boosting.
rf = GradientBoostingClassifier()

# scikit-learn rejects 'exponential' loss for more than two classes, so it is only
# searched when the problem is binary; otherwise every such fit would fail.
losses = ['log_loss', 'exponential'] if len(le.classes_) == 2 else ['log_loss']
param_grid = {
    'n_estimators': [25, 50, 100],
    'loss': losses,
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [5, 10],
    'min_samples_leaf': [1, 4, 8],
}

# Run the grid search with 3-fold cross validation.
rf_gs = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1)
rf_gs.fit(X_train, y_train)

# Display the best parameters and score.
print(f'Best Parameters: {rf_gs.best_params_}')
print(f'Best Score: {rf_gs.best_score_}')

# DEFINE GRADIENT BOOSTED MODEL WITH OPTIMAL PARAMETERS
# Uses the parameters found above with a gradient-boosting estimator (the earlier
# version of this cell trained random forests here).
rf = GradientBoostingClassifier(**rf_gs.best_params_)
rf.fit(X_train, y_train)

# Get the predictions, accuracy and F1 score, and actually report them.
y_pred = rf.predict(X_test)
accuracy_s = accuracy_score(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
print(f'Accuracy: {accuracy_s}')
print(f'F1 (micro): {f1_micro}')

# Get the optimal features based on feature importance. The selector is fitted once;
# transform() reuses that fit.
sel = SelectFromModel(GradientBoostingClassifier(**rf_gs.best_params_))
sel.fit(X_train, y_train)
X_selected = sel.transform(X_train).astype(int)
selected_feat = X_train.columns[sel.get_support()]

# Keep the reduced training set under its own name so X stays the full matrix.
X_train_selected = X_train[selected_feat]

# Build the classifier on the reduced feature set.
rf = GradientBoostingClassifier(**rf_gs.best_params_)
rf.fit(X_train_selected, y_train)

# Get predictions and accuracy.
y_pred = rf.predict(X_test[selected_feat])
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy (selected features): {accuracy}')

# Build a classification report. The names come from le.classes_ so that row i is the
# crime code encoded as label i; labels= keeps the report valid when a class is
# missing from the test split.
label_names = list(map(str, le.classes_))
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=label_names,
    zero_division=0.0,
))

## Multiclass Logistic Regression Classifiers

In [None]:
le = LabelEncoder()

# Get the crime data X and labels y. X is rebuilt from the cleaned frame because the
# earlier cells may have left a reduced training subset bound to X, whose length would
# not match y.
# NOTE(review): assumes the full numeric feature set is wanted here rather than the
# features selected by the tree models -- confirm.
X = new_df
y = le.fit_transform(df['Crm Cd'])

# Split the data with a fixed seed so the reported accuracy is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

# Initialize the logistic regression model. The deprecated multi_class argument is
# omitted: with the lbfgs solver the default already fits a multinomial model when
# there are more than two classes.
logreg = LogisticRegression(solver='lbfgs', C=1, max_iter=500)

# Train and run the test data.
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Get the accuracy score from the predictions already made (same value as
# logreg.score, without predicting the test set a second time).
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the Logistic Regression Model: {accuracy}')

# Perform PCA on the test features, stored under a new name so X_test stays usable.
pca = PCA(n_components=2)
X_test_2d = pca.fit_transform(X_test)

# Display the test samples on the first two principal components, colored by their
# true class.
plt.scatter(X_test_2d[:, 0], X_test_2d[:, 1], marker='.', c=y_test)
plt.show()