### Importing Libraries

In [1]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

### Importing DataFrames

In [2]:
# Read both CSV inputs; the paths are relative to the notebook's working directory.
# `df` is the frame that gets clustered below; `df_full` is not referenced by
# any of the cells shown in this part of the notebook.
df, df_full = map(pd.read_csv, ('./data/test.csv', './cleaned_data/full.csv'))

### Defining X variable

In [3]:
# Feature matrix: everything except the identifier, the 'Race' column and the
# free-text description. These three must exist, so a schema change fails loudly.
# A later cell writes a 'cluster' column back onto `df`; dropping it here when
# present keeps this cell idempotent, so re-running it after clustering does
# not leak the previous labels into the features.
X = (
    df
    .drop(columns=['Unique ID', 'Race', 'description'])
    .drop(columns=['cluster'], errors='ignore')
)

### Scaling predictor variables

In [4]:
# Fit the scaler on the features, then standardize them with the fitted scaler.
# With default settings each column is centered and scaled to unit variance.
sc = StandardScaler().fit(X)
X_sc = sc.transform(X)

### Instantiating and fitting KMeans over scaled predictor variables

In [5]:
# Six clusters with a fixed seed for reproducibility.
# n_init is pinned explicitly: it used to default to 10, but newer scikit-learn
# releases default to 'auto' (a single initialization with k-means++), which
# would silently change the clustering and emits a FutureWarning in between.
km = KMeans(n_clusters=6, n_init=10, random_state=42)
# Last expression of the cell, so the fitted estimator's repr is displayed.
km.fit(X_sc)

KMeans(n_clusters=6, random_state=42)

### Retrieving KMeans predictions

In [6]:
# Cluster index assigned to each scaled row; displayed as the cell output only.
km.predict(X=X_sc)

array([4, 0, 3, ..., 0, 1, 1], dtype=int32)

### Attaching cluster predictions column to main dataframe

In [7]:
# Store each row's fitted KMeans label on the main frame. This mutates `df`
# in place, so re-running earlier cells that read `df` will see the extra column.
df['cluster'] = km.labels_
# Preview the first rows to confirm the new column is present.
df.head()

Unnamed: 0,Unique ID,Age,year,month,week_of_year,day_of_month,day_of_week,day_of_year,pop2000,pop2010,...,"Foreknowledge of mental illness? INTERNAL USE, NOT FOR ANALYSIS_No","Foreknowledge of mental illness? INTERNAL USE, NOT FOR ANALYSIS_Unknown","Foreknowledge of mental illness? INTERNAL USE, NOT FOR ANALYSIS_Yes",Agency_multiple_agencies,Agency_other,Agency_police,Agency_sheriff,description,Race,cluster
0,2.0,53.0,2000.0,1.0,1.0,2.0,6.0,2.0,665865.0,691893,...,1,0,0,0,0,0,1,"Darren Mayfield, a DeKalb County sheriff's dep...",Race unspecified,4
1,3.0,23.0,2000.0,1.0,1.0,5.0,2.0,5.0,39678.0,38950,...,1,0,0,0,0,1,0,Officer Elias E. Mendiola shot Derrick E. Tate...,Race unspecified,0
2,7.0,23.0,2000.0,1.0,1.0,6.0,3.0,6.0,695454.0,919628,...,1,0,0,1,0,0,0,State troopers and county deputies had tracked...,Race unspecified,3
3,8.0,31.0,2000.0,1.0,1.0,6.0,3.0,6.0,1709434.0,2035210,...,0,1,0,1,0,0,0,SWAT officers shot and killed Adrian Dolby sho...,Race unspecified,5
4,10.0,28.0,2000.0,1.0,1.0,7.0,4.0,7.0,9519338.0,9818605,...,0,1,0,0,0,1,0,Joseph Gumpert stopped breathing after a scuff...,Race unspecified,2


### Creating dataframe containing Unique ID and cluster predictions

In [8]:
# Two-column frame for export: the record identifier and its KMeans cluster.
df_cluster = df.loc[:, ['Unique ID', 'cluster']]

### Examining new dataframe

In [9]:
# Show the first five ID/cluster pairs.
df_cluster.head(n=5)

Unnamed: 0,Unique ID,cluster
0,2.0,4
1,3.0,0
2,7.0,3
3,8.0,5
4,10.0,2


### Exporting KMeans cluster predictions as a CSV file.

In [10]:
# Persist the ID-to-cluster mapping. The row index is written too (pandas'
# default), so the file gets a leading unnamed column in addition to
# 'Unique ID' and 'cluster'.
df_cluster.to_csv(path_or_buf='./data/cluster.csv')

### Examining absolute and normalized value counts for KMeans clusters

In [11]:
# Number of rows assigned to each KMeans cluster, largest first.
df.loc[:, 'cluster'].value_counts()

0    2034
3    1651
4    1470
5     764
1     333
2     127
Name: cluster, dtype: int64

In [12]:
# Same breakdown expressed as a fraction of all rows.
df.loc[:, 'cluster'].value_counts(normalize=True)

0    0.318859
3    0.258818
4    0.230444
5    0.119768
1    0.052203
2    0.019909
Name: cluster, dtype: float64

### Checking silhouette score

In [13]:
# Mean silhouette coefficient of the KMeans labelling on the scaled features.
# The score lies in [-1, 1]; values near 0 indicate overlapping clusters.
silhouette_score(X=X_sc, labels=km.labels_)

0.09735316777185415

## DBSCAN

### Instantiating DBSCAN unsupervised learning model

Increasing `min_samples` decreases the number of distinct clusters that DBSCAN finds.

In [14]:
# Density-based clustering on the same scaled features used for KMeans.
dbscan = DBSCAN(min_samples=16, eps=4)
# Last expression of the cell, so the fitted estimator's repr is displayed.
dbscan.fit(X=X_sc)

DBSCAN(eps=4, min_samples=16)

In [15]:
# Wrap the DBSCAN labels in a one-column frame (the column is named 0).
# A label of -1 is DBSCAN's marker for noise points.
df_dbscan = pd.DataFrame(data=dbscan.labels_)

In [16]:
# Display the label frame; pandas elides the middle rows in the rendered output.
df_dbscan

Unnamed: 0,0
0,-1
1,-1
2,-1
3,-1
4,-1
...,...
6374,-1
6375,-1
6376,-1
6377,-1


### Adjusting `eps` and `min_samples` produced six distinct labels (five clusters plus the noise label -1)

In [17]:
# Share of rows per DBSCAN label (-1 is noise).
df_dbscan.loc[:, 0].value_counts(normalize=True)

-1    0.980875
 0    0.006427
 1    0.004233
 4    0.003292
 3    0.002665
 2    0.002508
Name: 0, dtype: float64

### The label counts are extremely skewed (about 98% of rows are labeled as noise, -1), suggesting the DBSCAN model has almost no efficacy

In [18]:
# Absolute number of rows per DBSCAN label (-1 is noise).
df_dbscan.loc[:, 0].value_counts()

-1    6257
 0      41
 1      27
 4      21
 3      17
 2      16
Name: 0, dtype: int64