In [11]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import datetime
import random
import seaborn as sns
import plotly.express as px

from collections import defaultdict
from scipy.stats import pearsonr

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist, squareform

In [12]:
df = pd.read_csv('df_scaled.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197914 entries, 0 to 197913
Data columns (total 40 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   povertyPercentage               197914 non-null  float64
 1   males_percentage                197914 non-null  float64
 2   females_percentage              197914 non-null  float64
 3   child_percentage                197914 non-null  float64
 4   teen_percentage                 197914 non-null  float64
 5   adult_percentage                197914 non-null  float64
 6   killed_percentage               197914 non-null  float64
 7   injured_percentage              197914 non-null  float64
 8   killed_percentage_per_incident  197914 non-null  float64
 9   unharmed_percentage             197914 non-null  float64
 10  winning_party_percentage        197914 non-null  float64
 11  arrest_percentage               197914 non-null  float64
 12  killed_rate_stat

Selecting one state as stated in subtask of clustering task.

In [13]:
df['state'].value_counts() #to find state with most crimes

illinois          15935
california        13842
florida           12661
texas             11023
new york           8707
ohio               8601
georgia            7634
pennsylvania       7529
north carolina     7254
louisiana          6670
tennessee          6282
missouri           6054
south carolina     5886
massachusetts      5274
michigan           5004
virginia           4845
indiana            4828
new jersey         4693
maryland           4477
alabama            4388
wisconsin          4266
kentucky           3456
oklahoma           3069
mississippi        2990
washington         2900
colorado           2831
connecticut        2731
arkansas           2500
iowa               2333
oregon             2000
arizona            1984
kansas             1949
minnesota          1715
nevada             1589
new mexico         1479
nebraska           1318
west virginia      1285
alaska             1276
utah                916
new hampshire       858
maine               844
rhode island    

In [14]:
df = df.loc[df['state'] == 'illinois']

To choose the best eps value we create the distance matrix 

In [15]:
column_to_drop = ['date', 'latitude', 'longitude','congressional_district', 'candidatevotes', 'totalvotes', 'total_votes_for_state']
for col in df.columns:
    if df[col].dtype == 'object':
       column_to_drop.append(col)
column_to_drop

['date',
 'latitude',
 'longitude',
 'congressional_district',
 'candidatevotes',
 'totalvotes',
 'total_votes_for_state',
 'date',
 'state',
 'city_or_county',
 'incident_characteristics1',
 'party']

In [16]:
df_categorical = df[column_to_drop]
df_numeric = df.drop(columns=column_to_drop)

In [17]:
scaler = MinMaxScaler()
X = scaler.fit_transform(df_numeric.values)
scaled_df = pd.DataFrame(X, columns=df_numeric.columns)
Y = scaler.inverse_transform(X)
scaled_df = pd.DataFrame(Y, columns=df_numeric.columns)

In [18]:
X_illinois = scaler.fit_transform(df_numeric.values)

In [19]:
dist = pdist(X, 'euclidean') #pair wise distance
dist = squareform(dist) #distance matrix given the vector dist

In [20]:
#df_numeric['state'] = df['state']
df_numeric_illinois = df[df['state'] == 'illinois']

#TODO: model selection

In [21]:
dbscan = DBSCAN(eps=0.15, min_samples=10)
dbscan.fit(X_illinois)

DBSCAN(eps=0.15, min_samples=10)

Observing the size of each cluster

In [22]:
np.unique(dbscan.labels_, return_counts=True)

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22]),
 array([14747,    58,    16,    38,   291,    17,    10,    11,   294,
           44,    15,    14,    11,    10,    10,    10,    10,   100,
           13,    10,    51,   129,    17,     9]))

In [23]:
print('Silhouette %s' % silhouette_score(X_illinois, dbscan.labels_))

Silhouette -0.4151247855392586


In [24]:
k = 3
kth_distances = list()
for d in dist:
    index_kth_distance = np.argsort(d)[k]
    kth_distances.append(d[index_kth_distance])