In [37]:
import csv
import pandas as pd 
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans

# Module-level accumulator: one {'district', 'offense_type'} record per crime row
# that has been passed to transform_to_dict().
crime_data_dict = []


def transform_to_dict(row):
    """Append the district / offense_type pair of ``row`` to ``crime_data_dict``.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys 'district' and 'offense_type'
        (for example a pandas row or a plain dict). Other keys are ignored.

    Returns
    -------
    None
        The result is the side effect on the module-level ``crime_data_dict``.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the two required keys.
    """
    wanted_keys = ('district', 'offense_type')
    crime_data_dict.append({key: row[key] for key in wanted_keys})

    
if __name__ == "__main__":
    # Load the cleaned crime records; the columns read below are
    # 'tod', 'district' and 'offense_type'.
    df = pd.read_csv('cleaned_crime_data.csv')

    # Keep only the crimes from the chosen time of day and hand each one to
    # transform_to_dict(), which collects {'district', 'offense_type'} records
    # in crime_data_dict.  A boolean mask + to_dict('records') replaces the
    # row-by-row iterrows() filter (no per-row Series construction).
    tod = 'Night'
    selected = df.loc[df['tod'] == tod, ['district', 'offense_type']]
    for record in selected.to_dict('records'):
        transform_to_dict(record)

    # Number of clusters; fail early with a readable message if the filter
    # left fewer samples than clusters (k-means cannot run in that case).
    k = 12
    if len(crime_data_dict) < k:
        raise ValueError(
            "Only {} record(s) with tod == {!r}; need at least k = {} to cluster."
            .format(len(crime_data_dict), tod, k)
        )

    # One-hot encode the categorical records: DictVectorizer names each string
    # feature '<field>=<value>', which the reporting loop below relies on.
    vectorizer = DictVectorizer()
    crime_data = vectorizer.fit_transform(crime_data_dict).toarray()

    # getting k-means
    # With n_init=1 a single random initialisation decides the outcome, so the
    # seed is pinned to make cluster numbering and labels reproducible.
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1,
                    random_state=0)
    k_labels = kmeans.fit_predict(crime_data)

    # getting top terms per cluster for kmeans:
    # argsort is ascending, so reverse each row to get heaviest features first.
    asc_order_centroids = kmeans.cluster_centers_.argsort()
    order_centroids = asc_order_centroids[:, ::-1]
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = list(vectorizer.get_feature_names_out())
    else:
        terms = vectorizer.get_feature_names()

    # The advertised index range is derived from k instead of being hard-coded.
    print("Clusters labeled by top category for Kmeans in the format: "
          "[c0, c1, c2,..., c{}] where c = cluster".format(k - 1) + "\n")
    k_top_cat_cluster = []

    print("Top terms per cluster:")
    for i in range(k):
        print("Cluster {}:".format(i))
        # Walk the features from heaviest to lightest, printing each one, and
        # stop at the first offense_type feature: that is the cluster's label.
        # The scan covers every feature (not just the top 10) so each cluster
        # always contributes exactly one entry and the list stays aligned with
        # the cluster indices.
        for ind in order_centroids[i]:
            # Split on the first '=' only, so values containing '=' stay whole.
            field, _, value = terms[ind].partition('=')
            print(' {}'.format(value))
            if field == 'offense_type':
                k_top_cat_cluster.append(value)
                break
        else:
            # No offense_type feature exists at all; keep the slot so indices
            # still match cluster numbers.
            k_top_cat_cluster.append(None)
        print('')

    # Kept inside the guard: the list only exists when the block above ran.
    print(k_top_cat_cluster)

Clusters labeled by top category for Kmeans in the format: [c0, c1, c2,..., c13, c14, c15] where c = cluster

Top terms per cluster:
Cluster 0:
 Assault/Battery

Cluster 1:
 Larceny

Cluster 2:
 B2
 Assault/Battery

Cluster 3:
 Robbery

Cluster 4:
 A1
 Assault/Battery

Cluster 5:
 Burglary

Cluster 6:
 D4
 Assault/Battery

Cluster 7:
 E18
 Assault/Battery

Cluster 8:
 D4
 Larceny

Cluster 9:
 B3
 Assault/Battery

Cluster 10:
 Assault/Battery

Cluster 11:
 Larceny

['Assault/Battery', 'Larceny', 'Assault/Battery', 'Robbery', 'Assault/Battery', 'Burglary', 'Assault/Battery', 'Assault/Battery', 'Larceny', 'Assault/Battery', 'Assault/Battery', 'Larceny']


In [2]:
# Bare last expression: the notebook renders the `df` DataFrame as this cell's output.
# NOTE(review): relies on `df` already existing in the kernel - confirm the cell
# that assigns it (pd.read_csv) has been run before this one.
df

Unnamed: 0,district,location,offense_type,tod
0,B3,"(42.29474312, -71.08503786)",Breaking and Entering,Evening
1,A1,"(42.36339791, -71.05600208)",Burglary,Evening
2,C11,"(42.29794304, -71.05175837)",Possession,Morning
3,D4,"(42.34860972, -71.07538812)",Burglary,Afternoon
4,C6,"(42.32971183, -71.05268388)",Assault/Battery,Morning
5,E13,"(42.32387703, -71.10409269)",Assault/Battery,Night
6,C11,"(42.30870401, -71.06844484)",Larceny,Morning
7,C6,"(42.33339546, -71.05179708)",Larceny,Morning
8,D4,"(42.3425037, -71.10178609)",Assault/Battery,Afternoon
9,C11,"(42.31040916, -71.06239921)",Burglary,Evening
