In [39]:
import csv
import pandas as pd 
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans

crime_data_dict = []

def transform_to_dict(row):
    dic = {'district': row['district'], 'offense_type': row['offense_type']}
    crime_data_dict.append(dic)

def cluster(tod, dow):
    # transforming filtered dataframe to dictionary for vectorization
    for ind, row in df.iterrows():
        if ((row['tod'] == tod) and (row['day_of_week'] == dow)):
            transform_to_dict(row)
            
    vectorizer = DictVectorizer()
    crime_data = vectorizer.fit_transform(crime_data_dict).toarray()

    # getting k-means
    k = 12
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    kmeans.fit_predict(crime_data)
    k_labels = kmeans.labels_

    # getting top terms per cluster for kmeans
    asc_order_centroids = kmeans.cluster_centers_.argsort()#[:, ::-1]
    order_centroids = asc_order_centroids[:,::-1]
    terms = vectorizer.get_feature_names()

    print("On {} at {}: Clusters labeled by top category for Kmeans".format(dow, tod))
    k_top_crime_district = {} # keeps track of popular crime and district by cluster
    districts = [] # keeps track of unique district
    
    for i in range(k):
        get_offense = False
        get_district = False
        district = ''
        offense = ''
        for ind in order_centroids[i, :10]:
            # get top unique district in cluster 
            if not (get_district):
                if terms[ind].split('=')[0] == 'district':
                    temp = terms[ind].split('=')[1]
                    if (temp not in districts):
                        districts.append(temp)
                        district = temp
                        get_district = True
                        
            # get top crime in cluster        
            if not (get_offense):
                if (terms[ind].split('=')[0] == 'offense_type'):
                    offense = terms[ind].split('=')[1]
                    get_offense = True

            # if both offense and district have been found add to dictionary
            if (get_offense == True and get_district == True):
                k_top_crime_district[district] = offense
                break

    print(k_top_crime_district)
    print()
    
if __name__ == "__main__":
    # reading in csv to create dataframe
    df = pd.read_csv('cleaned_crime_data.csv')
    cluster('Morning', 'Monday')
    cluster('Afternoon', 'Monday')
    cluster('Evening', 'Monday')
    cluster('Night', 'Monday')
    cluster('Morning', 'Tuesday')
    cluster('Afternoon', 'Tuesday')
    cluster('Evening', 'Tuesday')
    cluster('Night', 'Tuesday')
    cluster('Morning', 'Wednesday')
    cluster('Afternoon', 'Wednesday')
    cluster('Evening', 'Wednesday')
    cluster('Night', 'Wednesday')
    cluster('Morning', 'Thursday')
    cluster('Afternoon', 'Thursday')
    cluster('Evening', 'Thursday')
    cluster('Night', 'Thursday')
    cluster('Morning', 'Friday')
    cluster('Afternoon', 'Friday')
    cluster('Evening', 'Friday')
    cluster('Night', 'Friday')
    cluster('Morning', 'Saturday')
    cluster('Afternoon', 'Saturday')
    cluster('Evening', 'Saturday')
    cluster('Night', 'Saturday')
    cluster('Morning', 'Sunday')
    cluster('Afternoon', 'Sunday')
    cluster('Evening', 'Sunday')
    cluster('Night', 'Sunday')

On Monday at Morning: Clusters labeled by top category for Kmeans
{'C6': 'Larceny', 'E18': 'Possession', 'D4': 'Larceny', 'A7': 'Breaking and Entering', 'B2': 'Assault/Battery', 'A1': 'Assault/Battery', 'C11': 'Harassment', 'D14': 'Assault/Battery', 'B3': 'Burglary'}

On Monday at Afternoon: Clusters labeled by top category for Kmeans
{'C6': 'Assault/Battery', 'E13': 'Possession', 'D4': 'Burglary', 'B2': 'Larceny', 'A1': 'Larceny', 'E5': 'Harassment', 'C11': 'Breaking and Entering', 'D14': 'Larceny', 'B3': 'Assault/Battery'}

On Monday at Evening: Clusters labeled by top category for Kmeans
{'C6': 'Larceny', 'E13': 'Harassment', 'E18': 'Larceny', 'D4': 'Breaking and Entering', 'B2': 'Assault/Battery', 'A1': 'Larceny', 'E5': 'Larceny', 'C11': 'Possession', 'D14': 'Larceny', 'B3': 'Burglary'}

On Monday at Night: Clusters labeled by top category for Kmeans
{'C6': 'Robbery', 'E13': 'Possession', 'E18': 'Assault/Battery', 'D4': 'Burglary', 'A7': 'Assault/Battery', 'B2': 'Larceny', 'A1': 'L