# Assignment 2

* [link to visualizations](https://s160159.github.io) 

In [1]:
# Used libraries
import json
import numpy as np
import pandas as pd
from __future__ import division
import matplotlib.pyplot as plt
import collections 
from sklearn.cluster import KMeans

## Assignment 2: Data manipulation

In [2]:
# Load data
SF_df = pd.read_csv("../Week3/SFPD2.csv")

# Latitude and longitude bounds of the box around central SF
y_max_lat = 37.8096707013
y_min_lat = 37.7080034569
x_min_lon = -122.513642064
x_max_lon = -122.365565425

# Remove obs. which are outside SF.
# A vectorised boolean mask replaces the per-row DataFrame.get_value() loop:
# get_value was deprecated in pandas 0.21 and removed in pandas 1.0, and the
# row-by-row Python loop was needlessly slow. Bounds stay strict (< and >),
# and rows with missing coordinates compare False and are dropped, as before.
inside_sf = ((SF_df["X"] > x_min_lon) & (SF_df["X"] < x_max_lon) &
             (SF_df["Y"] > y_min_lat) & (SF_df["Y"] < y_max_lat))

# Row labels of the observations that fall inside the box
valid_idx = SF_df.index[inside_sf.values].tolist()

# Use valid index
SF_df = SF_df.loc[valid_idx]

SF_df_range = range(0,len(valid_idx))
SF_df_N = len(SF_df_range)

In [3]:
# Data manipulation: add the derived columns Hour, Time_dec, DayOfWeek_int, Year

# Hour of day and decimal time; "Time" is split on ':' and the first two
# fields are read as hour and minute.
# NOTE: the minute fraction relies on true division (`from __future__ import division`).
hour = SF_df["Time"].map(lambda clock: clock.split(':'))
SF_df["Hour"] = hour.map(lambda fields: int(fields[0]))
SF_df["Time_dec"] = hour.map(lambda fields: float(int(fields[0]) + int(fields[1])/60))

# Encoding DayOfWeek as 1 (Monday) .. 7 (Sunday); an unknown name raises KeyError
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
day_dict = {name: number for number, name in enumerate(weekday_names, 1)}
SF_df["DayOfWeek_int"] = SF_df["DayOfWeek"].map(lambda name: day_dict[name])

# Year is the third '/'-separated field of "Date"
year = SF_df["Date"].map(lambda date: date.split('/'))
SF_df["Year"] = year.map(lambda fields: int(fields[2]))

In [4]:
# The two crime categories this assignment focuses on
focus_crimes = ['PROSTITUTION', 'VEHICLE THEFT']

# Row labels of SF_df for each focus crime, keyed by category name
idx_focus_crimes = {
    crime: list(SF_df.Category[SF_df.Category == crime].index)
    for crime in focus_crimes
}

# Assignment 2A: both focus crimes (prostitution rows first, then vehicle theft)
pro_veh_labels = idx_focus_crimes['PROSTITUTION'] + idx_focus_crimes['VEHICLE THEFT']
SF_df_pro_veh = SF_df.loc[pro_veh_labels]

# Assignment 2B: prostitution only
SF_df_pro = SF_df.loc[idx_focus_crimes['PROSTITUTION']]


## Assignment 2A: One scatter plot and two datasets

In [34]:
# Police districts present in the two-crime subset.
# The order is whatever set() yields, so it is not sorted.
unique_pdist = list(set(SF_df_pro_veh["PdDistrict"]))

# Colors for every district; must hold at least one entry per district
colorDistrict = [
    "pink",
    "rgba(38,41,74,1)",
    "rgba(1,84,90,1)",
    "rgba(1,115,81,1)",
    "rgba(3,195,131,1)",
    "rgba(170,217,98,1)",
    "rgba(251,191,69,1)",
    "rgba(239,106,50,1)",
    "rgba(237,3,69,1)",
    "rgba(161,42,94,1)",
]

# Dictionary with colors for every district: the district at position i in
# unique_pdist gets the color at position i in colorDistrict
districtColorDic = {}
for position, district in enumerate(unique_pdist):
    districtColorDic[district] = colorDistrict[position]

In [35]:
# Show the district color palette
colorDistrict

['pink',
 'rgba(38,41,74,1)',
 'rgba(1,84,90,1)',
 'rgba(1,115,81,1)',
 'rgba(3,195,131,1)',
 'rgba(170,217,98,1)',
 'rgba(251,191,69,1)',
 'rgba(239,106,50,1)',
 'rgba(237,3,69,1)',
 'rgba(161,42,94,1)']

In [36]:
# Show the districts in the order used for the color assignment
unique_pdist

['CENTRAL',
 'NORTHERN',
 'INGLESIDE',
 'SOUTHERN',
 'MISSION',
 'TENDERLOIN',
 'RICHMOND',
 'TARAVAL',
 'BAYVIEW',
 'PARK']

In [28]:
# Data function for 2A
def myFun2A(year, r_scale=105):
    """Build the per-district scatter-plot records for one year.

    Reads the module-level ``SF_df_pro_veh`` frame and ``districtColorDic``
    mapping.

    Parameters
    ----------
    year : int
        Value matched against the "Year" column.
    r_scale : number, optional
        Factor applied to a district's share of the year's records to get the
        marker radius ``r`` (default 105, the value used originally).

    Returns
    -------
    list of dict
        One dict per district found in ``SF_df_pro_veh`` (all years), with keys
        'X' (PROSTITUTION count), 'Y' (VEHICLE THEFT count), 'tot' (records in
        the year), 'sub' (records in the year and district), 'r' (sub / tot *
        r_scale), 'value' (district name) and 'color'.
    """
    # Filter on the year once instead of once per district
    in_year = SF_df_pro_veh[SF_df_pro_veh["Year"] == year]
    N_tot = in_year.shape[0]

    # Districts are taken from all years so every year yields the same points
    unique_pdist = list(set(SF_df_pro_veh["PdDistrict"]))

    out_frame = []
    for pdist in unique_pdist:
        tmp = in_year[in_year["PdDistrict"] == pdist]
        N_sub = tmp.shape[0]

        # A year without any records would otherwise divide 0 by 0;
        # float() keeps the share fractional even without true division.
        share = float(N_sub) / N_tot if N_tot else 0.0

        # PROSTITUTION X
        # VEHICLE THEFT Y
        out_frame.append({'X': tmp[tmp["Category"] == 'PROSTITUTION'].shape[0],
                          'Y': tmp[tmp["Category"] == 'VEHICLE THEFT'].shape[0],
                          'tot': N_tot,
                          'sub': N_sub,
                          'r': share * r_scale,
                          'value': pdist,
                          'color': districtColorDic[pdist]})

    return(out_frame)

In [29]:
# Scatter-plot records for 2014, 2015 and 2016: one myFun2A call per year
(assignment2BJSON_2014,
 assignment2BJSON_2015,
 assignment2BJSON_2016) = map(myFun2A, (2014, 2015, 2016))

In [30]:
# Create JSON function
def createJSON(my_dict, file_name):
    """Serialize ``my_dict`` as JSON into ``file_name``.

    Parameters
    ----------
    my_dict : object
        Any structure ``json.dump`` can serialize (the callers pass lists of dicts).
    file_name : str
        Path of the file to write; it is overwritten if it already exists.
    """
    # The with-statement closes the file on exit, so no explicit close() is needed
    with open(file_name, 'w') as outfile:
        json.dump(my_dict, outfile)
        
# Write one JSON file per year for the scatter plot
for yearly_records, json_path in (
        (assignment2BJSON_2014, 'data/assignment2BJSON_2014.json'),
        (assignment2BJSON_2015, 'data/assignment2BJSON_2015.json'),
        (assignment2BJSON_2016, 'data/assignment2BJSON_2016.json')):
    createJSON(yearly_records, json_path)

## Assignment 2B: Visualizing geodata

In [31]:
# Coordinates of the prostitution records, as [lon, lat] pairs for clustering
tmp_X = list(SF_df_pro.X)
tmp_Y = list(SF_df_pro.Y)
N = len(tmp_X)

XX = [[x_coord, y_coord] for x_coord, y_coord in zip(tmp_X, tmp_Y)]

# K: from 2 to 6 clusters
K_range = range(2,7)

# Dict that summarizes everything per K (despite the name, the fits are KMeans):
#   KNN_dict[K]['fit']           fitted KMeans estimator
#   KNN_dict[K]['c_center'][kk]  {'lon', 'lat'} of centroid kk
#   KNN_dict[K]['labels'][kk]    {'lon', 'lat'} lists of the points assigned to kk
KNN_dict = {}
for K in K_range:
    fitted = KMeans(n_clusters = K, random_state=0).fit(XX)
    tmp_labels = list(fitted.labels_)

    centers_by_cluster = {}
    members_by_cluster = {}
    for kk, centroid in enumerate(fitted.cluster_centers_):
        # Column 0 is X (lon), column 1 is Y (lat), matching the order in XX
        centers_by_cluster[kk] = {'lon': centroid[0], 'lat': centroid[1]}

        members_by_cluster[kk] = {
            'lon': [x_coord for x_coord, assigned in zip(tmp_X, tmp_labels) if assigned == kk],
            'lat': [y_coord for y_coord, assigned in zip(tmp_Y, tmp_labels) if assigned == kk],
        }

    KNN_dict[K] = {'fit': fitted,
                   'c_center': centers_by_cluster,
                   'labels': members_by_cluster}

In [32]:
# Create geojson for coordinates
def createGEOJSON(K):
    """Flatten the K-cluster entry of the module-level ``KNN_dict`` into point records.

    Returns a list of dicts: first every clustered point (radius 3, stroke in
    the cluster color), cluster by cluster, then one record per centroid
    (radius 8, black stroke). Each record also carries the cluster number
    ('col'), the cluster size ('no') and its share of all points as a
    percentage string with two decimals ('ratio').
    """
    # Colors for each cluster, max six clusters
    cluster_color = ["rgba(255,0,255,0.5)",
                     "rgba(0,0,255,0.5)",
                     "rgba(0,255,0,0.5)",
                     "rgba(255,255,0,0.5)",
                     "rgba(255,0,0,0.5)",
                     "rgba(0,255,255,0.5)"]

    members = KNN_dict[K]['labels']
    centroids = KNN_dict[K]['c_center']

    # Size of each cluster and the total number of observations
    sizes = [len(members[kk]['lon']) for kk in range(0,K)]
    no_obs = sum(sizes)

    records = []

    # Assigned coordinates, one cluster after the other
    for kk in range(0,K):
        lons = members[kk]['lon']
        lats = members[kk]['lat']
        for jj, lon in enumerate(lons):
            records.append({'lon': lon,
                            'lat': lats[jj],
                            'col': kk,
                            'r': 3,
                            'farve': cluster_color[kk],
                            'stroke': cluster_color[kk],
                            'no': sizes[kk],
                            'ratio': "{0:.2f}".format(sizes[kk] / no_obs * 100)})

    # Centroids go at the end of the list
    records.extend({'lon': centroids[kk]['lon'],
                    'lat': centroids[kk]['lat'],
                    'col': kk,
                    'r': 8,
                    'stroke': "rgba(0,0,0,1)",
                    'farve': cluster_color[kk],
                    'no': sizes[kk],
                    'ratio': "{0:.2f}".format(sizes[kk] / no_obs * 100)}
                   for kk in range(0,K))
    return(records)


In [33]:
# Build and write the point records for every K from 2 to 6
cluster_exports = [
    (2, 'data/assignment2DJSON_K2.json'),
    (3, 'data/assignment2DJSON_K3.json'),
    (4, 'data/assignment2DJSON_K4.json'),
    (5, 'data/assignment2DJSON_K5.json'),
    (6, 'data/assignment2DJSON_K6.json'),
]

geojson_by_K = {}
for n_clusters, json_path in cluster_exports:
    geojson_by_K[n_clusters] = createGEOJSON(n_clusters)
    createJSON(geojson_by_K[n_clusters], json_path)

# Keep the per-K results available under their individual names
assignment2DJSON_K2 = geojson_by_K[2]
assignment2DJSON_K3 = geojson_by_K[3]
assignment2DJSON_K4 = geojson_by_K[4]
assignment2DJSON_K5 = geojson_by_K[5]
assignment2DJSON_K6 = geojson_by_K[6]
