In [6]:
import numpy as np

In [7]:
class KMeans(object):
    """Minimal k-means (Lloyd's algorithm) with a pluggable point-to-point distance.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to fit.
    dist : {'Euclid', 'Geodesic'}
        'Euclid' uses the L2 norm of the coordinate difference; 'Geodesic'
        uses the haversine great-circle distance (points given as lat, lon
        in degrees, result in kilometres).

    Attributes
    ----------
    cluster_centers : numpy.ndarray
        Current centers, one row per cluster. Initialised to zeros of shape
        (n_clusters, 2) until a fit method runs.
    labels : numpy.ndarray of int
        Cluster index assigned to each point; only set after fitting.
    X : sequence of points
        The data passed to the most recent fit call.

    Notes
    -----
    Centers are always updated with the arithmetic mean of the member
    coordinates, regardless of the distance used for assignment.
    """

    # Earth radius (km) used by the haversine distance.
    EARTH_RADIUS_KM = 6372.8

    def __init__(self, n_clusters = 8, dist = 'Euclid'):
        self.n_clusters = n_clusters
        self.cluster_centers = np.zeros((n_clusters, 2))
        metrics = {
            'Euclid': self._distance_euclid,
            'Geodesic': self._distance_haversine,
        }
        # Fail at construction time instead of leaving `_distance` undefined
        # and crashing later inside a fit call.
        if dist not in metrics:
            raise ValueError(
                "Unknown dist %r; expected one of %s" % (dist, sorted(metrics)))
        self._distance = metrics[dist]

    def _has_converged(self, old_centers, new_centers):
        """Return True when two sets of centers are exactly equal."""
        return np.array_equal(old_centers, new_centers)

    def _compute_clusters(self, X):
        """Return, for each point in X, the index of its nearest current center."""
        # Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
        cluster_list = np.zeros(len(X), dtype=int)
        for i, x in enumerate(X):
            cluster_list[i] = np.argmin(
                [self._distance(x, self.cluster_centers[k])
                 for k in range(self.n_clusters)])
        return cluster_list

    def _recompute_centers(self, X):
        """Return new centers: the mean of the points currently labelled with each cluster.

        A cluster with no members keeps its previous center. Taking the mean of
        an empty set would yield NaN, and NaN centers never compare equal, so
        the fit loop would never terminate.
        """
        labels = np.asarray(self.labels)
        # Start from a float copy of the current centers so the dimensionality
        # follows the data and empty clusters retain their old position.
        centers = np.array(
            [np.asarray(c, dtype=float) for c in self.cluster_centers])
        for k in range(self.n_clusters):
            members = [X[i] for i in np.flatnonzero(labels == k)]
            if members:
                centers[k] = np.mean(np.array(members, dtype=float), axis=0)
        return centers

    def _distance_euclid(self, x, y):
        """Euclidean (L2) distance between points x and y."""
        return np.linalg.norm(np.subtract(x, y))

    def _distance_haversine(self, x, y):
        """Great-circle distance in kilometres between x and y.

        Expects points to be of the form lat,lon (in degrees).
        """
        lat_1, lon_1, lat_2, lon_2 = map(np.radians, [x[0], x[1], y[0], y[1]])
        d_lat = lat_2 - lat_1
        d_lon = lon_2 - lon_1

        a = (np.sin(d_lat / 2.0) ** 2
             + np.cos(lat_1) * np.cos(lat_2) * np.sin(d_lon / 2) ** 2)
        # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
        # points), which would make arcsin return NaN.
        a = np.clip(a, 0.0, 1.0)

        c = 2 * np.arcsin(np.sqrt(a))
        return self.EARTH_RADIUS_KM * c

    def _initialize_centers(self, X):
        """Pick n_clusters distinct points of X uniformly at random as starting centers.

        Raises
        ------
        ValueError
            If X holds fewer distinct points than n_clusters (the rejection
            loop below could otherwise never finish).
        """
        length = len(X)
        distinct = len({tuple(point) for point in X})
        if self.n_clusters > distinct:
            raise ValueError(
                "n_clusters=%d exceeds the %d distinct points available"
                % (self.n_clusters, distinct))

        centers = []
        seen = set()  # coordinates already chosen, kept for O(1) duplicate checks
        while len(centers) < self.n_clusters:
            index = np.random.randint(0, length)
            key = tuple(X[index])
            if key in seen:
                continue
            seen.add(key)
            centers.append(X[index])
        return np.array(centers)

    def _iterate_until_converged(self, X):
        """Alternate assignment and center update until the centers stop changing."""
        while True:
            old_centers = np.array(self.cluster_centers, copy=True)
            self.labels = self._compute_clusters(X)
            self.cluster_centers = self._recompute_centers(X)
            if self._has_converged(old_centers, self.cluster_centers):
                break

    def fit(self, X):
        """Cluster X starting from randomly chosen data points.

        Sets `X`, `labels` and `cluster_centers` on the instance.
        """
        self.X = X
        self.cluster_centers = self._initialize_centers(X)
        self._iterate_until_converged(X)

    def fit_from_starting_points(self, X, starting_clusters):
        """Cluster X starting from the caller-supplied centers.

        `starting_clusters` must contain exactly n_clusters points; it is
        copied, so the caller's object is never modified.

        Raises
        ------
        ValueError
            If the number of starting centers differs from n_clusters.
        """
        starting = np.array(
            [np.asarray(c, dtype=float) for c in starting_clusters])
        if len(starting) != self.n_clusters:
            raise ValueError(
                "expected %d starting centers, got %d"
                % (self.n_clusters, len(starting)))
        self.X = X
        self.cluster_centers = starting
        self._iterate_until_converged(X)

    def avg_distance(self):
        """Mean over clusters of the mean member-to-center distance.

        Clusters without members are left out of the average (they would
        otherwise cause a division by zero). Returns 0.0 when no cluster has
        any member.
        """
        labels = np.asarray(self.labels)
        per_cluster = []
        for cluster in range(self.n_clusters):
            member_idx = np.flatnonzero(labels == cluster)
            if member_idx.size == 0:
                continue
            dist = 0
            for index in member_idx:
                dist += self._distance(self.X[index], self.cluster_centers[cluster])
            per_cluster.append(dist / member_idx.size)
        if not per_cluster:
            return 0.0
        return sum(per_cluster) / len(per_cluster)
                
            
    

In [8]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
import os
import warnings

def gen_coords(loc):
    """Parse a coordinate string such as "(a, b)" into ``[a, b]`` as floats.

    The first and last characters of `loc` are discarded (assumed to be the
    enclosing brackets) and the remainder is split on commas.

    Side effect: the second value is appended to the module-level list `x`
    and the first value to the module-level list `y`; both lists must exist
    before this function is called.

    Raises
    ------
    IndexError
        If `loc` contains fewer than two comma-separated fields.
    ValueError
        If a field cannot be converted to float.
    """
    fields = loc[1:-1].split(',')
    # Builtin float replaces the `np.float` alias, which was removed in NumPy 1.24.
    coords = [float(fields[0]), float(fields[1])]
    x.append(coords[1])
    y.append(coords[0])
    return coords

def percent_similarity(set_1, set_2):
    """Return the fraction of positions where `set_1` and `set_2` hold equal values.

    Only the first ``len(set_1)`` positions are compared, and the result is
    the number of matching positions divided by ``len(set_1)``.
    """
    size = len(set_1)
    matches = sum(1 for pos in range(size) if set_1[pos] == set_2[pos])
    return matches / size


# For every CSV in ../data_2015/: parse the 'Location' column into coordinate
# pairs, cluster the points with both Euclidean and haversine k-means for
# k = 2..10, store both label sets as columns, and export the result as GeoJSON
# to ./datamound/<csv name>.js.
for file in os.listdir('../data_2015/'):
    if(file.endswith('.csv')):
        df = pd.read_csv('../data_2015/' + file, sep =';')
        
        # gen_coords appends to these module-level lists as a side effect, so
        # they must be reset before each file is parsed.
        x = []
        y = []

        df['Points'] = df['Location'].apply(gen_coords)
        # x holds the second parsed value and y the first, so each Point is
        # built as (second, first) -- presumably (lon, lat); TODO confirm
        # against the CSV's 'Location' format.
        points = [Point(xy) for xy in zip(x,y)]
        crs = {'init': 'epsg:4326'}
        geo_df = GeoDataFrame(df,crs=crs, geometry=points)
        theft_both = geo_df.copy()

        X = np.array(theft_both['Points'])

        for k in range(2,11):
            # Retry with fresh random starting centers until the two fits
            # agree on more than 50% of the point labels. Both models start
            # from the same centers, which is what makes their cluster indices
            # comparable position by position.
            while(True):
                kmeans_theft_euclid = KMeans(n_clusters = k)
                kmeans_theft_geodesic = KMeans(n_clusters = k, dist = 'Geodesic')
                centers = kmeans_theft_geodesic._initialize_centers(X)

                kmeans_theft_euclid.fit_from_starting_points(X,centers)
                kmeans_theft_geodesic.fit_from_starting_points(X,centers) 
                if(percent_similarity(kmeans_theft_euclid.labels,
                                     kmeans_theft_geodesic.labels) > .50):
                    break

            # One label column per metric and per k, e.g. 'e_clusterK3' / 'g_clusterK3'.
            theft_both.loc[:,'e_cluster' + 'K' + str(k)] = kmeans_theft_euclid.labels.copy()
            theft_both.loc[:,'g_cluster' + 'K' + str(k)] = kmeans_theft_geodesic.labels.copy()

                
            # Report the accepted agreement ratio for this k.
            print(percent_similarity(kmeans_theft_euclid.labels,
                                     kmeans_theft_geodesic.labels))

            
            
        # Drop the helper column of parsed coordinate lists before export.
        theft_both = theft_both.drop('Points', axis=1)

        # Remove any previous export first; a missing file is not an error.
        # NOTE(review): presumably needed because the GeoJSON driver does not
        # overwrite existing files -- confirm.
        try:
            os.remove('./datamound/'+file.split('.csv')[0] + '.js')
        except FileNotFoundError:
            pass
        
        theft_both.to_file('./datamound/'+file.split('.csv')[0] + '.js', driver='GeoJSON')
#         with open('./datamound/'+file.split('.csv')[0] + '.js', 'r') as original: data = original.read()
#         with open('./datamound/'+file.split('.csv')[0] + '.js', 'w') as modified: modified.write('var both =' 
#                                                         + data +';')
        print(file)
        print('-------')


0.9481132075471698
0.9386792452830188
0.8584905660377359
0.6556603773584906
0.9056603773584906
0.8325471698113207
0.9245283018867925
0.8254716981132075
0.7735849056603774
m_theft_april.csv
-------
0.9762282091917591
0.9207606973058637
0.9461172741679873
0.8858954041204438
0.9603803486529319
0.901743264659271
0.6656101426307448
0.8510301109350238
0.6846275752773375
m_theft_aug.csv
-------
0.9694960212201591
0.9748010610079576
0.9416445623342176
0.8527851458885941
0.6949602122015915
0.9310344827586207
0.9283819628647215
0.656498673740053
0.7228116710875332
m_theft_dec.csv
-------
0.9767441860465116
0.9631782945736435
0.8178294573643411
0.5251937984496124
0.7441860465116279
0.9321705426356589
0.7073643410852714
0.8449612403100775
0.5077519379844961
m_theft_feb.csv
-------
0.979746835443038
0.9316455696202531
0.9392405063291139
0.9012658227848102
0.8987341772151899
0.9341772151898734
0.830379746835443
0.7518987341772152
0.5721518987341773
m_theft_jan.csv
-------
0.9490740740740741
0.956790

In [9]:
import os
# Concatenate the per-month GeoJSON exports into a single JavaScript file,
# all.js, that defines `complete_data` as an array literal. The order is all
# 'theft_<month>.js' files for January..December, followed by all
# 'm_theft_<month>.js' files in the same month order.
month_tags = ['jan', 'feb', 'mar', 'april', 'may', 'june',
              'july', 'aug', 'sep', 'oct', 'nov', 'dec']
ordered_names = [prefix + 'theft_' + tag + '.js'
                 for prefix in ('', 'm_')
                 for tag in month_tags]

# Collect the pieces in a list and join once instead of repeatedly
# re-allocating a growing string.
pieces = ['var complete_data =[']
for file in ordered_names:
    # `with` guarantees the handle is closed even if the read fails.
    with open('./datamound/' + file, 'r') as reader:
        pieces.append(reader.read() + ',')
    print(file)
data = ''.join(pieces)

with open('all.js', 'w') as writer:
    writer.write(data + '];')

theft_jan.js
theft_feb.js
theft_mar.js
theft_april.js
theft_may.js
theft_june.js
theft_july.js
theft_aug.js
theft_sep.js
theft_oct.js
theft_nov.js
theft_dec.js
m_theft_jan.js
m_theft_feb.js
m_theft_mar.js
m_theft_april.js
m_theft_may.js
m_theft_june.js
m_theft_july.js
m_theft_aug.js
m_theft_sep.js
m_theft_oct.js
m_theft_nov.js
m_theft_dec.js
