In [1]:
import numpy as np
import os
import pandas as pd
import geopandas as gpd
import shapely
from scipy.spatial.distance import cdist

In [2]:
class KMeans(object):
    """Minimal k-means (Lloyd iterations) with a Euclidean or haversine metric.

    After ``fit``:
      * ``labels`` holds, for every row of ``X``, the index of its nearest center;
      * ``cluster_centers`` holds the final centers, one row per cluster.

    Parameters
    ----------
    k : int
        Number of clusters.
    euclid : bool
        If True, distances use scipy's built-in ``'euclidean'`` metric.
        Otherwise each row is read as ``(lat, lon)`` in degrees and distances
        are great-circle (haversine) kilometres.
    """

    # Earth radius in kilometres used by the haversine metric.
    _EARTH_RADIUS_KM = 6372.8

    def __init__(self, k=8, euclid = True):
        self.k = k
        if (euclid):
            self._distance = 'euclidean'
        else:
            self._distance = self._distance_haversine

    def _step(self):
        """Run one iteration: assign every point to its nearest center, then
        move each center to the mean of the points assigned to it.

        A cluster that received no points is re-seeded at the data point whose
        mean distance to the other centers is largest, so the center always
        stays a real coordinate.
        """
        distance = cdist(self.X,self.cluster_centers,metric=self._distance)
        self.labels = distance.argmin(1)
        for cluster in range(self.k):
            points = self.X[self.labels == cluster]
            if len(points) == 0:
                other_centers = np.delete(self.cluster_centers,cluster,0)
                if len(other_centers) == 0:
                    # Nothing to measure against (k == 1 with no data); keep the center.
                    continue
                to_others = cdist(self.X,other_centers,metric=self._distance)
                # Average over the *centers* (axis 1) to get one score per point,
                # then take the coordinates of the best point, not its index.
                farthest = to_others.mean(axis=1).argmax()
                self.cluster_centers[cluster] = self.X[farthest]
            else:
                self.cluster_centers[cluster] = np.mean(points,0)

    def _distance_haversine(self,a,b):
        """Return the great-circle distance in kilometres between two points.

        ``a`` and ``b`` are indexable as ``(lat, lon)`` in degrees.
        """
        lat_1, lon_1, lat_2, lon_2 = map(np.radians,[a[0],a[1],b[0],b[1]])
        d_lat = lat_2 - lat_1
        d_lon = lon_2 - lon_1

        arc = np.sin(d_lat/2.0)**2 + np.cos(lat_1)*np.cos(lat_2)*np.sin(d_lon/2)**2

        # Rounding can push ``arc`` marginally outside [0, 1] (e.g. antipodal
        # points), which would make arcsin return NaN.
        c = 2 * np.arcsin(np.sqrt(np.clip(arc, 0.0, 1.0)))
        return self._EARTH_RADIUS_KM * c

    def _init_centers(self, X):
        """Pick up to ``k`` distinct rows of ``X`` at random as initial centers.

        Uses numpy's global random state. If ``X`` has fewer than ``k`` unique
        rows, fewer than ``k`` centers are returned.
        """
        unique = np.unique(X, axis=0)
        index = np.random.permutation(len(unique))[:self.k]
        return unique[index]

    def fit(self,X, centers = None, max_iter = None):
        """Cluster ``X`` and store the result in ``labels`` / ``cluster_centers``.

        Parameters
        ----------
        X : array-like, shape (n_points, n_dims)
            Points to cluster.
        centers : array-like, shape (k, n_dims), optional
            Initial centers. When omitted they are drawn with
            ``_init_centers``. The array is copied, so the caller's array is
            never modified and can be reused to start another model.
        max_iter : int, optional
            Upper bound on the number of iterations. ``None`` (the default)
            iterates until the centers stop changing.

        Returns
        -------
        self
        """
        self.X = np.asarray(X, dtype=float)
        self.labels = np.zeros(len(self.X), dtype=int)
        if centers is not None:
            # Copy as float: _step writes into this array in place.
            self.cluster_centers = np.array(centers, dtype=float)
        else:
            self.cluster_centers = np.array(self._init_centers(self.X), dtype=float)

        iterations = 0
        while max_iter is None or iterations < max_iter:
            old_centers = self.cluster_centers.copy()
            self._step()
            iterations += 1
            if np.array_equal(old_centers, self.cluster_centers):
                break
        return self

In [3]:
from shapely.geometry import Point
from geopandas import GeoDataFrame

# Demographic layer consumed by minority_probability, which reads its
# 'geometry' column and the per-race population columns.
demographics = gpd.read_file('./census.geoJSON')

def gen_coords(loc):
    """Parse a ``'(a, b)'`` location string into ``[a, b]`` floats.

    Side effect: appends ``b`` to the module-level list ``x`` and ``a`` to the
    module-level list ``y``; both lists must exist before this is called.
    Presumably ``a`` is latitude and ``b`` longitude (so ``x``/``y`` end up in
    lon/lat order) -- confirm against the input data.

    Raises IndexError if the string holds fewer than two comma-separated
    fields and ValueError if a field is not a number.
    """
    fields = loc[1:-1].split(',')
    # Built-in float: the ``np.float`` alias was removed from NumPy.
    first = float(fields[0])
    second = float(fields[1])
    x.append(second)
    y.append(first)
    return [first, second]

def point_similarity(X,geo_labels, euc_labels,k):
    '''Return the point-similarity index of each geodesic cluster.

    For geodesic cluster g the index is the sum, over every euclidean cluster
    e, of

        (shared points / size of e) * (shared points / size of g)

    where "shared points" is the number of points labelled g by the geodesic
    run and e by the euclidean run.

    Parameters
    ----------
    X : unused; kept for call compatibility.
    geo_labels, euc_labels : integer cluster labels in ``range(k)``, one per
        point, in the same point order.
    k : number of clusters.

    Returns
    -------
    numpy array of length ``k``. An empty cluster contributes 0 rather than
    producing NaN from a 0/0 division.
    '''
    geo = np.asarray(geo_labels, dtype=int)
    euc = np.asarray(euc_labels, dtype=int)

    # composition[g, e] = number of points labelled g (geodesic) and e (euclidean).
    composition = np.zeros((k, k), dtype=int)
    np.add.at(composition, (geo, euc), 1)

    euc_totals = composition.sum(axis=0)[np.newaxis, :]
    geo_totals = composition.sum(axis=1)[:, np.newaxis]

    euc_percentage = np.divide(composition, euc_totals,
                               out=np.zeros((k, k), dtype=float),
                               where=euc_totals != 0)
    geo_percentage = np.divide(composition, geo_totals,
                               out=np.zeros((k, k), dtype=float),
                               where=geo_totals != 0)

    return (euc_percentage * geo_percentage).sum(axis=1)

def minority_probability(X,cluster_number,geo_labels,demographics):
    """Estimate the non-'White' population share under one cluster's hull.

    The convex hull of the cluster's points is intersected with every geometry
    in ``demographics``; each row's race counts are weighted by the fraction of
    that geometry's area covered by the hull and summed. The result is the sum
    of every column except 'White' divided by the grand total, or 0 when no
    population was accumulated.
    """
    race_columns = ['White','Black or African American', 'American Indian and Ala Native',
                    'Asian','Native Hawaiian/other Pac Isl', 'Multiple Race',
                    'Other Race']

    cluster_points = X[geo_labels == cluster_number]
    # The points are stored lat/long (per the original note), while geoJSON
    # geometries are long/lat, so swap the coordinates before building the hull.
    swapped = [[pt[1],pt[0]] for pt in cluster_points]
    hull = shapely.geometry.multipoint.MultiPoint(swapped).convex_hull

    weighted_pop = np.zeros(7)
    for row in range(len(demographics)):
        tract = demographics.loc[row,'geometry']
        covered = hull.intersection(tract)
        fraction = covered.area/tract.area
        if fraction == 0:
            continue
        weighted_pop = weighted_pop + (np.array(demographics.loc[row,race_columns]) * fraction)

    if np.all(weighted_pop == 0):
        return 0

    # Index 0 is 'White'; everything after it counts as minority.
    return (weighted_pop[1:]/np.sum(weighted_pop)).sum()

def bias_index(X, geo_labels, euc_labels, demographics, k):
    """Return the mean potential-bias score across the ``k`` geodesic clusters.

    Each cluster's score is its minority probability multiplied by
    ``1 - point_similarity``. When the two labelings are identical the result
    is 0 and nothing else is computed.
    """
    labelings_match = np.all(geo_labels == euc_labels)
    if labelings_match:
        return 0

    dissimilarity = 1 - point_similarity(X,geo_labels,euc_labels,k)

    per_cluster = []
    for cluster in range(k):
        per_cluster.append(minority_probability(X,cluster,geo_labels,demographics))
    minority = np.array(per_cluster)

    return (minority * dissimilarity).mean()


In [4]:
# Result table: one row per (year, k) pair, with one 't_<month>' and one
# 'm_<month>' column per month, all initialised to 0.0.
month_names = ['jan','feb','mar','april','may','june','july','aug','sep','oct','nov','dec']
columns = (['year','k']
           + ['t_' + month for month in month_names]
           + ['m_' + month for month in month_names])

frame_list = []
for year in range(2005,2017):
    for k in range(2,11):
        frame_list.append([str(year), k] + [0.00] * 24)

bias_frame = pd.DataFrame(data=frame_list, columns=columns)


In [5]:
def store_bias(folder,file,bias_value,k):
    """Write ``bias_value`` into the module-level ``bias_frame``.

    The row is selected by the year taken from ``folder`` (the text after
    'data_') together with ``k``. The column is derived from ``file``:
    'theft_<month>.csv' maps to 't_<month>', and '<prefix>theft_<month>.csv'
    maps to '<prefix><month>'.
    """
    year_label = folder.split('data_')[1]
    year_matches = (bias_frame.year == year_label).to_numpy()
    k_matches = (bias_frame.k == k).to_numpy()
    row_mask = year_matches & k_matches

    stem = file.split('.csv')[0]
    prefix,month = stem.split('theft_')
    column = (prefix if len(prefix) != 0 else 't_') + month

    bias_frame.loc[row_mask,column] = bias_value

In [6]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
import os
import warnings

def gen_coords(loc):
    """Parse a ``'(a, b)'`` location string into ``[a, b]`` floats.

    Side effect: appends ``b`` to the module-level list ``x`` and ``a`` to the
    module-level list ``y``; both lists must exist before this is called.
    Presumably ``a`` is latitude and ``b`` longitude (so ``x``/``y`` end up in
    lon/lat order) -- confirm against the input data.

    Raises IndexError if the string holds fewer than two comma-separated
    fields and ValueError if a field is not a number.
    """
    fields = loc[1:-1].split(',')
    # Built-in float: the ``np.float`` alias was removed from NumPy.
    first = float(fields[0])
    second = float(fields[1])
    x.append(second)
    y.append(first)
    return [first, second]

def percent_similarity(a,b):
    """Return the fraction of positions at which arrays ``a`` and ``b`` agree."""
    agreeing = a[a == b]
    return len(agreeing) / len(a)

# For every monthly CSV of every year: cluster the points with both metrics for
# k = 2..10, record the bias index for each k, and export the labelled points
# as GeoJSON (written with a '.js' extension).
for year in range(2005,2017):
    folder = 'data_' + str(year)
    for file in os.listdir('../data/' + folder):
        if not file.endswith('.csv'):
            continue

        df = pd.read_csv('../data/' + folder +'/' + file, sep =';')

        # gen_coords appends to these module-level lists as a side effect,
        # so they must be reset before every file.
        x = []
        y = []

        df['Points'] = df['Location'].apply(gen_coords)
        points = [Point(xy) for xy in zip(x,y)]
        crs = {'init': 'epsg:4326'}
        geo_df = GeoDataFrame(df,crs=crs, geometry=points)
        theft_both = geo_df.copy()

        test_list = df['Points'].tolist()
        X = np.array(test_list)

        for k in range(2,11):
            euclid = KMeans(k = k)
            geodesic = KMeans(k = k, euclid = False)
            centers = geodesic._init_centers(X)

            # Each model gets its own copy of the initial centers: a fit that
            # keeps and updates the array it is handed would otherwise make the
            # second model start from the first model's converged centers
            # instead of the shared random start.
            euclid.fit(X,centers = centers.copy())
            geodesic.fit(X,centers = centers.copy())

            bias_val = bias_index(X, geodesic.labels, euclid.labels, demographics, k)
            store_bias(folder,file,bias_val,k)
            theft_both.loc[:,'e_cluster' + 'K' + str(k)] = euclid.labels.copy()
            theft_both.loc[:,'g_cluster' + 'K' + str(k)] = geodesic.labels.copy()

        # 'Points' holds Python lists, which are dropped before export.
        theft_both = theft_both.drop('Points', axis=1)

        out_path = './datamound/'+ folder + '-' + file.split('.csv')[0] + '.js'

        # Remove a previous export first; a missing file is fine.
        try:
            os.remove(out_path)
        except FileNotFoundError:
            pass

        theft_both.to_file(out_path, driver='GeoJSON')
        print(out_path)
        


KeyboardInterrupt: 

In [10]:
# Inspect the bias table: one row per (year, k) pair, one column per monthly file.
bias_frame

Unnamed: 0,year,k,t_jan,t_feb,t_mar,t_april,t_may,t_june,t_july,t_aug,...,m_mar,m_april,m_may,m_june,m_july,m_aug,m_sep,m_oct,m_nov,m_dec
0,2005,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.090658,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2005,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.079655,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2005,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.105614,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2005,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2005,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2005,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2005,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2005,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2005,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2006,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import os
# Per-month export files, in the order they must appear inside each bundle.
ordered_names = ['theft_jan.js','theft_feb.js','theft_mar.js','theft_april.js',
            'theft_may.js','theft_june.js','theft_july.js','theft_aug.js',
            'theft_sep.js','theft_oct.js','theft_nov.js','theft_dec.js',
            'm_theft_jan.js','m_theft_feb.js','m_theft_mar.js','m_theft_april.js',
            'm_theft_may.js','m_theft_june.js','m_theft_july.js','m_theft_aug.js',
            'm_theft_sep.js','m_theft_oct.js','m_theft_nov.js','m_theft_dec.js']


def _bundle_years(var_name, years, out_path):
    """Concatenate the monthly exports of ``years`` into one JavaScript file.

    The output is ``var <var_name> = [<file>,<file>,...,];`` where the files
    are read from './datamound/data_<year>-<name>' for every year in ``years``
    and every name in ``ordered_names``, in that order. Each file name is
    printed as it is read. The output file is written once, after every input
    has been read.
    """
    pieces = ['var ' + var_name + ' = [']
    for year in years:
        year_string = 'data_'+str(year)+'-'
        for file in ordered_names:
            with open('./datamound/'+ year_string + file,'r') as reader:
                pieces.append(reader.read() + ',')
            print(file)

    with open(out_path,'w') as writer:
        writer.write(''.join(pieces) + '];')


_bundle_years('dataB', range(2011,2017), 'halfB.js')
_bundle_years('dataA', range(2005,2011), 'halfA.js')


In [11]:
# Export the bias table as a JavaScript assignment. to_json() with no path
# returns the JSON text, so the file is written once instead of being written,
# read back and rewritten.
data = bias_frame.to_json()
with open('bias.js','w') as w:
    w.write('var bias_data =' + data + ';')
    