In [1]:
import numpy as np
import os
import pandas as pd
import geopandas as gpd
import shapely
from scipy.spatial.distance import cdist

In [2]:
class KMeans(object):
    """Lloyd-style k-means with a Euclidean or haversine (great-circle) metric.

    Rows of the data matrix are points. With ``euclid=False`` the first two
    entries of every row are passed to the haversine formula as
    (latitude, longitude) in degrees and distances are expressed in km.
    Whatever the metric, a center is always updated to the arithmetic mean of
    the rows assigned to it.

    Attributes populated by :meth:`fit`:
        X: the data matrix that was clustered.
        cluster_centers: float array with one row per cluster.
        labels: integer array giving the cluster index of every row of ``X``.
    """

    # Earth radius (km) that scales the haversine central angle into a distance.
    EARTH_RADIUS_KM = 6372.8

    def __init__(self, k=8, euclid = True):
        """Store the number of clusters and select the distance metric.

        Args:
            k: number of clusters.
            euclid: True for scipy's 'euclidean' metric, False for haversine.
        """
        self.k = k
        if euclid:
            self._distance = 'euclidean'
        else:
            self._distance = self._distance_haversine

    def _step(self):
        """Compute distances, assign labels, then recompute every center."""
        distance = cdist(self.X, self.cluster_centers, metric=self._distance)
        self.labels = distance.argmin(1)
        for cluster in range(self.k):
            points = self.X[self.labels == cluster]
            if len(points) == 0:
                # Empty cluster: re-seed it on the data point that is, on
                # average, farthest from all the *other* centers. The mean is
                # taken per point (axis=1) and the winning row of X is used as
                # the new center, not its index.
                others = np.delete(self.cluster_centers, cluster, 0)
                to_others = cdist(self.X, others, metric=self._distance)
                farthest = to_others.mean(axis=1).argmax()
                self.cluster_centers[cluster] = self.X[farthest]
            else:
                self.cluster_centers[cluster] = np.mean(points, 0)

    def _distance_haversine(self,a,b):
        """Great-circle distance in km between two (lat, lon) points in degrees."""
        lat_1, lon_1, lat_2, lon_2 = map(np.radians,[a[0],a[1],b[0],b[1]])
        d_lat = lat_2 - lat_1
        d_lon = lon_2 - lon_1

        arc = np.sin(d_lat/2.0)**2 + np.cos(lat_1)*np.cos(lat_2)*np.sin(d_lon/2)**2

        c = 2 * np.arcsin(np.sqrt(arc))
        return self.EARTH_RADIUS_KM * c

    def _init_centers(self, X):
        """Pick ``k`` distinct rows of ``X`` at random to use as starting centers.

        Raises:
            ValueError: if ``X`` has fewer than ``k`` distinct rows, since the
                update step needs exactly ``k`` centers.
        """
        unique = np.unique(X, axis=0)
        if len(unique) < self.k:
            raise ValueError(
                'need at least k=%d distinct points to initialise the centers, got %d'
                % (self.k, len(unique)))
        index = np.random.permutation(len(unique))[:self.k]
        return unique[index]

    def fit(self,X, centers = None):
        """Cluster ``X``, iterating until the centers stop changing.

        Args:
            X: 2-D array-like of points, one row per point.
            centers: optional initial centers (one row per cluster). When
                omitted, ``k`` distinct rows of ``X`` are drawn at random.
                The array passed in is never modified.

        Returns:
            self, with ``cluster_centers`` and ``labels`` populated.
        """
        self.X = np.asarray(X)
        self.labels = np.zeros(len(self.X))
        if centers is None:
            centers = self._init_centers(self.X)
        # Work on a private float copy: _step updates the centers in place, so
        # aliasing the caller's array would leak this model's result into any
        # other model seeded from the same array, and an integer array would
        # silently truncate the means.
        self.cluster_centers = np.array(centers, dtype=float)
        # Always run at least one step, then stop once a step leaves the
        # centers unchanged. This works for any dimensionality and any start.
        while True:
            old_centers = self.cluster_centers.copy()
            self._step()
            if np.array_equal(old_centers, self.cluster_centers):
                break
        return self

In [3]:
from shapely.geometry import Point
from geopandas import GeoDataFrame

# Census layer read from a GeoJSON file next to the notebook. minority_probability
# reads its 'geometry' column and the per-race count columns from this frame.
demographics = gpd.read_file('./census.geoJSON')

def gen_coords(loc):
    """Parse a bracketed ``'[a, b]'`` string into ``[float(a), float(b)]``.

    The first and last characters of ``loc`` are dropped (the brackets) and
    the remainder is split on commas; only the first two fields are used.

    Legacy side effect: when notebook-level accumulators named ``x`` and ``y``
    exist, the second value is appended to ``x`` and the first to ``y``. When
    they do not exist the function simply returns the parsed pair.

    Args:
        loc: string such as ``'[43.05, -87.95]'``.

    Returns:
        A two-element list of Python floats, in the order they appear in ``loc``.
    """
    fields = loc[1:-1].split(',')
    # Built-in float replaces the np.float alias, which NumPy has removed.
    first, second = float(fields[0]), float(fields[1])

    scope = globals()
    xs, ys = scope.get('x'), scope.get('y')
    if xs is not None and ys is not None:
        xs.append(second)
        ys.append(first)
    return [first, second]

def point_similarity(X,geo_labels, euc_labels,k):
    """Point-similarity index of every geodesic cluster against the Euclidean clustering.

    For geodesic cluster ``g`` the index is::

        sum over e of (n[g, e] / euc_total[e]) * (n[g, e] / geo_total[g])

    where ``n[g, e]`` is the number of points labelled ``g`` by the geodesic
    run and ``e`` by the Euclidean run. A cluster that matches one Euclidean
    cluster exactly scores 1.

    Args:
        X: the clustered points. Unused; kept for interface compatibility.
        geo_labels: integer cluster label per point from the geodesic run.
        euc_labels: integer cluster label per point from the Euclidean run.
        k: number of clusters; labels must lie in ``range(k)``.

    Returns:
        Float array of length ``k``, one index per geodesic cluster. Terms
        involving an empty cluster contribute 0 rather than NaN.

    Raises:
        ValueError: if the two label arrays differ in length.
    """
    geo = np.asarray(geo_labels).astype(int)
    euc = np.asarray(euc_labels).astype(int)
    if geo.shape != euc.shape:
        raise ValueError('geo_labels and euc_labels must have the same length')

    # composition[g, e] = number of points in geodesic cluster g and euclidean cluster e.
    composition = np.zeros((k, k), dtype=int)
    np.add.at(composition, (geo, euc), 1)

    euc_totals = composition.sum(axis=0)[np.newaxis, :]
    geo_totals = composition.sum(axis=1)[:, np.newaxis]

    # Guarded divisions: a zero total means the numerator is zero as well, so
    # the share is defined as 0 instead of evaluating 0/0.
    euc_share = np.divide(composition, euc_totals,
                          out=np.zeros((k, k)), where=euc_totals > 0)
    geo_share = np.divide(composition, geo_totals,
                          out=np.zeros((k, k)), where=geo_totals > 0)

    return (euc_share * geo_share).sum(axis=1)

def minority_probability(X,cluster_number,geo_labels,demographics):
    """Estimated share of non-White population under one geodesic cluster.

    The convex hull of the cluster's points is intersected with every census
    tract. Each tract's race counts are weighted by the fraction of the
    tract's area covered by the hull and summed; the result is the summed
    weight of every column except 'White' divided by the total.

    Args:
        X: array of points, one ``[lat, lon]`` row per point.
        cluster_number: index of the cluster to evaluate.
        geo_labels: cluster label per row of ``X``.
        demographics: frame with a 'geometry' column of tract shapes and the
            race-count columns listed in ``race_columns`` below.

    Returns:
        Value in [0, 1]; 0 when the hull overlaps no populated tract (this
        includes clusters whose hull has no area).
    """
    race_columns = ['White', 'Black or African American', 'American Indian and Ala Native',
                    'Asian', 'Native Hawaiian/other Pac Isl', 'Multiple Race',
                    'Other Race']

    points = X[geo_labels == cluster_number]
    # geoJSON puts points in Long/Lat order
    # but points are in lat/long earlier
    hull = shapely.geometry.multipoint.MultiPoint([[p[1],p[0]] for p in points]).convex_hull

    # Pull the counts out once and walk rows positionally, so the result does
    # not depend on the frame carrying a default 0..n-1 index.
    counts = demographics[race_columns].to_numpy(dtype=float)
    pop = np.zeros(len(race_columns))
    for census_tract, tract_counts in zip(demographics['geometry'], counts):
        # A missing or zero-area tract cannot be apportioned by area; skip it.
        if census_tract is None or census_tract.area == 0:
            continue
        overlap = hull.intersection(census_tract).area / census_tract.area
        if overlap != 0:
            pop = pop + tract_counts * overlap

    if np.all(pop == 0):
        return 0

    # Column 0 is 'White'; everything after it counts as minority.
    return (pop[1:]/np.sum(pop)).sum()

def bias_index(X, geo_labels, euc_labels, demographics, k):
    """Mean potential bias over the ``k`` geodesic clusters.

    Each cluster's potential bias is its minority probability multiplied by
    its dissimilarity (one minus the point-similarity index) from the
    Euclidean clustering. Returns 0 immediately when both labelings agree on
    every point.
    """
    labelings_agree = np.all(geo_labels == euc_labels)
    if labelings_agree:
        return 0

    similarity = point_similarity(X, geo_labels, euc_labels, k)
    dissimilarity = 1 - similarity

    per_cluster_minority = []
    for cluster_id in range(k):
        per_cluster_minority.append(
            minority_probability(X, cluster_id, geo_labels, demographics))

    weighted = np.array(per_cluster_minority) * dissimilarity
    return weighted.mean()


In [4]:
# Result table: one row per (year, k) pair, holding 24 monthly bias values
# (a 't_' and an 'm_' column for each month), all initialised to zero.
month_suffixes = ['jan', 'feb', 'mar', 'april', 'may', 'june', 'july',
                  'aug', 'sep', 'oct', 'nov', 'dec']

columns = (['year', 'k']
           + ['t_' + suffix for suffix in month_suffixes]
           + ['m_' + suffix for suffix in month_suffixes])

# Years are stored as strings, k as an integer, monthly values as floats.
frame_list = [
    [str(year_value), k_value] + [0.00] * 24
    for year_value in range(2005, 2017)
    for k_value in range(2, 11)
]

bias_frame = pd.DataFrame(data=frame_list, columns=columns)


In [5]:
def store_bias(folder,file,bias_value,k):
    """Write ``bias_value`` into the global ``bias_frame`` cell for one (year, k, month).

    The year is the text after 'data_' in ``folder``. The column is built from
    ``file``: the '.csv' suffix is dropped and the rest is split on 'theft_';
    an empty prefix maps to the 't_<month>' column, otherwise the column is
    '<prefix><month>'.
    """
    target_year = folder.split('data_')[1]
    year_matches = np.array(bias_frame.year == target_year)
    k_matches = np.array(bias_frame.k == k)
    row_mask = np.logical_and(year_matches, k_matches)

    stem = file.split('.csv')[0]
    prefix, month = stem.split('theft_')
    column = (prefix + month) if len(prefix) != 0 else ('t_' + month)

    bias_frame.loc[row_mask, column] = bias_value

In [27]:
# Scratch check: load the 2005 robbery file and flag which rows fall in January.
# The file is read without a header row, so the column names are supplied here.
name = ['Incident number','Date','Time','Police District','Offense 1', 'Offense 2', 'Offense 3',
       'Offense 4', 'Offense 5','Address','Aldermanic District', 'Lat', 'Long']
df = pd.read_csv('../data/yearly_data/ROBBERY/latlng/2005_full.csv', header=None,names=name)
# Dates are parsed as month/day/four-digit-year.
df.Date = pd.to_datetime(df.Date, format='%m/%d/%Y')
# Bare last expression: displays the boolean mask (True where the month is January).
df.Date.dt.month == 1

0       False
1        True
2        True
3        True
4        True
5        True
6        True
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14       True
15       True
16       True
17       True
18       True
19       True
20       True
21       True
22       True
23       True
24       True
25       True
26      False
27       True
28       True
29       True
        ...  
2999    False
3000    False
3001    False
3002    False
3003    False
3004    False
3005    False
3006    False
3007    False
3008    False
3009    False
3010    False
3011    False
3012    False
3013    False
3014    False
3015    False
3016    False
3017    False
3018    False
3019    False
3020    False
3021    False
3022    False
3023    False
3024    False
3025    False
3026    False
3027    False
3028    False
Name: Date, Length: 3029, dtype: bool

In [93]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
import os
import warnings

def gen_coords(loc):
    """Parse a bracketed ``'[a, b]'`` string into ``[float(a), float(b)]``.

    The first and last characters of ``loc`` are dropped (the brackets) and
    the remainder is split on commas; only the first two fields are used.

    Legacy side effect: when notebook-level accumulators named ``x`` and ``y``
    exist, the second value is appended to ``x`` and the first to ``y``. When
    they do not exist the function simply returns the parsed pair.

    Args:
        loc: string such as ``'[43.05, -87.95]'``.

    Returns:
        A two-element list of Python floats, in the order they appear in ``loc``.
    """
    fields = loc[1:-1].split(',')
    # Built-in float replaces the np.float alias, which NumPy has removed.
    first, second = float(fields[0]), float(fields[1])

    scope = globals()
    xs, ys = scope.get('x'), scope.get('y')
    if xs is not None and ys is not None:
        xs.append(second)
        ys.append(first)
    return [first, second]

def percent_similarity(a,b):
    """Fraction of positions at which ``a`` and ``b`` hold equal values.

    Args:
        a, b: equally shaped label sequences (lists or NumPy arrays).

    Returns:
        Number of matching elements divided by ``len(a)``, as a float;
        0.0 when ``a`` is empty.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if len(a) == 0:
        # Nothing to compare: report no similarity instead of dividing by zero.
        return 0.0
    return int(np.count_nonzero(a == b)) / len(a)


# Cluster every crime file month by month and export one GeoJSON file (with a
# .js extension) per (file, month, crime folder). Each exported feature
# carries its Euclidean and geodesic k-means label for every k in 2..10.
YEARLY_DATA_DIR = '../data/yearly_data/'
OUTPUT_DIR = './justin_datamound/'

# The CSV files have no header row, so the column names are supplied here.
name = ['Incident number','Date','Time','Police District','Offense 1', 'Offense 2', 'Offense 3',
       'Offense 4', 'Offense 5','Address','Aldermanic District', 'Lat', 'Long']

os.makedirs(OUTPUT_DIR, exist_ok=True)

for folder in os.listdir(YEARLY_DATA_DIR):
    latlng_dir = YEARLY_DATA_DIR + folder + '/latlng'
    # Skip stray entries that are not crime folders with a latlng directory.
    if not os.path.isdir(latlng_dir):
        continue

    for file in os.listdir(latlng_dir):
        if not file.endswith('.csv'):
            continue

        # Read each file once, with explicit column names.
        df = pd.read_csv(latlng_dir + '/' + file, header=None, names=name)
        df.Date = pd.to_datetime(df.Date, format='%m/%d/%Y')
        df = df.drop('Aldermanic District', axis = 1)

        for month in range(1,13):
            frame = df[df.Date.dt.month == month].copy().reset_index(drop=True)
            lat = frame['Lat'].tolist()
            lon = frame['Long'].tolist()

            # Per-row derived values, built in one pass each rather than with
            # row-by-row .loc writes.
            frame['Location'] = [f'[{la}, {lo}]' for la, lo in zip(lat, lon)]
            # GeoJSON geometry is (x, y) = (long, lat); the clustering matrix
            # keeps (lat, long), the order the haversine metric reads.
            points = [Point((lo, la)) for la, lo in zip(lat, lon)]
            X = np.array(list(zip(lat, lon)), dtype=float)
            string_dates = [f'{d.month}/{d.day}/{d.year}' for d in frame['Date']]

            # NOTE(review): the {'init': ...} CRS form is deprecated by recent
            # pyproj/geopandas in favour of 'EPSG:4326'; left unchanged here
            # to avoid altering behaviour on the installed version.
            crs = {'init': 'epsg:4326'}
            geo_df = GeoDataFrame(frame,crs=crs, geometry=points)
            crime = geo_df.drop(['Lat','Long'], axis =1)
            # Dates are exported as plain 'm/d/yyyy' strings; this does not
            # depend on k, so it is done once per month.
            crime['Date'] = string_dates

            for k in range(2,11):
                euclid = KMeans(k = k)
                geodesic = KMeans(k = k, euclid = False)
                centers = geodesic._init_centers(X)

                # Both runs must start from the same seeds. fit() updates its
                # centers in place, so each model gets its own copy; sharing
                # one array would start the geodesic run from the Euclidean
                # run's converged centers.
                euclid.fit(X,centers = centers.copy())
                geodesic.fit(X,centers = centers.copy())

                # Bias computation is currently disabled:
                #    bias_val = bias_index(X, geodesic.labels, euclid.labels, demographics, k)
                #    store_bias(folder,file,bias_val,k)
                crime['e_cluster' + 'K' + str(k)] = euclid.labels.copy()
                crime['g_cluster' + 'K' + str(k)] = geodesic.labels.copy()

            out_path = OUTPUT_DIR + file.split('_full.csv')[0] + '_' + str(month) + '_' + folder  + '.js'

            # Remove any previous export first, so to_file always writes a fresh file.
            try:
                os.remove(out_path)
            except FileNotFoundError:
                pass

            crime.to_file(out_path, driver='GeoJSON')
            print(out_path)



./justin_datamound/2016_1_ROBBERY.js
./justin_datamound/2016_2_ROBBERY.js
./justin_datamound/2016_3_ROBBERY.js
./justin_datamound/2016_4_ROBBERY.js
./justin_datamound/2016_5_ROBBERY.js
./justin_datamound/2016_6_ROBBERY.js
./justin_datamound/2016_7_ROBBERY.js
./justin_datamound/2016_8_ROBBERY.js
./justin_datamound/2016_9_ROBBERY.js
./justin_datamound/2016_10_ROBBERY.js
./justin_datamound/2016_11_ROBBERY.js
./justin_datamound/2016_12_ROBBERY.js
./justin_datamound/2012_1_ROBBERY.js
./justin_datamound/2012_2_ROBBERY.js
./justin_datamound/2012_3_ROBBERY.js
./justin_datamound/2012_4_ROBBERY.js
./justin_datamound/2012_5_ROBBERY.js
./justin_datamound/2012_6_ROBBERY.js
./justin_datamound/2012_7_ROBBERY.js
./justin_datamound/2012_8_ROBBERY.js
./justin_datamound/2012_9_ROBBERY.js
./justin_datamound/2012_10_ROBBERY.js
./justin_datamound/2012_11_ROBBERY.js
./justin_datamound/2012_12_ROBBERY.js
./justin_datamound/2010_1_ROBBERY.js
./justin_datamound/2010_2_ROBBERY.js
./justin_datamound/2010_3_ROBBER

./justin_datamound/2008_5_SIMPLE ASSAULT.js
./justin_datamound/2008_6_SIMPLE ASSAULT.js
./justin_datamound/2008_7_SIMPLE ASSAULT.js
./justin_datamound/2008_8_SIMPLE ASSAULT.js
./justin_datamound/2008_9_SIMPLE ASSAULT.js
./justin_datamound/2008_10_SIMPLE ASSAULT.js
./justin_datamound/2008_11_SIMPLE ASSAULT.js
./justin_datamound/2008_12_SIMPLE ASSAULT.js
./justin_datamound/2015_1_SIMPLE ASSAULT.js
./justin_datamound/2015_2_SIMPLE ASSAULT.js
./justin_datamound/2015_3_SIMPLE ASSAULT.js
./justin_datamound/2015_4_SIMPLE ASSAULT.js
./justin_datamound/2015_5_SIMPLE ASSAULT.js
./justin_datamound/2015_6_SIMPLE ASSAULT.js
./justin_datamound/2015_7_SIMPLE ASSAULT.js
./justin_datamound/2015_8_SIMPLE ASSAULT.js
./justin_datamound/2015_9_SIMPLE ASSAULT.js
./justin_datamound/2015_10_SIMPLE ASSAULT.js
./justin_datamound/2015_11_SIMPLE ASSAULT.js
./justin_datamound/2015_12_SIMPLE ASSAULT.js
./justin_datamound/2014_1_SIMPLE ASSAULT.js
./justin_datamound/2014_2_SIMPLE ASSAULT.js
./justin_datamound/2014_3_

In [94]:
import os
def write_js_bundle(var_name, years, out_path,
                    crimes=('ROBBERY', 'SIMPLE ASSAULT'),
                    src_dir='./justin_datamound/'):
    """Concatenate the monthly export files into one JavaScript array literal.

    The output has the form ``var <var_name> = [<file>,<file>,...,];`` with
    the files ordered by year, then crime type, then month (1..12).

    Args:
        var_name: name of the JavaScript variable to declare.
        years: iterable of years to include.
        out_path: path of the bundle file to write.
        crimes: crime-type names used in the source file names.
        src_dir: directory holding the '<year>_<month>_<crime>.js' files.

    Raises:
        FileNotFoundError: if any expected monthly file is missing; in that
            case no bundle is written.
    """
    pieces = ['var ' + var_name + ' = [']
    for year in years:
        for crime_type in crimes:
            for month in range(1, 13):
                file_string = str(year) + '_' + str(month) + '_' + crime_type + '.js'
                with open(src_dir + file_string, 'r') as reader:
                    pieces.append(reader.read() + ',')
                print(file_string)
    pieces.append('];')

    # Written once, after everything has been read, instead of being
    # rewritten after every year.
    with open(out_path, 'w') as writer:
        writer.write(''.join(pieces))


write_js_bundle('dataB', range(2011, 2017), 'jhalfB.js')
write_js_bundle('dataA', range(2005, 2011), 'jhalfA.js')




2011_1_ROBBERY.js
2011_2_ROBBERY.js
2011_3_ROBBERY.js
2011_4_ROBBERY.js
2011_5_ROBBERY.js
2011_6_ROBBERY.js
2011_7_ROBBERY.js
2011_8_ROBBERY.js
2011_9_ROBBERY.js
2011_10_ROBBERY.js
2011_11_ROBBERY.js
2011_12_ROBBERY.js
2011_1_SIMPLE ASSAULT.js
2011_2_SIMPLE ASSAULT.js
2011_3_SIMPLE ASSAULT.js
2011_4_SIMPLE ASSAULT.js
2011_5_SIMPLE ASSAULT.js
2011_6_SIMPLE ASSAULT.js
2011_7_SIMPLE ASSAULT.js
2011_8_SIMPLE ASSAULT.js
2011_9_SIMPLE ASSAULT.js
2011_10_SIMPLE ASSAULT.js
2011_11_SIMPLE ASSAULT.js
2011_12_SIMPLE ASSAULT.js
2012_1_ROBBERY.js
2012_2_ROBBERY.js
2012_3_ROBBERY.js
2012_4_ROBBERY.js
2012_5_ROBBERY.js
2012_6_ROBBERY.js
2012_7_ROBBERY.js
2012_8_ROBBERY.js
2012_9_ROBBERY.js
2012_10_ROBBERY.js
2012_11_ROBBERY.js
2012_12_ROBBERY.js
2012_1_SIMPLE ASSAULT.js
2012_2_SIMPLE ASSAULT.js
2012_3_SIMPLE ASSAULT.js
2012_4_SIMPLE ASSAULT.js
2012_5_SIMPLE ASSAULT.js
2012_6_SIMPLE ASSAULT.js
2012_7_SIMPLE ASSAULT.js
2012_8_SIMPLE ASSAULT.js
2012_9_SIMPLE ASSAULT.js
2012_10_SIMPLE ASSAULT.js
2012_11_

In [9]:
# Export the bias table as a JavaScript global: `var bias_data =[...records...];`
# The JSON is serialised in memory and the file is written once, so bias.js
# never exists in an intermediate plain-JSON state.
data = bias_frame.to_json(orient='records')
with open('bias.js','w') as w:
    w.write('var bias_data =' + data + ';')
    