In [174]:
#read data from https://data.cityofchicago.org/Education/Libraries-2011-Visitors-by-Location/xxwy-zyzu
# googled this data in https://toolbox.google.com/datasetsearch
# Note that in this data set the visit distribution is temporal (across months)
# the same process can be replicated if visit distribution is across users, user segments, users+monthly combination etc

import pandas as pd 
import numpy as np
import math
#data = pd.read_csv("userVisits-ijcai15/userVisits-Buda-allPOI.csv") #We can also choose other csv file for the testing e.g., UserVisits-Toro
data = pd.read_csv("/Users/tarun/Downloads/Libraries_-_2011_Visitors_by_Location.csv")

#data look like this- an aggregate user visits across month at a given location
data.head()


Unnamed: 0,LOCATION,ADDRESS,CITY,ZIP CODE,JANUARY,FEBRUARY,MARCH,APRIL,MAY,JUNE,JULY,AUGUST,SEPTEMBER,OCTOBER,NOVEMBER,DECEMBER,YTD
0,Albany Park,5150 N. Kimball Avenue,CHICAGO,60625.0,9604.0,10500.0,9050.0,9300.0,8271.0,10984,9986,11078,9453,10213,9377,9609,117425.0
1,Altgeld,13281 S. Corliss Avenue,CHICAGO,60827.0,5809.0,3899.0,5207.0,5201.0,4494.0,5760,3653,2414,4552,6891,5698,5079,58657.0
2,Archer Heights*,5055 S. Archer Avenue,CHICAGO,60632.0,9829.0,9394.0,11342.0,11114.0,9365.0,11247,10329,11231,10373,11364,10011,9054,124653.0
3,Austin,5615 W. Race Avenue,CHICAGO,60644.0,6713.0,6250.0,7054.0,9139.0,8857.0,9586,8352,10359,9151,10016,8461,8368,102306.0
4,Austin-Irving,6100 W. Irving Park Road,CHICAGO,60634.0,11556.0,9904.0,13214.0,13064.0,10969.0,12587,12596,13638,12542,13286,11868,10628,145852.0


In [175]:
# few basic preprocessing
# 1. to replace NaN with0
# 2. add a place identifier
data=data.fillna(0)
data['pid']=pd.Series(range(1,(data.shape[0])))
data.tail(3)

Unnamed: 0,LOCATION,ADDRESS,CITY,ZIP CODE,JANUARY,FEBRUARY,MARCH,APRIL,MAY,JUNE,JULY,AUGUST,SEPTEMBER,OCTOBER,NOVEMBER,DECEMBER,YTD,pid
78,Wrightwood-Ashburn*,8530 S. Kedzie Avenue,CHICAGO,60652.0,6546.0,5879.0,7460.0,7018.0,6297.0,8396,9288,8048,8398,8678,7056,6516,89580.0,79.0
79,Total,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,80.0
80,0,0,0,0.0,895715.0,785431.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,


In [176]:
# now melting the data frame to get the visit distribution across month for every place
df = pd.melt(data, id_vars=['LOCATION','ADDRESS','CITY','ZIP CODE','YTD','pid'], 
             var_name='month', value_name='Visits')

df.head(3)

# checking if numerical visits can be summed if it cant be summed, it means there is string somewhere
df.Visits.sum()

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [177]:
#we do find that there were strings so replacing it
df.Visits = df.Visits.convert_objects(convert_numeric=True)



For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  


In [178]:
def Entropy(givenPlaceid, df):
    l = df.loc[df.pid==givenPlaceid,].Visits.sum()
    if(l==0):
        l=1
    entropy = 0
    for givenMonth in df.month.unique():
        c = df.loc[(df.pid==givenPlaceid) & (df.month==givenMonth),].Visits.sum()
        if(c==0):
            c=1
        p = c/l
        entropy = entropy+p*math.log(p)
    entropy = entropy*(-1)
    return entropy




print("location_id", "Entropy")
for placeid in df.pid.unique():
    try:
        print(placeid, Entropy(placeid, df))
    except Exception as e:
        print(e)

    
#print(placeid, calculateEntropy(placeid, df))

('location_id', 'Entropy')
(1.0, 2.4817459496354504)
(2.0, 2.4561964646129133)
(3.0, 2.4817542864859039)
(4.0, 2.4741266761503842)
(5.0, 2.4805270179474581)
(6.0, 2.4716830970522894)
(7.0, 1.9384890097502456)
(8.0, 2.4771403500643552)
(9.0, 2.4731463087346053)
(10.0, 2.4789052328640087)
(11.0, 2.4351171575932735)
(12.0, 2.4763162571682402)
(13.0, 2.4822581348514219)
(14.0, 2.4785605033720266)
(15.0, 2.4570228924257829)
(16.0, 2.4766798503316467)
(17.0, 2.4762725303934014)
(18.0, 2.4793919591306683)
(19.0, 2.4713602351155326)
(20.0, 2.4679069358290429)
(21.0, 2.4803764016123178)
(22.0, 1.7832193831997407)
(23.0, 2.4668151022616307)
(24.0, 2.0743816253345679)
(25.0, 2.4675091248453684)
(26.0, 1.5945744517604314)
(27.0, 2.4698849451156666)
(28.0, 2.4326880920176248)
(29.0, 2.4701328505290112)
(30.0, 2.0585199265700034)
(31.0, 2.4687275056676863)
(32.0, 2.4816406241268583)
(33.0, 2.4695195695859762)
(34.0, 2.47724648616513)
(35.0, 2.4725011801147727)
(36.0, 2.4753407757884625)
(37.0, 2.466

In [179]:
# Unit test

# subsequent code is just an inference & common sensical check that serves as unit test
# we start with an intuition that entropy might be low for parks & high if place is not like park & more like street
# used to check if the entropy is atleast dissimilar 
# But we dont have places categorization, so we look at if we are able to tag places & derive it from the place description

In [180]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/tarun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [181]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tarun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [182]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/tarun/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [183]:
nltk.download('words')


[nltk_data] Downloading package words to /Users/tarun/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [184]:
#remove unwanted charectars from location

def stringify(location):
    try:
        pattern =re.compile('([^\s\w]|_)+')
        normalized_location=re.sub(pattern, '', location)
    except Exception as e:
        normalized_location='dummy'
    return(normalized_location)

df.LOCATION=df.LOCATION.map(lambda x:stringify(x))

In [221]:
# Now to check what categories of places did we get

from nltk import word_tokenize, pos_tag, ne_chunk
import numpy as np


def placeCategorizer(givenLocation):
    place_type_list=[]
    place_type_string=''
    try:
        for s in ne_chunk(pos_tag(word_tokenize(givenLocation))):
            place_type=str(s.flatten).split(' ')[4].strip(',\'').replace('Tree(\'','')
            place_type_list.append(place_type)
            place_type_string=place_type_string+'_'+place_type
    except Exception as e:
        place_type_list=[]
        place_type_string=''
    return(place_type_list)

df['LOCATION_Category']=df.LOCATION.map(lambda x:placeCategorizer(x))

# Here is the distribution of tags we got for the places,
# GPE	492
# GSP	12
# ORGANIZATION	324
# PERSON	287

#by observation we can clearly see ORGANIZATION stands for parks & we could treat the rest as non organization

In [222]:
df.head()


Unnamed: 0,LOCATION,ADDRESS,CITY,ZIP CODE,YTD,pid,month,Visits,LOCATION_Category
0,Albany Park,5150 N. Kimball Avenue,CHICAGO,60625.0,117425.0,1.0,JANUARY,9604.0,"[PERSON, ORGANIZATION]"
1,Altgeld,13281 S. Corliss Avenue,CHICAGO,60827.0,58657.0,2.0,JANUARY,5809.0,[GPE]
2,Archer Heights,5055 S. Archer Avenue,CHICAGO,60632.0,124653.0,3.0,JANUARY,9829.0,[]
3,Austin,5615 W. Race Avenue,CHICAGO,60644.0,102306.0,4.0,JANUARY,6713.0,[GPE]
4,AustinIrving,6100 W. Irving Park Road,CHICAGO,60634.0,145852.0,5.0,JANUARY,11556.0,[]


In [224]:
# for simplicity of doing unit test lets tag ORGANIZATION as parks- as observed from the data output
# and categorize non ORGANIZATION as non parks
def park_nonpark(givenLOCATION_Category):
    if('ORGANIZATION' in givenLOCATION_Category):
        return(1)
    else:
        return(0)
    
df['park_nonpark']= df.LOCATION_Category.map(lambda x:park_nonpark(x))
df.head()

Unnamed: 0,LOCATION,ADDRESS,CITY,ZIP CODE,YTD,pid,month,Visits,LOCATION_Category,park_nonpark
0,Albany Park,5150 N. Kimball Avenue,CHICAGO,60625.0,117425.0,1.0,JANUARY,9604.0,"[PERSON, ORGANIZATION]",1
1,Altgeld,13281 S. Corliss Avenue,CHICAGO,60827.0,58657.0,2.0,JANUARY,5809.0,[GPE],0
2,Archer Heights,5055 S. Archer Avenue,CHICAGO,60632.0,124653.0,3.0,JANUARY,9829.0,[],0
3,Austin,5615 W. Race Avenue,CHICAGO,60644.0,102306.0,4.0,JANUARY,6713.0,[GPE],0
4,AustinIrving,6100 W. Irving Park Road,CHICAGO,60634.0,145852.0,5.0,JANUARY,11556.0,[],0


In [227]:
# adding the entropies to df
def Entropy(givenPlaceid, df):
    l = df.loc[df.pid==givenPlaceid,].Visits.sum()
    if(l==0):
        l=1
    entropy = 0
    for givenMonth in df.month.unique():
        c = df.loc[(df.pid==givenPlaceid) & (df.month==givenMonth),].Visits.sum()
        if(c==0):
            c=1
        p = c/l
        entropy = entropy+p*math.log(p)
    entropy = entropy*(-1)
    return entropy

places_summary=pd.DataFrame()


entropy_list=[]

for i,r in df.iterrows():
    try:
        entropy_list.append(Entropy(r.pid, df))
    except Exception as e:
        print(e)

df['location_entropy']=pd.Series(entropy_list)
df.head()

Unnamed: 0,LOCATION,ADDRESS,CITY,ZIP CODE,YTD,pid,month,Visits,LOCATION_Category,park_nonpark,location_entropy
0,Albany Park,5150 N. Kimball Avenue,CHICAGO,60625.0,117425.0,1.0,JANUARY,9604.0,"[PERSON, ORGANIZATION]",1,2.481746
1,Altgeld,13281 S. Corliss Avenue,CHICAGO,60827.0,58657.0,2.0,JANUARY,5809.0,[GPE],0,2.456196
2,Archer Heights,5055 S. Archer Avenue,CHICAGO,60632.0,124653.0,3.0,JANUARY,9829.0,[],0,2.481754
3,Austin,5615 W. Race Avenue,CHICAGO,60644.0,102306.0,4.0,JANUARY,6713.0,[GPE],0,2.474127
4,AustinIrving,6100 W. Irving Park Road,CHICAGO,60634.0,145852.0,5.0,JANUARY,11556.0,[],0,2.480527


In [228]:
#df[['park_nonpark','location_entropy']].groupby(['park_nonpark'])['location_entropy'].size().reset_index(name='averageentropy')
pd.pivot_table(df,index=['park_nonpark'],values=["location_entropy"],aggfunc=np.mean)

Unnamed: 0_level_0,location_entropy
park_nonpark,Unnamed: 1_level_1
0,2.33063
1,2.41854
