# Data Analysis and Visualization for URP 353, Assignment 2

In [1]:
import altair as alt
import pandas as pd
import scipy
from scipy.stats import entropy
from vega_datasets import data

### Global Viz Settings

In [2]:
# Apply Altair's 'quartz' theme to the charts rendered after this point.
alt.themes.enable('quartz')

ThemeRegistry.enable('quartz')

## Food Diversity by City

In [3]:
# Load the per-city cuisine table. The CSV's unnamed first column is renamed
# to 'index' (presumably a row index written out when the file was saved).
df = pd.read_csv('food_diversity.csv').rename(columns={'Unnamed: 0': 'index'})

In [4]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,index,city,category,category_name,M49_country_code,country,total_business_count,sample_rating,sample_review_count
0,0,"New York, New York",afghani,Afghan,,,18,3.75,150.111111
1,1,"New York, New York",african,African,,,48,4.114583,115.645833
2,2,"New York, New York",arabian,Arabian,682.0,Saudi Arabia,5,3.9,51.6
3,3,"New York, New York",argentine,Argentine,32.0,Argentina,21,3.904762,182.857143
4,4,"New York, New York",armenian,Armenian,51.0,Armenia,5,4.5,27.4


### NYC Analysis

In [5]:
# Average the numeric metrics per cuisine category for New York only.
# Only the numeric columns are selected: 'city' is a string column and
# 'category_name' is the grouping key, and pandas >= 2.0 raises a TypeError
# when asked to average non-numeric columns instead of silently dropping them.
NYC = (
    df[df['city'] == 'New York, New York']
    .groupby(['category_name'])[
        ['total_business_count', 'sample_rating', 'sample_review_count']
    ]
    .mean()
)

In [6]:
# Each category's share of all NYC businesses, stored as a 0-1 fraction
# (the charts below format it as a percentage).
NYC['%_of_total'] = NYC['total_business_count'].div(NYC['total_business_count'].sum())

In [7]:
 # Ten categories holding the largest share of NYC businesses.
 NYC.sort_values(by = '%_of_total', ascending = False).head(10)

Unnamed: 0_level_0,total_business_count,sample_rating,sample_review_count,%_of_total
category_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chinese,1700,4.1,1196.06,0.132781
Italian,1400,4.16,1562.34,0.109349
Mexican,1300,4.13,928.66,0.101539
American (New),1100,4.1,1793.86,0.085917
American (Traditional),1100,4.01,1280.82,0.085917
Japanese,939,4.11,1496.8,0.073342
Latin American,677,4.1,900.94,0.052878
Caribbean,545,4.13,339.12,0.042568
Mediterranean,490,4.15,903.42,0.038272
Thai,402,4.11,1103.18,0.031399


Top rated ethnic foods:

In [8]:
 # Ten best-rated categories, ignoring categories with 10 or fewer businesses
 # so that tiny samples do not dominate the ranking.
 NYC[NYC.total_business_count > 10].sort_values(by = 'sample_rating', ascending = False).head(10)

Unnamed: 0_level_0,total_business_count,sample_rating,sample_review_count,%_of_total
category_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
New Mexican Cuisine,26,4.307692,122.038462,0.002031
Polish,29,4.275862,89.517241,0.002265
Middle Eastern,285,4.2,787.58,0.02226
Australian,13,4.192308,267.846154,0.001015
Halal,369,4.19,639.18,0.028821
Italian,1400,4.16,1562.34,0.109349
Mediterranean,490,4.15,903.42,0.038272
Mexican,1300,4.13,928.66,0.101539
Caribbean,545,4.13,339.12,0.042568
African,48,4.114583,115.645833,0.003749


Most reviewed categories

In [9]:
  # Ten categories with the highest average sampled review count, again
  # restricted to categories with more than 10 businesses.
  NYC[NYC.total_business_count > 10].sort_values(by = 'sample_review_count', ascending = False).head(10)

Unnamed: 0_level_0,total_business_count,sample_rating,sample_review_count,%_of_total
category_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
American (New),1100,4.1,1793.86,0.085917
Italian,1400,4.16,1562.34,0.109349
Japanese,939,4.11,1496.8,0.073342
American (Traditional),1100,4.01,1280.82,0.085917
Chinese,1700,4.1,1196.06,0.132781
French,300,4.09,1151.08,0.023432
Thai,402,4.11,1103.18,0.031399
Korean,229,4.08,952.58,0.017886
Mexican,1300,4.13,928.66,0.101539
Mediterranean,490,4.15,903.42,0.038272


### Visualize NYC Food diversity

In [10]:
# One point per category, positioned on the y axis by its share of NYC
# businesses; hovering shows the category name, share and raw count.
nyc_share_axis = alt.Axis(format='.2p', title='Percent of Total Businesses')
nyc_tooltips = [
    alt.Tooltip('category_name:N', title='Ethnic Category'),
    alt.Tooltip('%_of_total:Q', format='.2%', title='Percentage of Total'),
    alt.Tooltip('total_business_count:Q', title='Count'),
]
viz_nyc = (
    alt.Chart(NYC.reset_index())
    .mark_point()
    .encode(
        y=alt.Y('%_of_total:Q', axis=nyc_share_axis),
        color=alt.Color('category_name:N', legend=None),
        tooltip=nyc_tooltips,
    )
    # Zoom/pan is bound to the y scale only.
    .interactive(bind_x=False)
)

In [11]:
# Render the NYC chart.
viz_nyc

## Compare NYC Food diversity with other major US cities interactively

In [12]:
# Per-(city, category) averages of the numeric metrics. Only numeric columns
# are aggregated: the original selection also contained the string grouping
# keys, which pandas >= 2.0 refuses to average (TypeError).
metric_columns = ['total_business_count', 'sample_rating', 'sample_review_count']
cities = df.groupby(['city', 'category_name'])[metric_columns].mean()
print(cities.shape)

# Attach each city's total business count as 'total_business_count_by_city';
# the join matches on the 'city' level of the index. The shape is printed
# before and after to confirm the join adds a column without adding rows.
city_totals = df.groupby(['city'])[['total_business_count']].sum()
cities = cities.join(city_totals, on='city', rsuffix='_by_city')
print(cities.shape)

# Category share within its own city, stored as a 0-1 fraction.
cities['%_of_total'] = cities['total_business_count'] / cities['total_business_count_by_city']

(2672, 3)
(2672, 4)


In [13]:
# Spot-check one randomly chosen (city, category) row.
cities.sample(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_business_count,sample_rating,sample_review_count,total_business_count_by_city,%_of_total
city,category_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Nashville-Davidson, Tennessee",Brazilian,2,4.25,271.0,998,0.002004


### Side-by-side scatter

In [14]:
# City that is pre-selected in the dropdown when the chart first renders.
default_city = {'city': 'Ann Arbor, Michigan'}

# Dropdown offering every city in the dataset; the selection bound to it
# filters the chart down to the chosen city.
input_dropdown = alt.binding_select(
    options=df['city'].unique(),
    name='Select city: ',
)
city_selection = alt.selection_single(
    fields=['city'],
    bind=input_dropdown,
    init=default_city,
)

city_tooltips = [
    alt.Tooltip('category_name:N', title='Ethnic Category'),
    alt.Tooltip('%_of_total:Q', format='.2%', title='Percentage of Total'),
    alt.Tooltip('total_business_count:Q', title='Count'),
]
viz_cities = (
    alt.Chart(cities.reset_index())
    .add_selection(city_selection)
    .transform_filter(city_selection)
    .mark_point()
    .encode(
        # No axis is drawn for this chart's y channel.
        y=alt.Y('%_of_total:Q', axis=None),
        color=alt.Color('category_name:N', legend=None),
        tooltip=city_tooltips,
    )
    .interactive(bind_x=False)
)

In [15]:
# Put the NYC chart and the selectable-city chart side by side on a shared
# y scale so the two sets of shares are directly comparable.
(viz_nyc | viz_cities).resolve_scale( y = 'shared' )

### Slope plot

In [16]:
# The two cities compared in the slope plot; the list order is also used as
# the x-axis sort order of the slope charts.
selected_cities = ['New York, New York', 'Ann Arbor, Michigan']

In [17]:
# Flatten the (city, category) index into columns and keep only the rows
# belonging to the two selected cities, then spot-check five random rows.
cities_pair = cities.reset_index().loc[
    lambda frame: frame['city'].isin(selected_cities)
]
cities_pair.sample(5)

Unnamed: 0,city,category_name,total_business_count,sample_rating,sample_review_count,total_business_count_by_city,%_of_total
1665,"New York, New York",Austrian,8,4.125,298.625,13032,0.000614
79,"Ann Arbor, Michigan",Mongolian,6,3.166667,111.333333,697,0.008608
1661,"New York, New York",Arabian,5,3.9,51.6,13032,0.000384
80,"Ann Arbor, Michigan",Moroccan,3,4.666667,124.666667,697,0.004304
73,"Ann Arbor, Michigan",Latin American,5,4.3,98.0,697,0.007174


In [18]:
# End-point markers of the slope plot: one filled dot per (city, category).
circle_tooltips = [
    alt.Tooltip('category_name:N', title='Ethnic Category'),
    alt.Tooltip('%_of_total:Q', format='.2%', title='Percentage of Total'),
    alt.Tooltip('total_business_count:Q', title='Count'),
]
viz_cities_slope_circles = (
    alt.Chart(cities_pair)
    .mark_point(size=40, filled=True, opacity=1)
    .encode(
        x=alt.X(
            'city:N',
            sort=alt.Sort(selected_cities),
            axis=alt.Axis(labelAngle=0),
        ),
        y=alt.Y(
            '%_of_total:Q',
            axis=alt.Axis(format='.2p', title='Percent of Total Businesses'),
        ),
        color=alt.Color('category_name:N', legend=None),
        tooltip=circle_tooltips,
    )
    .interactive(bind_x=False)
)

In [19]:
# Hover selection: a moused-over line is drawn opaque and thicker, all other
# lines stay faded and thin. Clicking clears the selection.
selection_opacity = alt.selection_single(
    encodings=['y'],
    on='mouseover',
    clear="click",
    empty='none',
)
condition_opacity = alt.condition(selection_opacity, alt.value(1), alt.value(0.2))
condition_size = alt.condition(selection_opacity, alt.value(3), alt.value(2))

# Connecting lines of the slope plot: one line per category between the two
# selected cities.
line_tooltips = [
    alt.Tooltip('category_name:N', title='Ethnic Category'),
    alt.Tooltip('%_of_total:Q', format='.2%', title='Percentage of Total'),
    alt.Tooltip('total_business_count:Q', title='Count'),
]
viz_cities_slope_line = (
    alt.Chart(cities_pair)
    .mark_line()
    .add_selection(selection_opacity)
    .encode(
        x=alt.X(
            'city:N',
            sort=alt.Sort(selected_cities),
            axis=alt.Axis(labelAngle=0),
        ),
        y=alt.Y(
            '%_of_total:Q',
            axis=alt.Axis(format='.2p', title='Percent of Total Businesses'),
        ),
        color=alt.Color('category_name:N', legend=None),
        opacity=condition_opacity,
        size=condition_size,
        tooltip=line_tooltips,
    )
    .interactive(bind_x=False)
)

In [20]:
# Layer the end-point dots over the connecting lines, widen the combined
# chart to 600, and render it.
viz_cities_slope = (viz_cities_slope_line + viz_cities_slope_circles).properties(width = 600)
viz_cities_slope

## Food diversity index

Figure out which metric, entropy or relative entropy, best describes food diversity

In [116]:
# Compare the candidate metrics on toy distributions over n categories:
# "mono" splits all mass between two categories, "diverse" is uniform.
# Relative entropy is measured from "mono" against the uniform reference.
for n in range(3, 6):
    print(f'n: {n}')
    mono = [0] * (n - 2) + [0.5, 0.5]
    diverse = [1 / n] * n
    ent, model_ent = entropy(mono), entropy(diverse)
    relative_ent = entropy(mono, diverse)
    print(mono, ent)
    print(diverse, model_ent)
    print('relative entropy: ', relative_ent)
    print('\n')


n: 3
[0, 0.5, 0.5] 0.6931471805599453
[0.3333333333333333, 0.3333333333333333, 0.3333333333333333] 1.0986122886681096
relative entropy:  0.4054651081081644


n: 4
[0, 0, 0.5, 0.5] 0.6931471805599453
[0.25, 0.25, 0.25, 0.25] 1.3862943611198906
relative entropy:  0.6931471805599453


n: 5
[0, 0, 0, 0.5, 0.5] 0.6931471805599453
[0.2, 0.2, 0.2, 0.2, 0.2] 1.6094379124341005
relative entropy:  0.9162907318741551




Calculate entropy, relative entropy, and the Simpson index for the distribution of each city

In [121]:
def simpson_di(data):
    """Return the Simpson index of a category distribution.

    The index is the sum of squared proportions, ``sum((n / N) ** 2)``, where
    ``N`` is the total of all values. It equals 1 when a single category holds
    everything and ``1 / k`` for ``k`` equally sized categories, so a larger
    value means a more concentrated (less diverse) distribution.

    Args:
        data: Mapping from category label to its abundance. Values may be raw
            counts or fractions, ints or floats.

    Returns:
        The Simpson index as a float, or ``0.0`` when ``data`` is empty or its
        values sum to zero (no proportions can be formed).
    """
    values = list(data.values())
    total = sum(values)
    # Guard with a value comparison: the previous identity checks against the
    # literal 0 never matched floats, so all-zero float input divided by zero.
    if total == 0:
        return 0.0
    # Zero entries need no special casing; they contribute nothing to the sum.
    return sum((float(n) / total) ** 2 for n in values)


In [122]:
# Compute three diversity measures for each city's category distribution:
# Shannon entropy, relative entropy against a uniform distribution over the
# city's own categories, and the Simpson index.
diveristy_index_li = []

for city_name in df['city'].unique():
    # Per-category share of this city's businesses.
    shares = cities['%_of_total'][city_name]
    frequency = shares.to_list()
    n_categories = len(frequency)
    uniform = [1 / n_categories] * n_categories
    diveristy_index_li.append({
        'city': city_name,
        'n': n_categories,
        'entropy': entropy(frequency),
        'relative_entropy': entropy(frequency, qk=uniform),
        'simpson_index': simpson_di(shares.to_dict()),
    })

diveristy_index = pd.DataFrame(diveristy_index_li)
diveristy_index.head(5)
    

Unnamed: 0,city,n,entropy,relative_entropy,simpson_index
0,"New York, New York",73,3.029888,1.260572,0.070566
1,"Los Angeles, California",67,2.814524,1.390169,0.107121
2,"Chicago, Illinois",65,2.80615,1.368237,0.101187
3,"Houston, Texas",62,2.817223,1.309911,0.117487
4,"Philadelphia, Pennsylvania",65,2.932636,1.241751,0.084764


In [119]:
# Ten cities with the highest entropy of their category shares.
diveristy_index.sort_values(by = 'entropy', ascending = False).head(10)

Unnamed: 0,city,n,entropy,relative_entropy
50,"Ann Arbor, Michigan",57,3.465528,0.577523
20,"Seattle, Washington",68,3.080336,1.139172
22,"Washington, District of Columbia",62,3.049377,1.077757
13,"San Francisco, California",70,3.033055,1.21544
0,"New York, New York",73,3.029888,1.260572
25,"Baltimore, Maryland",63,2.998831,1.144304
9,"San Jose, California",70,2.993414,1.255081
17,"Detroit, Michigan",56,2.972681,1.05267
44,"Oakland, California",72,2.969465,1.307202
43,"Miami, Florida",58,2.951283,1.10916


In [118]:
# Ten cities with the lowest relative entropy. It is measured against a
# uniform distribution over each city's own categories, so a smaller value
# means the shares are closer to evenly spread.
diveristy_index.sort_values(by = 'relative_entropy', ascending = True).head(10)

Unnamed: 0,city,n,entropy,relative_entropy
50,"Ann Arbor, Michigan",57,3.465528,0.577523
19,"Memphis, Tennessee",33,2.6035,0.893007
42,"Raleigh, North Carolina",47,2.898236,0.951911
12,"Jacksonville, Florida",49,2.919075,0.972745
17,"Detroit, Michigan",56,2.972681,1.05267
48,"Wichita, Kansas",29,2.308267,1.059029
15,"Charlotte, North Carolina",45,2.730734,1.075928
22,"Washington, District of Columbia",62,3.049377,1.077757
40,"Colorado Springs, Colorado",39,2.572741,1.090821
38,"Virginia Beach, Virginia",39,2.572543,1.091018


In [123]:
# Ten cities with the highest Simpson index. The index is a sum of squared
# shares, so the top of this list is the most concentrated, not the most
# diverse.
diveristy_index.sort_values(by = 'simpson_index', ascending = False).head(10)

Unnamed: 0,city,n,entropy,relative_entropy,simpson_index
18,"El Paso, Texas",30,1.98542,1.415777,0.282494
6,"San Antonio, Texas",49,2.251522,1.640298,0.229269
32,"Tucson, Arizona",37,2.221421,1.389497,0.194237
33,"Fresno, California",41,2.331392,1.382181,0.183102
5,"Phoenix, Arizona",51,2.462589,1.469237,0.178172
46,"Tulsa, Oklahoma",35,2.28386,1.271489,0.166345
48,"Wichita, Kansas",29,2.308267,1.059029,0.158476
26,"Oklahoma City, Oklahoma",39,2.433553,1.230008,0.148482
11,"Indianapolis, Indiana",45,2.485808,1.320854,0.148229
10,"Austin, Texas",55,2.608053,1.39928,0.148022


## Average popularity of ethnic cuisines on choropleth map

In [21]:
# URL of the world-110m dataset exposed by vega_datasets; echoed as the
# cell output for reference.
world = data.world_110m.url
world

'https://vega.github.io/vega-datasets/data/world-110m.json'

In [30]:
# Build every possible (city, category) pairing, cities in the outer loop
# and categories in the inner one, as a two-column frame.
categories_li = list(df['category_name'].unique())
cities_li = list(df['city'].unique())
filler = pd.DataFrame(
    [
        {'city': city_name, 'category_name': category}
        for city_name in cities_li
        for category in categories_li
    ]
)
filler.head()

Unnamed: 0,city,category_name
0,"New York, New York",Afghan
1,"New York, New York",African
2,"New York, New York",Arabian
3,"New York, New York",Argentine
4,"New York, New York",Armenian


In [56]:
# Observed share of each (city, category) pair.
city_category_share = cities.reset_index()[['city', 'category_name', '%_of_total']]

# Outer-join onto the full city x category grid so that pairs with no
# observed share appear as missing values, which are then counted as 0.
all_pairs = pd.merge(
    filler,
    city_category_share,
    how='outer',
    on=['city', 'category_name'],
)

# Average share of each category across cities (zeros included).
heatmap = (
    all_pairs.fillna(0)
    .groupby('category_name')['%_of_total']
    .mean()
    .reset_index()
    .rename(columns={'%_of_total': 'Avg_%_of_total'})
)
heatmap.shape

(76, 2)

In [57]:
# Attach the country code and country name recorded for each category, then
# collapse the repeated rows produced by the per-city duplication in df.
# NOTE(review): the recorded output grows from 76 to 77 rows, so at least one
# category appears to carry two distinct code/country pairs - verify the data.
country_lookup = df[['M49_country_code', 'category_name', 'country']]
heatmap = (
    heatmap.merge(country_lookup, how='left', on='category_name')
    .drop_duplicates()
    .reset_index(drop=True)
)
heatmap.shape

(77, 4)

In [58]:
# Spot-check one randomly chosen category row.
heatmap.sample(1)

Unnamed: 0,category_name,Avg_%_of_total,M49_country_code,country
53,Nicaraguan,0.000581,558.0,Nicaragua


In [59]:
# Map dimensions as (width, height); consumed by the base map's properties.
viz_mapsize = (800, 400)

In [60]:
# Base layer: light grey country shapes with white borders in a Mercator
# projection, sized from viz_mapsize.
map_width, map_height = viz_mapsize[0], viz_mapsize[1]
world_countries = alt.topo_feature(world, 'countries')
viz_mapbase = (
    alt.Chart(world_countries)
    .mark_geoshape(fill='#eee', stroke='#fff')
    .project(type='mercator', scale=160, translate=[375, 200])
    .properties(width=map_width, height=map_height)
)

In [67]:
# Choropleth layer: look up each country shape's id in the heatmap table
# (keyed by M49 country code) and colour it by the category's average share.
heatmap_lookup = alt.LookupData(
    data=heatmap,
    key='M49_country_code',
    fields=['Avg_%_of_total', 'category_name', 'country'],
)
share_legend = alt.Legend(
    title='Average Percentage Per City',
    direction='horizontal',
    orient='bottom-right',
    gradientLength=200,
    titleAnchor='end',
    tickMinStep=0.05,
    format='.0%',
)
choropleth_tooltips = [
    alt.Tooltip('country:N', title='Country'),
    alt.Tooltip('category_name:N', title='Ethnic Category'),
    alt.Tooltip('Avg_%_of_total:Q', title='Average Percentage Per City', format='.2%'),
]
viz_choropleth = (
    viz_mapbase.mark_geoshape(stroke='#fff', strokeWidth=0.25)
    .transform_lookup(lookup='id', from_=heatmap_lookup)
    .encode(
        alt.Color(
            'Avg_%_of_total:Q',
            scale=alt.Scale(clamp=True, scheme='tealblues'),
            legend=share_legend,
        ),
        tooltip=choropleth_tooltips,
    )
)

In [68]:
# Draw the coloured choropleth over the grey base map, with a white view
# border.
(viz_mapbase + viz_choropleth ).configure_view(stroke = '#fff')