# Compare two datasets

Let's compare two estimates of pedestrian volume obtained with different methods: field counts vs. CNN-based estimates.

In [110]:
import pandas as pd
import statsmodels.api as sm
import altair as alt
import scipy
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.graphics.api as smg
import itertools

# Lift Altair's default cap on the number of rows a chart's dataset may hold
# (exceeding it raises MaxRowsError), so frames of any size can be plotted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [111]:
# Read the per-location count table and key it by its `loc` identifier.
# NOTE(review): the rendered header shows a column spelled `filed_all_3y`
# (not `field_all_3y`), so a `field_*` pattern will not pick it up.
df = pd.read_csv("../data/dataframe/df_count.csv").set_index('loc')
df

Unnamed: 0_level_0,field_am_1y,field_md_1y,field_pm_1y,field_all_1y,field_am_3y,field_md_3y,field_pm_3y,filed_all_3y,cnn_count,cnn_sum,mayor_index,borough
loc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1271.0,2933.0,4483.0,3220.6,1370.6,2814.833333,4512.666667,2989.294118,43.0,58.994938,N,Bronx
2,1749.0,1880.0,4935.5,3076.0,2029.6,2265.666667,5158.166667,3217.117647,5.0,11.921850,Y,Bronx
3,2209.0,6639.0,9282.5,6810.4,2502.2,7508.333333,10915.833333,7238.588235,97.0,166.952693,Y,Bronx
4,1648.0,1508.0,2787.0,2047.6,2061.8,1483.500000,2974.666667,2179.882353,8.0,17.379551,N,Bronx
5,1016.0,2513.0,3771.0,2716.8,1034.4,2504.000000,3588.833333,2454.647059,39.0,67.678092,N,Bronx
...,...,...,...,...,...,...,...,...,...,...,...,...
110,428.0,195.5,505.5,366.0,355.6,227.000000,497.166667,360.176471,1.0,2.194373,N,Harlem River Bridges
111,91.0,132.0,379.5,222.8,181.8,187.000000,354.333333,244.529412,4.0,7.764933,N,Harlem River Bridges
112,111.0,181.5,209.0,178.4,164.2,161.500000,291.333333,208.117647,3.0,6.747879,N,Harlem River Bridges
113,6.0,5.0,14.5,9.0,12.0,11.333333,24.500000,16.176471,1.0,3.434210,N,Harlem River Bridges


## Correlation matrix
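Let's compute the Pearson correlation between every field-count column (`field_am_3y`, `field_md_3y`, `field_pm_3y`) and every CNN column (`cnn_count`, `cnn_sum`).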

In [112]:
# Pearson correlation for every (field-count column, CNN column) pairing,
# collected in long form: one row per pairing.
counts_cols = df.filter(regex='field_(am|md|pm)_3y').columns.tolist()
cnn_cols = df.filter(regex='cnn_(count|sum)').columns.tolist()
corrs = [
    {
        'field': field_col,
        'cnn': cnn_col,
        # pearsonr returns (statistic, p-value); only the statistic is kept.
        'corr': scipy.stats.pearsonr(df[field_col], df[cnn_col])[0],
    }
    for field_col, cnn_col in itertools.product(counts_cols, cnn_cols)
]
df_corr = pd.DataFrame(corrs)
df_corr

Unnamed: 0,field,cnn,corr
0,field_am_3y,cnn_count,0.674536
1,field_am_3y,cnn_sum,0.685109
2,field_md_3y,cnn_count,0.8177
3,field_md_3y,cnn_sum,0.824872
4,field_pm_3y,cnn_count,0.814208
5,field_pm_3y,cnn_sum,0.823816


In [122]:
def draw_regplot(df, col1, col2):
    """Scatter plot of ``col2`` against ``col1`` with a regression line and Pearson's r.

    Args:
        df: Table holding both columns. It is handed to ``alt.Chart`` and
            indexed as ``df[col1]`` / ``df[col2]`` for the correlation.
        col1: Column drawn on the x axis; the regression's independent variable.
        col2: Column drawn on the y axis; the regression's dependent variable.

    Returns:
        A layered Altair chart drawn in this order: the points, the
        "Pearson's r" label, and the regression line.
    """
    chart = alt.Chart(df).mark_circle().encode(
        alt.X(col1),
        alt.Y(col2),
    ).properties(width=300, height=300)

    # Regression of col2 on col1, fitted on the same data as the points.
    line = chart.transform_regression(col1, col2).mark_line()

    # pearsonr returns (statistic, p-value); only the statistic is shown.
    corr = scipy.stats.pearsonr(df[col1], df[col2])[0]

    # The label gets its own one-row dataset. Deriving it from `chart` would
    # keep the full dataset and draw the same text once per row, stacking
    # identical glyphs on top of each other.
    label_data = pd.DataFrame({'label': [f'Pearson\'s r = {corr:.3f}']})
    params = alt.Chart(label_data).mark_text(align='left').encode(
        x=alt.value(20),  # pixels from left
        y=alt.value(20),  # pixels from top
        text='label:N',
    )

    return chart + params + line

In [123]:
# One row per field-count column (PM, midday, AM); in each row the left panel
# uses `cnn_sum` and the right panel uses `cnn_count` on the x axis.
pm_row, md_row, am_row = (
    draw_regplot(df, 'cnn_sum', field_col) | draw_regplot(df, 'cnn_count', field_col)
    for field_col in ('field_pm_3y', 'field_md_3y', 'field_am_3y')
)
pm_row & md_row & am_row

## Comparison across aggregation methods


In [143]:
# Bars of the correlation values: one panel per field-count column,
# one bar (and colour) per CNN column.
(
    alt.Chart(df_corr)
    .mark_bar()
    .encode(
        alt.X('cnn:N'),
        alt.Y('corr:Q'),
        alt.Color('cnn:N'),
        alt.Column('field:N'),
    )
    .properties(width=100, height=300)
)

## Region

In [133]:
def draw_regplot_c(df, col1, col2):
    """Scatter plot coloured by borough, with a regression line and Pearson's r.

    Args:
        df: Table holding ``col1``, ``col2`` and a ``borough`` column. It is
            handed to ``alt.Chart`` and indexed as ``df[col1]`` / ``df[col2]``
            for the correlation.
        col1: Column drawn on the x axis; the regression's independent variable.
        col2: Column drawn on the y axis; the regression's dependent variable.

    Returns:
        A layered Altair chart drawn in this order: the borough-coloured
        points, the "Pearson's r" label, and the regression line fitted over
        all rows.
    """
    # Shared axes and size; the colour encoding is added to the points only.
    base = alt.Chart(df).encode(
        alt.X(col1),
        alt.Y(col2),
    ).properties(width=300, height=300)

    points = base.mark_circle().encode(alt.Color('borough', type='nominal'))

    # Fitted over all rows (no groupby). The regression output carries no
    # `borough` field, so the line is kept off the colour encoding.
    line = base.transform_regression(col1, col2).mark_line()

    # pearsonr returns (statistic, p-value); only the statistic is shown.
    corr = scipy.stats.pearsonr(df[col1], df[col2])[0]

    # The label gets its own one-row dataset. Deriving it from the point chart
    # would draw the same text once per row (and colour it by borough).
    label_data = pd.DataFrame({'label': [f'Pearson\'s r = {corr:.3f}']})
    params = alt.Chart(label_data).mark_text(align='left').encode(
        x=alt.value(20),  # pixels from left
        y=alt.value(20),  # pixels from top
        text='label:N',
    )

    return points + params + line

In [140]:
# Midday field counts against the CNN sum, points coloured by borough.
draw_regplot_c(df, col1='cnn_sum', col2='field_md_3y')

The outliers are mostly in Manhattan.

### Without Manhattan

In [139]:
# Same plot restricted to the rows whose borough is not Manhattan.
outside_manhattan = df['borough'].ne('Manhattan')
draw_regplot_c(df.loc[outside_manhattan], 'cnn_sum', 'field_md_3y')