# Assignment 3 

# Table of Contents
## Overview
1. Where is 307?

## Data Exploration
1. People's Behavior in terms of Dwell Time 
2. Which areas of 307 do people pass through?
3. Where do people tend to linger?
4. How does dwell time change over time?

## In-depth Analysis
1. How do different zones affect people's behavior?
2. How do events affect people's behavior?
3. What is the best maintenance strategy?
4. What other factors affect people's behavior?

# About 307

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg 

In [2]:
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
#init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(colorscale='plotly', world_readable=True)

# Extra options
# pd.options.display.max_rows = 30
# pd.options.display.max_columns = 25

# Show all code cells outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import os
from IPython.display import Image, display, HTML

In [3]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [4]:
# store login data in login.py
%run login.py

In [5]:
# GraphQL logIn mutation, written as a multi-line f-string.
# `login` and `pwd` must already be defined (they are loaded by the
# `%run login.py` cell above). Doubled braces are literal braces in
# the resulting query text.

loginquery = f"""
mutation {{
  logIn(
      email:"{login}",
      password:"{pwd}") {{
    jwt {{
      token
      exp
    }}
  }}
}}
"""

In [6]:
import requests
# GraphQL endpoint that every query in this notebook is posted to
url = 'https://api.numina.co/graphql'

# Send the login mutation; the JSON body of this response carries the JWT
mylogin = requests.post(url, json={'query': loginquery})
# mylogin

In [7]:
# JWT token string from the login response; passed as the 'Authorization' header on later requests
token = mylogin.json()['data']['logIn']['jwt']['token']

In [8]:
# NOTE(review): despite its name, expdate holds the whole decoded login
# response, not only the 'exp' field requested in the mutation — confirm
# which one was intended.
expdate = mylogin.json()
# expdate

# Explore the Data!

Now that you've been provided with the context, before we present our analysis, it's time for YOU to explore the data! As mentioned, the following are the full areas covered by the three cameras:

Streetscape | Under Raincoat | Outside
------------- | -------------  | -------------
![alt](streetscape_sandbox.png) | ![alt](underraincoat_sandbox.png) | ![alt](outside_sandbox.png)

As you see in the above images, each area essentially consists of two parts: objects such as tables and chairs, and empty spaces presumably for walking. Based on this reasoning, we have defined the following smaller behaviour zones so as to perform more in-depth research:

### Streetscape ###

Chair Zone | Path Zone | Activity Zone
------------- | -------------  | -------------
![alt](BehaviorZoneImage/Streetscape-ChairZone.png) | ![alt](BehaviorZoneImage/Streetscape-PathZone.png) | ![alt](BehaviorZoneImage/Streetscape-ActivityZone.png)

### Under Raincoat ###

Chair Zone | Traffic Zone | Activity Zone
------------- | -------------  | -------------
![alt](BehaviorZoneImage/UnderRaincoat-ChairZone.png) | ![alt](BehaviorZoneImage/UnderRaincoat-TrafficZone.png) | ![alt](BehaviorZoneImage/UnderRaincoat-ActivityZone.png)

### Outside ###

Chair Zone | Path Zone | -
------------- | -------------  | -------------
![alt](BehaviorZoneImage/Outside-ChairZone.png) | ![alt](BehaviorZoneImage/Outside-PathZone.png) | ![alt](blank.png)

In [9]:
os.listdir('BehaviorZoneImage/')

['Outside-ChairZone.png',
 'Outside-PathZone.png',
 'Streetscape-ActivityZone.png',
 'Streetscape-ChairZone.png',
 'Streetscape-PathZone.png',
 'UnderRaincoat-ActivityZone.png',
 'UnderRaincoat-ChairZone.png',
 'UnderRaincoat-TrafficZone.png']

In [10]:
# Sensor serial number -> display name of the area that sensor covers
device_dict = {
    'SWLSANDBOX1': 'Streetscape',
    'SWLSANDBOX2': 'Under Raincoat',
    'SWLSANDBOX3': 'Outside',
}
# Parallel lists, in the dict's insertion order
device_ids = [*device_dict]
device_names = [*device_dict.values()]

In [11]:
def get_zones(device_id):
    '''
    Fetch the behaviour zones of one sensor.

    Posts a `behaviorZones` GraphQL query for `device_id` to the
    module-level `url`, authorised with the module-level `token`.
    Returns a DataFrame with one row per returned node (the query asks
    for `rawId` and `text`) plus a `device` column set to `device_id`.
    '''
    query_template = """
    query {{
      behaviorZones (
        serialnos: "{0}"
        ) {{
        count
        edges {{
          node {{
            rawId
            text
          }}
        }}
      }}
    }}
    """
    payload = {'query': query_template.format(device_id)}
    response = requests.post(url, json=payload, headers={'Authorization': token})

    # Each edge wraps a single node dict; unwrap them into table rows
    edges = response.json()['data']['behaviorZones']['edges']
    zone_frame = pd.DataFrame([edge['node'] for edge in edges])
    zone_frame['device'] = device_id
    return zone_frame

In [12]:
# Gather the zones of every configured device (driven by device_ids rather
# than a hard-coded count, so adding a device needs no change here).
zones_df = pd.concat([get_zones(device_id) for device_id in device_ids])
# Keep only zones whose label starts with 'x-'; na=False drops unlabeled
# zones. Copy so later column assignments act on an independent frame
# instead of a slice of the concatenated one.
zones_df = zones_df[zones_df.text.str.startswith('x-', na=False)].copy()

In [13]:
# Strip only the leading 'x-' marker. The anchored pattern leaves any
# later "x-" inside a label (e.g. "...box-zone") untouched, and assign()
# rebinds a new frame instead of writing into a filtered slice.
zones_df = zones_df.assign(text=zones_df['text'].str.replace(r'^x-', '', regex=True))
zones_df

Unnamed: 0,rawId,text,device
3873,14895,streetscape-corridorzone,SWLSANDBOX1
3874,14896,streetscape-chairzone,SWLSANDBOX1
3875,14897,streetscape-freezone,SWLSANDBOX1
3925,14898,underraincoat-trafficzone,SWLSANDBOX2
3926,14899,underraincoat-freezone,SWLSANDBOX2
3927,14900,underraincoat-chairzone1,SWLSANDBOX2
3928,14901,underraincoat-chairzone2,SWLSANDBOX2
3929,14902,underraincoat-chairzone3,SWLSANDBOX2
3994,14893,outside-chairzone,SWLSANDBOX3
3995,14894,outside-pathzone,SWLSANDBOX3


In [14]:
def get_dwell(func, ID, interval):
    '''
    Fetch pedestrian dwell-time distributions for one device or one zone.

    func is either 'feedDwellTimeDistribution' (ID is a device serial
    number) or 'zoneDwellTimeDistribution' (ID is a zone id); any other
    value is treated like the zone case.
    interval is the aggregation interval passed to the API (e.g. '1d').

    The query window is fixed to 2019-02-20 .. 2020-01-12 in the
    America/New_York timezone. Returns a DataFrame with one row per
    returned node and an extra 'device' or 'zone' column holding ID.
    '''
    is_feed = func == 'feedDwellTimeDistribution'
    # Feed queries select by quoted serial number, zone queries by bare zone id
    if is_feed:
        selector = 'serialnos: "{0}"'.format(ID)
    else:
        selector = 'zoneIds: {0}'.format(ID)

    query_template = """
    query {{
        {0}(
        {1},
        startTime: "2019-02-20T00:00:00",
        endTime: "2020-01-12T00:00:00",
        timezone: "America/New_York",
        objClasses: ["pedestrian"],
        interval: "{2}"
        ){{
        edges {{
          node {{
            time
            objClass
            pct100
            pct75
            pct50
            pct25
            mean
            count
          }}
        }}
      }}
    }}
    """
    response = requests.post(url,
                             json={'query': query_template.format(func, selector, interval)},
                             headers={'Authorization': token})

    # The result is keyed by the queried function name; unwrap each edge's node
    nodes = [edge['node'] for edge in response.json()['data'][func]['edges']]
    result = pd.DataFrame(nodes)
    result['device' if is_feed else 'zone'] = ID
    return result

In [15]:
# Daily dwell-time distribution for every configured device, stacked into
# one frame (iterates device_ids directly instead of a hard-coded range(3)).
feed_dwell_1d_df = pd.concat([get_dwell('feedDwellTimeDistribution', device_id, '1d')
                              for device_id in device_ids])

In [16]:
# feed_dwell_1d_df[feed_dwell_1d_df['count']!=0].head()

In [17]:
# Daily dwell-time distribution for every zone kept in zones_df, stacked into one frame
zone_dwell_1d_df = pd.concat([get_dwell('zoneDwellTimeDistribution', z, '1d')
                             for z in zones_df['rawId'].values])

In [18]:
## zone_dwell_1d_df[zone_dwell_1d_df['count']!=0]

In [19]:
'''
def extract_time(df):
    df['year'] = df['time'].str[:4].astype(int)
    df['month'] = df['time'].str[5:7].astype(int)
    df['day'] = df['time'].str[8:10].astype(int)
    df['date'] = pd.to_datetime(df['time'].str[:10])
    df['hour'] = df['time'].str[11:13].astype(int)
    return df.drop('time', axis=1)
''';

In [20]:
'''
feed_dwell_df = extract_time(feed_dwell_df)
zone_dwell_df = extract_time(zone_dwell_df)
''';

In [21]:
# replace NaN with 0 in every column of both daily tables
# NOTE(review): a filled 0 is indistinguishable from a real dwell time of 0
# in later statistics — confirm this is intended.
feed_dwell_1d_df = feed_dwell_1d_df.fillna(0)
zone_dwell_1d_df = zone_dwell_1d_df.fillna(0)

In [22]:
# Drop the last six characters of each time string (presumably a UTC
# offset such as "-05:00" — TODO confirm) and parse the rest as a Timestamp.
for frame in (feed_dwell_1d_df, zone_dwell_1d_df):
    frame['time'] = frame['time'].str[:-6].apply(pd.Timestamp)
# Zone ids are handled as strings from here on
zone_dwell_1d_df['zone'] = zone_dwell_1d_df['zone'].astype(str)

In [23]:
# Attach a readable name next to each raw identifier
feed_dwell_1d_df['device_name'] = [device_dict[serial] for serial in feed_dwell_1d_df['device']]

# Zone ids are compared as strings, so cast before building the id -> label lookup
zones_df['rawId'] = zones_df['rawId'].astype(str)
zone_dict = dict(zip(zones_df['rawId'], zones_df['text']))
zone_dwell_1d_df['zone_name'] = [zone_dict[zone_id] for zone_id in zone_dwell_1d_df['zone']]

In [40]:
def get_df(groupby):
    '''
    Return a copy of the daily dwell table that matches `groupby`:
    the per-device table for 'device_name', the per-zone table otherwise.
    '''
    source = feed_dwell_1d_df if groupby == 'device_name' else zone_dwell_1d_df
    return source.copy()

In [25]:
# streetscape, under raincoat, outside
device_clrs = ['royalblue', 'firebrick', 'forestgreen']

In [53]:
# One plot colour per zone, assigned by row position in zones_df.
# The list has exactly ten entries, so zones_df must have ten rows
# in the expected order for this assignment to succeed.
zones_df['colour'] = ['blue', 'lightblue', 'cadetblue',
                      'orangered', 'darksalmon', 'lemonchiffon', 'peachpuff', 'lightcoral', 
                      'palegreen', 'lightgreen']

In [49]:
def plot_timeline(groupby, metric):
    '''
    Draw one time-series line per device or per zone.

    groupby is either 'device_name' or 'zone_name';
    metric is a value in ['count', 'mean', 'pct100', 'pct75', 'pct50', 'pct25']

    The figure is shown directly; nothing is returned.
    '''
    df = get_df(groupby)

    # Series labels and their line colours, kept in matching order
    if groupby == 'device_name':
        byvals, clrs = device_names, device_clrs
    else:
        byvals, clrs = list(zones_df.text), list(zones_df.colour)

    fig = go.Figure()
    for byval, clr in zip(byvals, clrs):
        sub_df = df[df[groupby] == byval]
        fig.add_trace(go.Scatter(x=sub_df.time, y=sub_df[metric], line_color=clr, name=byval))

    # 'count' is a number of pedestrians; every other metric is a dwell-time statistic
    if metric != 'count':
        title = f"pedestrian dwell time ({metric}) grouped by '{groupby}'"
    else:
        title = f"pedestrian count grouped by '{groupby}'"

    fig.update_layout(
        title=title,
        xaxis_title="time",
        yaxis_title=metric,
        xaxis_rangeslider_visible=True
    )

    fig.show()
    

In [55]:
# Interactive timeline: radio buttons pick the grouping, the dropdown picks the metric
_ = interact(plot_timeline, 
             groupby=widgets.RadioButtons(options=['device_name', 'zone_name'], value='device_name'),
             metric=widgets.Dropdown(options=['count', 'mean', 'pct100', 'pct75', 'pct50', 'pct25'], value='mean')
            )

interactive(children=(RadioButtons(description='groupby', options=('device_name', 'zone_name'), value='device_…

The days and devices with the highest mean dwell time are the following:
1. Streetscape: 2019-08-17
2. Streetscape: 2019-05-04
3. Outside: 2019-11-14

In [56]:
feed_dwell_1d_df[feed_dwell_1d_df['mean'] >= 140].sort_values('mean', ascending=False)

Unnamed: 0,count,mean,objClass,pct100,pct25,pct50,pct75,time,device,device_name
178,75,343.66,pedestrian,36.02,3.65,3.65,10.36,2019-08-17,SWLSANDBOX1,Streetscape
73,128,340.78,pedestrian,46.36,6.8,6.8,15.13,2019-05-04,SWLSANDBOX1,Streetscape
267,605,145.88,pedestrian,13.49,3.67,3.67,6.25,2019-11-14,SWLSANDBOX3,Outside


In [31]:
feed_dwell_1d_df.groupby('device')['count'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
device,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SWLSANDBOX1,326.0,1161.288344,1552.170562,0.0,111.5,615.0,1721.75,14922.0
SWLSANDBOX2,326.0,129.0,201.519001,0.0,0.0,76.0,180.5,2047.0
SWLSANDBOX3,326.0,311.496933,470.222891,0.0,56.25,196.0,414.5,5625.0


In [60]:
# Total dwell time per row = mean dwell time x number of pedestrians counted
zone_dwell_1d_df['total'] = zone_dwell_1d_df['mean'] * zone_dwell_1d_df['count'] 
feed_dwell_1d_df['total'] = feed_dwell_1d_df['mean'] * feed_dwell_1d_df['count'] 

In [76]:
# Column sums per zone and per device, stacked into one table keyed by 'name'.
# numeric_only=True keeps string/timestamp columns out of the sum, and
# pd.concat replaces the deprecated DataFrame.append.
zone_totals = zone_dwell_1d_df.groupby('zone_name').sum(numeric_only=True)\
                              .reset_index(drop=False)\
                              .rename(columns={'zone_name': 'name'})
device_totals = feed_dwell_1d_df.groupby('device_name').sum(numeric_only=True)\
                                .reset_index(drop=False)\
                                .rename(columns={'device_name': 'name'})
# Zone rows first, then device rows; original row labels are kept, as append did
grouped_df = pd.concat([zone_totals, device_totals], sort=False)

In [78]:
[s[-1] for s in grouped_df.name.str.split('-')]
[n[1:5]=='utsi' for n in grouped_df.name]

['chairzone',
 'pathzone',
 'chairzone',
 'corridorzone',
 'freezone',
 'chairzone1',
 'chairzone2',
 'chairzone3',
 'freezone',
 'trafficzone',
 'Outside',
 'Streetscape',
 'Under Raincoat']

[True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False]

In [129]:
from plotly.subplots import make_subplots

def plot_barplot(metric):
    '''
    Bar chart, one subplot per device, showing each behaviour zone (and the
    device itself) as a percentage of the device-wide value.

    metric is either 'count' or 'total dwell time'; its first word
    ('count' or 'total') selects the column of grouped_df that is plotted.

    The figure is shown directly; nothing is returned.
    '''
    fig = make_subplots(rows=1, cols=len(device_names), shared_yaxes=True)

    df = grouped_df.copy()
    m = metric.split(' ')[0]

    for col, dname in enumerate(device_names, start=1):
        # Device-wide value as a plain scalar, used as the 100% reference
        total = df.loc[df.name == dname, m].iloc[0]

        # Rows whose name shares characters 1-4 with the device name: this
        # matches the device row and its zones regardless of the first
        # letter's case (e.g. 'Outside' and 'outside-chairzone').
        key = dname[1:5]
        sub_df = df[[n[1:5] == key for n in df.name]].copy()
        # Keep only the part after the last '-' as the bar label
        sub_df['name'] = [s[-1] for s in sub_df.name.str.split('-')]
        sub_df['perc'] = sub_df[m] / total * 100

        fig.add_bar(x=sub_df.name, y=sub_df['perc'], name=dname, row=1, col=col)

    fig.update_yaxes(ticksuffix="%", col=1)

    fig.update_layout(title=f"% of individual behaviour zones w.r.t. the big area in terms of {metric}")

    fig.show()

In [132]:
_ = interact(plot_barplot, metric=widgets.RadioButtons(options=['count', 'total dwell time'], value='count'))

interactive(children=(RadioButtons(description='metric', options=('count', 'total dwell time'), value='count')…

In [32]:
def boxplot_dwell(groupby, column, bound_factor):
    '''
    Box plot of `column` per device or zone after trimming outliers.

    groupby is either 'device_name' or 'zone_name';
    column is the column to plot ('count' gets a count title, anything
    else is titled as mean dwell time);
    bound_factor is the IQR multiple: rows with values outside
    [Q1 - bound_factor*IQR, Q3 + bound_factor*IQR] are removed. The
    quartiles are computed over the whole table, not per group.
    '''
    data = get_df(groupby)
    values = data[column]

    upper_q = values.quantile(0.75)
    lower_q = values.quantile(0.25)
    spread = upper_q - lower_q
    within = (values >= lower_q - spread*bound_factor) & (values <= upper_q + spread*bound_factor)
    kept = data[within]

    subject = "count" if column == 'count' else "mean dwell time"
    title = (f"distribution of {subject} grouped by '{groupby}'"
             f" with values {bound_factor} * IQR beyond Q1/Q3 removed")

    fig = px.box(kept, x=groupby, y=column, points="all", title=title)
    fig.show()

In [33]:
# Interactive box plot. Each keyword below must match a parameter of
# boxplot_dwell, so the default grouping is set inside the RadioButtons
# widget rather than passed to interact as a stray `value` argument.
_ = interact(boxplot_dwell,
             groupby=widgets.RadioButtons(options=['device_name', 'zone_name'], value='device_name'),
             column=widgets.RadioButtons(options=['count', 'mean'], value='count'),
             bound_factor=widgets.FloatSlider(
                 value=1.5,
                 min=-3,
                 max=10,
                 step=0.1,
                 disabled=False,
                 continuous_update=False,
                 orientation='horizontal',
                 readout=True,
                 readout_format='.1f')
            )

interactive(children=(RadioButtons(description='groupby', options=('device_name', 'zone_name'), value='device_…

### Obtain heatmap for pedestrians

In [34]:
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta
import calendar
# Heatmaps are requested one window at a time from START_DATE (inclusive)
# up to END_DATE (exclusive); time_delta is the length of each window (one day).
START_DATE = datetime(2019, 2, 20, 0, 0, 0)
END_DATE = datetime(2019, 3, 20, 0, 0, 0)
time_delta = relativedelta(days = +1)

In [35]:
import pandas as pd
heatmap_df = pd.DataFrame(columns = ['startTime', 'endTime', 'heatMap'])

In [36]:
def heatmap_query_gen(startTime: str, endTime: str, serialno: str = "SWLSANDBOX1") -> str:
    '''
    Build the GraphQL `feedHeatmaps` query text for pedestrians.

    startTime and endTime are inserted verbatim as the query's time window
    (the calling cell formats them as '%Y-%m-%dT%H:%M:%S');
    serialno is the device serial number to query and defaults to
    "SWLSANDBOX1", the device this function was originally hard-wired to.

    The timezone is fixed to America/New_York. Returns the query string.
    '''
    # Doubled braces are literal braces in the GraphQL text
    heatmap_query = """
query {{
  feedHeatmaps(
    serialno: "{2}",
    startTime:"{0}",
    endTime:"{1}",
    objClasses:["pedestrian"],
    timezone:"America/New_York") {{
    edges {{
      node {{
        time
        objClass
        heatmap
      }}
    }}
  }}
}}
""".format(startTime, endTime, serialno)
    return heatmap_query

In [37]:
# Request one heatmap per window from START_DATE up to (not including)
# END_DATE and add the results to heatmap_df.
heatmap_frames = []
current_date = START_DATE
while current_date < END_DATE:
    end_time = current_date + time_delta
    start_time_str = current_date.strftime('%Y-%m-%dT%H:%M:%S')
    end_time_str = end_time.strftime('%Y-%m-%dT%H:%M:%S')
    heatmap_data = requests.post(url, json={'query': heatmap_query_gen(start_time_str, end_time_str)},
                                 headers={'Authorization': token})
    heatmap_json = heatmap_data.json()

    # Any level of the response may be missing or null (e.g. an error reply or
    # a window without data); treat all of those as "no heatmap for this window".
    edges = ((heatmap_json.get('data') or {}).get('feedHeatmaps') or {}).get('edges') or []
    heatmap = edges[0]['node']['heatmap'] if edges else None
    if heatmap:
        # The scalar start/end times are repeated for every heatmap entry
        heatmap_frames.append(
            pd.DataFrame({"startTime": current_date, "endTime": end_time, 'heatMap': heatmap}))
    current_date = end_time

# Concatenate once at the end instead of calling the deprecated
# DataFrame.append on every iteration.
if heatmap_frames:
    heatmap_df = pd.concat([heatmap_df] + heatmap_frames, ignore_index=True)

In [38]:
# Collapse to one row per (startTime, endTime) window, collecting that window's heatMap entries into a list
ed_heatmap_df = heatmap_df.groupby(['startTime', 'endTime'])['heatMap'].apply(list).reset_index(name='heatMapMatrix')

In [39]:
from IPython.display import display
def plot_heatmap(start_time):
    '''
    Overlay the pedestrian heatmap of one day on the Streetscape image.

    start_time is the day to show: anything pd.Timestamp accepts (a
    Timestamp, datetime or date, as delivered by the date picker), or
    None when no date is selected.

    Looks the day up in ed_heatmap_df by 'startTime'. Each heatmap entry is
    read as [0] -> x, [1] -> y, [2] -> colour value. Prints a message
    instead of plotting when there is no date or no data for it.
    '''
    if start_time is None:
        print("No date selected.")
        return

    # Normalise so a plain date from the picker compares equal to the stored timestamps
    day = pd.Timestamp(start_time)
    matches = ed_heatmap_df.loc[ed_heatmap_df['startTime'] == day, 'heatMapMatrix']
    if matches.empty:
        print("No heatmap data available for {0}".format(day.date()))
        return

    map_img = mpimg.imread('streetscape_sandbox.png')
    matrix = matches.iloc[0]
    x = [i[0] for i in matrix]
    y = [i[1] for i in matrix]
    z = [i[2] for i in matrix]
    fig, ax = plt.subplots(figsize=(15,10))
    ax.scatter(x, y, c=z, s=10, cmap=plt.cm.Wistia) # Other color maps: plt.cm.cmap_d.keys())
    ax.imshow(map_img, aspect='auto')
    plt.axis('off')
    # fontsize is an argument of plt.title, not of str.format
    plt.title("Heatmap for date {0}".format(start_time), fontsize=20)
    plt.show()
interact(plot_heatmap, start_time=widgets.DatePicker(value = pd.to_datetime('2019-02-26'), description='Pick a Date'))

interactive(children=(DatePicker(value=Timestamp('2019-02-26 00:00:00'), description='Pick a Date'), Output())…

<function __main__.plot_heatmap(start_time)>