# Filtering Stop Regions

In [1]:
import os
# Run the notebook from the project root; presumably this is what lets the
# `src.*` imports in the next cell resolve — confirm.
# The root can be overridden with the MDC_ANALYSIS_DIR environment variable so
# the notebook is not tied to a single machine; the default is the original path.
project_root = os.environ.get("MDC_ANALYSIS_DIR", "/home/tales/dev/master/mdc_analysis/")
os.chdir(project_root)
print("working dir", os.getcwd())

working dir /home/tales/dev/master/mdc_analysis


In [161]:
import pandas as pd
import time

from src.dao import csv_dao
from src.dao import objects_dao
from src.plot.basic_plot import plot_result_multi_line, plot_result
from src.taxonomy.category_mapping import tags_to_categ

from bokeh.io import output_notebook, show
from bokeh.palettes import Category20c

In [27]:
# Direct bokeh output to the notebook.
output_notebook()

# Raise the pandas display limits so wide frames are shown without truncation.
for option_name, option_value in (('display.max_columns', 500),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

## Loading User Data

In [3]:
def minutes_hour(m):
    """Convert a duration expressed in minutes into hours.

    Parameters
    ----------
    m : number
        Duration in minutes.

    Returns
    -------
    float
        ``m`` multiplied by ``1 / 60``.
    """
    hours_per_minute = 1 / 60
    return m * hours_per_minute

In [4]:
# Category sequences keyed by the minimum Stop Region stay time, in minutes.
filtereds = {}

stay_times_minutes = [5, 10, 15, 20, 25, 30, 45, 60]
last_time = time.time()

for m in stay_times_minutes:
    # The loader takes its threshold in hours.
    h = minutes_hour(m)

    # Only the "filtered" entry of the loader's result is used here.
    loaded = objects_dao.load_users_tags_sequence(sr_stay_time_above_h=h)
    tags_sequences = loaded["filtered"]

    # tags_to_categ returns a pair; the second element is what gets stored.
    categs_sequences_as_lists, categs_sequences = tags_to_categ(tags_sequences, verbose=False)
    filtereds[m] = categs_sequences

    # Report the number of entries loaded and the wall-clock time of this iteration.
    print("{}m, {}h".format(m, h), len(filtereds[m]))
    print("load time:", round(time.time() - last_time, 2))
    print()
    last_time = time.time()

    

Loading Stop Region Group data
Building Stop Region Group sequence
5m, 0.08333333333333333h 155
load time: 605.97

Loading Stop Region Group data
Building Stop Region Group sequence
10m, 0.16666666666666666h 154
load time: 328.35

Loading Stop Region Group data
Building Stop Region Group sequence
15m, 0.25h 153
load time: 295.97

Loading Stop Region Group data
Building Stop Region Group sequence
20m, 0.3333333333333333h 153
load time: 281.12

Loading Stop Region Group data
Building Stop Region Group sequence
25m, 0.4166666666666667h 153
load time: 263.89

Loading Stop Region Group data
Building Stop Region Group sequence
30m, 0.5h 152
load time: 277.83

Loading Stop Region Group data
Building Stop Region Group sequence
45m, 0.75h 148
load time: 280.87

Loading Stop Region Group data
Building Stop Region Group sequence
60m, 1.0h 146
load time: 369.9



# Sizes

In [5]:
# One record per (stay time, user) holding the length of that user's sequence.
size_records = [
    {"stay_time": stay_time, "user_id": user_id, "seq_size": len(filtereds[stay_time][user_id])}
    for stay_time in filtereds
    for user_id in filtereds[stay_time].keys()
]

sizes = pd.DataFrame(size_records)

In [14]:
# Median sequence length across users for each stay-time threshold.
seq_size_by_stay_time = sizes.groupby("stay_time")["seq_size"]
seq_medians = seq_size_by_stay_time.median()
seq_medians

stay_time
5     241.0
10    184.0
15    158.0
20    141.0
25    129.0
30    116.0
45     94.5
60     84.0
Name: seq_size, dtype: float64

In [117]:
# x: stay-time thresholds (minutes); y: median sequence size at each threshold.
seq_stay_times = seq_medians.index.tolist()
seq_median_values = seq_medians.tolist()

p = plot_result(seq_stay_times,
                seq_median_values,
                "Stop Region stay time",
                "sequence size median",
                color="darkblue",
                title="Sequence medians among users over some Stop Region stay times")
show(p)

# Diversity

In [113]:
# One record per (stay time, user) describing how varied the user's category
# sequence is.
diversity_records = []

for stay_time in filtereds.keys():
    for user_id in filtereds[stay_time].keys():
        categs_sequence = filtereds[stay_time][user_id]

        # Distinct categories, keeping the first occurrence of each; computed
        # once and reused for both the list and its size.
        distinct_categs = pd.Series(categs_sequence).drop_duplicates().tolist()

        diversity_records.append({"stay_time": stay_time,
                                  "user_id": user_id,
                                  "diversity_size": len(distinct_categs),
                                  "categs_diversity": distinct_categs,
                                  # Fix: this field used to hold the sequence itself
                                  # (a duplicate of "categs_sequence") instead of its length.
                                  "sequence_size": len(categs_sequence),
                                  "categs_sequence": categs_sequence})

diversities = pd.DataFrame(diversity_records)

In [114]:
# Median number of distinct categories across users for each stay-time threshold.
diversity_size_by_stay_time = diversities.groupby("stay_time")["diversity_size"]
uni_medians = diversity_size_by_stay_time.median()
uni_medians

stay_time
5     22
10    20
15    19
20    18
25    17
30    16
45    15
60    14
Name: diversity_size, dtype: int64

In [118]:
# x: stay-time thresholds (minutes); y: median diversity size at each threshold.
diversity_stay_times = uni_medians.index.tolist()
diversity_median_values = uni_medians.tolist()

p = plot_result(diversity_stay_times,
                diversity_median_values,
                "Stop Region stay time",
                "Diversity size MEDIAN",
                color="darkgreen",
                title="Diversity size among users over some Stop Region stay times")
show(p)

# Frequency

In [180]:
# Summing a column of lists concatenates them, giving one pooled category list
# per stay time. This is done once and shared by both tables below (it used to
# be recomputed for each of them).
categs_by_stay_time = diversities.groupby("stay_time")["categs_sequence"].sum()


def _relative_frequency(categs):
    """Return each category's share of `categs` as a percentage rounded to 2 decimals."""
    counts = pd.Series(categs).value_counts()
    return round(counts * 100 / counts.sum(), 2)


# Absolute number of occurrences of every category, one row per stay time.
categ_freq = categs_by_stay_time.apply(lambda categs: pd.Series(categs).value_counts())
# Same table expressed as percentages of each stay time's total.
categ_rel_freq = categs_by_stay_time.apply(_relative_frequency)
categ_rel_freq

Unnamed: 0_level_0,HOME,NoCategoryMatched,WORK,bakery,bar,beauty_salon,cafe,convenience_store,finance,general_contractor,grocery_or_supermarket,hair_care,health,laundry,lawyer,library,liquor_store,local_government_office,lodging,natural_feature,place_of_worship,political,real_estate_agency,restaurant,school,store,supermarket,transit_station,travel_agency
stay_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
5,13.33,4.32,12.65,0.67,1.53,3.57,1.96,1.3,2.82,1.73,0.85,1.18,7.44,0.15,0.56,0.39,1.57,0.36,1.8,0.02,1.94,2.34,1.89,9.83,2.87,13.5,1.67,6.98,0.8
10,15.55,4.58,14.46,0.6,1.47,3.85,1.48,0.93,2.53,1.78,0.71,1.11,7.12,0.15,0.59,0.35,1.44,0.33,1.83,0.03,2.11,2.35,1.83,9.51,2.86,11.89,1.6,6.33,0.6
15,16.43,4.6,15.43,0.58,1.43,3.98,1.32,0.84,2.41,1.77,0.63,1.07,7.02,0.15,0.61,0.34,1.41,0.33,1.81,0.03,2.11,2.36,1.84,9.3,2.82,11.25,1.54,6.09,0.51
20,17.24,4.66,16.31,0.57,1.38,4.12,1.31,0.7,2.29,1.73,0.5,1.03,6.82,0.13,0.58,0.3,1.33,0.32,1.79,0.03,2.09,2.4,1.8,9.13,2.81,10.72,1.44,6.01,0.47
25,17.75,4.67,16.97,0.55,1.34,4.2,1.31,0.64,2.22,1.72,0.47,0.98,6.66,0.12,0.57,0.28,1.31,0.31,1.76,0.03,2.09,2.39,1.78,8.94,2.83,10.44,1.38,5.86,0.45
30,18.51,4.75,17.83,0.52,1.23,4.25,1.32,0.57,2.09,1.71,0.43,0.88,6.41,0.11,0.55,0.21,1.31,0.28,1.82,0.03,2.05,2.42,1.77,8.68,2.8,9.94,1.31,5.82,0.41
45,20.3,4.59,19.86,0.46,1.0,4.32,1.24,0.53,1.97,1.58,0.4,0.67,5.9,0.11,0.47,0.15,1.33,0.26,1.75,0.02,1.93,2.41,1.71,8.09,2.76,9.06,1.22,5.54,0.35
60,21.4,4.62,21.07,0.42,0.91,4.4,1.16,0.5,1.87,1.49,0.36,0.63,5.57,0.1,0.35,0.1,1.29,0.23,1.68,0.01,1.89,2.38,1.76,7.58,2.76,8.53,1.19,5.43,0.33


In [182]:
# Percentage-point change in each category's relative frequency between the
# 5-minute and the 60-minute stay-time thresholds, largest increase first.
# Fix: the second operand used to read `categ_diversity_rel_freq`, a name that
# no visible cell defines (NameError on a fresh kernel); both rows come from
# `categ_rel_freq`.
changes = (categ_rel_freq.loc[60] - categ_rel_freq.loc[5]).sort_values(ascending=False)
changes

WORK                       8.42
HOME                       8.07
beauty_salon               0.83
NoCategoryMatched          0.30
political                  0.04
natural_feature           -0.01
laundry                   -0.05
place_of_worship          -0.05
school                    -0.11
lodging                   -0.12
real_estate_agency        -0.13
local_government_office   -0.13
lawyer                    -0.21
general_contractor        -0.24
bakery                    -0.25
liquor_store              -0.28
library                   -0.29
travel_agency             -0.47
supermarket               -0.48
grocery_or_supermarket    -0.49
hair_care                 -0.55
bar                       -0.62
convenience_store         -0.80
cafe                      -0.80
finance                   -0.95
transit_station           -1.55
health                    -1.87
restaurant                -2.25
store                     -4.97
dtype: float64

In [183]:
# The three categories at the top of the ranking (largest increases).
head = changes.index[:3].tolist()
head

['WORK', 'HOME', 'beauty_salon']

In [184]:
# The three categories at the bottom of the ranking (largest decreases).
tail = changes.index[-3:].tolist()
tail

['health', 'restaurant', 'store']

In [185]:
print(len(changes))
print(len(changes) / 2)

# Pick the entry at the centre of the ranking together with its two neighbours.
# The positions are derived from the number of categories rather than
# hard-coded (they were [13, 14, 15], which is only the centre for 29 entries).
# The lower bound is clamped so that very short rankings do not wrap around.
mid = len(changes) // 2
middle = changes.iloc[max(mid - 1, 0):mid + 2].index.tolist()
middle

29
14.5


['general_contractor', 'bakery', 'liquor_store']

In [188]:
# Nine colours from the 20-colour palette: entries 0-2, 4-6 and 8-10.
# NOTE(review): presumably each run of three is a set of shades of one hue
# (the fourth entry of every group of four is skipped) — confirm against the palette.
palette = Category20c[20]
colors = [palette[group_start + shade]
          for group_start in (0, 4, 8)
          for shade in range(3)]

In [196]:
# Plot the relative frequency of nine categories (top three gainers, three from
# the middle of the ranking, bottom three) against the stay-time threshold.
# Original colour note: blue, orange, green, purple.
# Fix: the series used to be read from `categ_diversity_rel_freq`, a name that
# no visible cell defines; the data lives in `categ_rel_freq`.
# NOTE(review): the title "P" looks like a placeholder — replace with a descriptive title.
categs = head + middle + tail

p = plot_result_multi_line(xs_list=[categ_rel_freq.index.tolist()] * len(categs),
                           ys_list=[categ_rel_freq[categ].tolist() for categ in categs],
                           x_label="Stop Region stay time",
                           y_label="Relative frequency of categ",
                           color_list=colors,
                           legend_list=categs,
                           title="P",
                           width=900,
                           height=500)

show(p)