# Using CategoryMapper (Google Places)

In [1]:
import os

# Project root used as the working directory, so that the relative "outputs/..."
# paths and the `src` package imports below resolve. Set the MDC_ANALYSIS_ROOT
# environment variable to run the notebook on another machine; the original
# location is kept as the default.
PROJECT_ROOT = os.environ.get("MDC_ANALYSIS_ROOT", "/home/tales/dev/master/mdc_analysis/")
os.chdir(PROJECT_ROOT)
print("working dir", os.getcwd())

import pandas as pd
import matplotlib.pyplot as plt
import json
import urllib.request

from src.poi_grabber import google_places
from src.dao import csv_dao, objects_dao
from src.entity.stop_region import StopRegionGroup

from src.utils.type_hierarchy_analysis import parent, children, relations_freq, occurrences_in_visited_pois, occurrences_in_all_pois

working dir /home/tales/dev/master/mdc_analysis


In [2]:
def get_users_tags_sequence(url="http://127.0.0.1:5000/stop_regions_group"):
    """Fetch the users' tag sequences as parsed JSON from an HTTP endpoint.

    Parameters
    ----------
    url : str, optional
        Address to request. Defaults to the local service endpoint that was
        previously hard-coded.

    Returns
    -------
    object
        Whatever ``json.loads`` produces for the response body.
    """
    # The context manager closes the response (and its socket) even when
    # reading or JSON decoding fails.
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read())

def load_users_tags_sequence():
    """Return the parsed content of ``outputs/users_tags_sequence.json``.

    The path is relative to the current working directory.
    """
    with open('outputs/users_tags_sequence.json') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
    
users_tags_sequence = load_users_tags_sequence()

In [3]:
# users = os.listdir("outputs/stop_regions/")
# users.reverse()

# for user_id in users:
#     print("Loading user {} data".format(user_id))
#     users_tags_sequence[user_id] = StopRegionGroup(csv_dao.stop_region_sequence(user_id), 
#                                                     agglutinate_stop_regions=True).sequence_stop_region_tags()["tag"].tolist()

In [4]:
pois = google_places.load_all_google_places_data(valid_pois=True)

In [5]:
pois_freq = pd.read_csv("outputs/taxonomy/google_places/pois_frequencies.csv").set_index("type")
pois_freq.head()

Unnamed: 0_level_0,index,freq_all,freq_visited,prop_all,prop_visited
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
store,0,11807,9864,0.282,0.201
food,1,7873,8970,0.188,0.183
restaurant,2,4831,5327,0.116,0.109
health,3,8089,4132,0.193,0.084
transit_station,4,1691,3407,0.04,0.07


# Automatic Category Hierarchy

### Relations

In [6]:
relations_all_freq = relations_freq(pois)

In [7]:
relations_all_freq.head(12)

Unnamed: 0,parent,category,freq
0,.,establishment,40876
213,establishment,point_of_interest,40597
570,point_of_interest,store,11508
571,point_of_interest,health,6806
572,point_of_interest,food,4739
239,food,restaurant,4501
663,restaurant,,3779
739,store,food,2888
573,point_of_interest,finance,2872
340,health,,2776


In [8]:
print(occurrences_in_visited_pois("parking", users_tags_sequence))
print(occurrences_in_all_pois("parking", pois["types"]))

{'freq': 631, 'prop': 0.013}
{'freq': 412, 'prop': 0.01}


In [9]:
print(occurrences_in_visited_pois("natural_feature", users_tags_sequence))
print(occurrences_in_all_pois("natural_feature", pois["types"]))

{'freq': 11, 'prop': 0.0}
{'freq': 7, 'prop': 0.0}


# Using CategoryMapper

In [10]:
from src.taxonomy.category_mapping import CategoryMapper
from src.exceptions.exceptions import NoCategoryMatched
from src.utils.others import remove_list_elements

In [11]:
categ_mapper = CategoryMapper()

In [12]:
len(pois)

41819

## Mapping each POI from <font color="red">all</font> POIs

In [14]:
# Map the type list of every POI to a single category. POIs for which the
# mapper raises NoCategoryMatched are recorded with the placeholder category
# "NoCategoryMatched", and their type lists are collected in `categ_not_found`.
mapped_types = []
categ_not_found = []

counter = 0
total_pois = len(pois)

for counter, (i, poi) in enumerate(pois.iterrows(), start=1):
    # Progress report: every 5000 rows and once more on the final row.
    if counter == total_pois or counter % 5000 == 0:
        print("{}%".format(round(counter * 100 / total_pois, 2)))

    types = remove_list_elements(poi["types"], elements=['premise', 'point_of_interest', 'establishment'])

    try:
        category = categ_mapper.map_categ(types, method="most_specific", logs=False)
    except NoCategoryMatched:
        category = "NoCategoryMatched"
        categ_not_found.append(types)

    mapped_types.append({"types": types, "category": category})

11.96%
23.91%
35.87%
47.83%
59.78%
71.74%
83.69%
95.65%
100.0%


In [15]:
print(len(mapped_types))
print(len(categ_not_found))

41819
815


In [16]:
print("Types not mapped to any category")
pd.Series(categ_not_found).astype(str).value_counts()

Types not mapped to any category


['park']                            212
['shopping_mall']                   134
['car_rental']                      100
['movie_theater']                    94
['funeral_home']                     58
['cemetery']                         53
['police']                           47
['car_wash']                         36
['moving_company']                   19
['roofing_contractor']               16
['amusement_park']                   14
['stadium']                          13
['airport']                           9
['courthouse']                        5
['car_rental', 'moving_company']      2
['movie_rental']                      1
['aquarium']                          1
['route']                             1
dtype: int64

In [17]:
import ast


def _report_type(a_type):
    """Print frequency stats plus the parent/children relation rows for one type."""
    print(">>>", a_type)
    print("visited :", occurrences_in_visited_pois(a_type, users_tags_sequence))
    print("all     :", occurrences_in_all_pois(a_type, pois["types"]))

    print("PARENT")
    print(parent(a_type, relations_all_freq))
    print("\nCHILDREN")
    print(children(a_type, relations_all_freq))
    print()


# Distinct unmatched type lists, keyed by their string representation, in the
# order produced by value_counts().
unmatched_type_lists = pd.Series(categ_not_found).astype(str).value_counts().index

for types in unmatched_type_lists:
    print("\n-----")
    print("types:", types)
    print()

    # `types` is the repr of a list here, so parse it back into its elements.
    for a_type in ast.literal_eval(types):
        _report_type(a_type)

#parent("point_of_interest")


-----
types: ['park']

>>> park
visited : {'freq': 591, 'prop': 0.012}
all     : {'freq': 260, 'prop': 0.006}
PARENT
                 parent category  freq      prop
595   point_of_interest     park   175  0.673077
648             premise     park    44  0.169231
463             lodging     park    23  0.088462
301  general_contractor     park     5  0.019231
770               store     park     4  0.015385
683          restaurant     park     2  0.007692
283     furniture_store     park     2  0.007692
425             laundry     park     2  0.007692
698             rv_park     park     1  0.003846
101                cafe     park     1  0.003846
267                food     park     1  0.003846

CHILDREN
    parent        category  freq      prop
527   park            NULL   220  0.846154
528   park      campground    21  0.080769
530   park   travel_agency     4  0.015385
529   park         premise     4  0.015385
532   park         florist     2  0.007692
531   park  amusement_park

331                gym  amusement_park     1  0.05

CHILDREN
            parent    category  freq  prop
10  amusement_park        NULL    16  0.80
11  amusement_park  campground     2  0.10
13  amusement_park         zoo     1  0.05
12  amusement_park      museum     1  0.05


-----
types: ['stadium']

>>> stadium
visited : {'freq': 636, 'prop': 0.013}
all     : {'freq': 13, 'prop': 0.0}
PARENT
                parent category  freq      prop
625  point_of_interest  stadium     8  0.615385
651            premise  stadium     5  0.384615

CHILDREN
      parent category  freq  prop
730  stadium     NULL    13   1.0


-----
types: ['airport']

>>> airport
visited : {'freq': 8, 'prop': 0.0}
all     : {'freq': 13, 'prop': 0.0}
PARENT
                parent category  freq      prop
624  point_of_interest  airport     9  0.692308
729                spa  airport     1  0.076923
812      travel_agency  airport     1  0.076923
825         university  airport     1  0.076923
53                 bar

#  Mapping User 6189

Look at the <b>store</b> cases.

In [18]:
# Map every stop of user 6189 to a category, keeping the tag sequence and the
# category sequence aligned element by element.
tags_sequence = users_tags_sequence["6189"]
categ_sequence = []

for tags in tags_sequence:
    if tags in (["WORK"], ["HOME"]):
        # WORK/HOME labels are kept verbatim instead of being sent to the mapper.
        categ_sequence.append(tags)
        continue

    try:
        categ_sequence.append([categ_mapper.map_categ(tags, logs=True)])
    except NoCategoryMatched:
        # Wrapped in a list so that every element of categ_sequence has the
        # same shape (previously a bare string was appended here).
        categ_sequence.append(["NoCategoryMatched"])

# Both sequences have the same length by construction, so zip pairs each
# category with the tags it was derived from.
for categ, tags in zip(categ_sequence, tags_sequence):
    print(categ, "<<<", tags)


---
TYPES : ['bus_station', 'transit_station']

MAPPED: ['transit_station', 'transit_station']
most_frequent: ['transit_station']
most_specific: transit_station
chosen method: most_specific

---
TYPES : ['dentist', 'health', 'health']

MAPPED: ['health', 'health', 'health']
most_frequent: ['health']
most_specific: health
chosen method: most_specific

---
TYPES : ['bus_station', 'transit_station']

MAPPED: ['transit_station', 'transit_station']
most_frequent: ['transit_station']
most_specific: transit_station
chosen method: most_specific

---
TYPES : ['finance', 'finance', 'health', 'general_contractor']

MAPPED: ['finance', 'finance', 'health', 'general_contractor']
most_frequent: ['finance']
most_specific: finance
chosen method: most_specific

---
TYPES : ['insurance_agency', 'health']

MAPPED: ['health', 'health']
most_frequent: ['health']
most_specific: health
chosen method: most_specific

---
TYPES : ['insurance_agency']

MAPPED: ['health']
most_frequent: ['health']
most_specific:

---
TYPES : ['travel_agency']

MAPPED: ['travel_agency']
most_frequent: ['travel_agency']
most_specific: travel_agency
chosen method: most_specific

---
TYPES : ['meal_delivery', 'meal_takeaway', 'restaurant', 'food']

MAPPED: ['restaurant', 'restaurant', 'store']
most_frequent: ['restaurant']
most_specific: restaurant
chosen method: most_specific

---
TYPES : ['general_contractor']

MAPPED: ['general_contractor']
most_frequent: ['general_contractor']
most_specific: general_contractor
chosen method: most_specific

---
TYPES : ['locksmith']

MAPPED: ['store']
most_frequent: ['store']
most_specific: store
chosen method: most_specific

---
TYPES : ['real_estate_agency']

MAPPED: ['real_estate_agency']
most_frequent: ['real_estate_agency']
most_specific: real_estate_agency
chosen method: most_specific

---
TYPES : ['church', 'place_of_worship']

MAPPED: ['place_of_worship', 'place_of_worship']
most_frequent: ['place_of_worship']
most_specific: place_of_worship
chosen method: most_specific


---
TYPES : ['cafe', 'food', 'bakery', 'food', 'store']

MAPPED: ['cafe', 'store', 'bakery', 'store', 'store']
most_frequent: ['store']
most_specific: cafe
chosen method: most_specific

---
TYPES : ['restaurant', 'food']

MAPPED: ['restaurant', 'store']
most_frequent: ['restaurant', 'store']
most_specific: restaurant
chosen method: most_specific

---
TYPES : ['cafe', 'food', 'bakery', 'food', 'store']

MAPPED: ['cafe', 'store', 'bakery', 'store', 'store']
most_frequent: ['store']
most_specific: cafe
chosen method: most_specific

---
TYPES : ['clothing_store', 'store']

MAPPED: ['store', 'store']
most_frequent: ['store']
most_specific: store
chosen method: most_specific

---
TYPES : ['car_repair']

MAPPED: ['store']
most_frequent: ['store']
most_specific: store
chosen method: most_specific

---
TYPES : ['cafe', 'food', 'bakery', 'food', 'store']

MAPPED: ['cafe', 'store', 'bakery', 'store', 'store']
most_frequent: ['store']
most_specific: cafe
chosen method: most_specific

---
TYPES : [

---
TYPES : ['locality', 'political']

MAPPED: ['political', 'political']
most_frequent: ['political']
most_specific: political
chosen method: most_specific

---
TYPES : ['post_office', 'finance']

MAPPED: ['finance', 'finance']
most_frequent: ['finance']
most_specific: finance
chosen method: most_specific

---
TYPES : ['food']

MAPPED: ['store']
most_frequent: ['store']
most_specific: store
chosen method: most_specific

---
TYPES : ['bus_station', 'transit_station']

MAPPED: ['transit_station', 'transit_station']
most_frequent: ['transit_station']
most_specific: transit_station
chosen method: most_specific

---
TYPES : ['physiotherapist', 'health']

MAPPED: ['health', 'health']
most_frequent: ['health']
most_specific: health
chosen method: most_specific

---
TYPES : ['gym', 'health']

MAPPED: ['health', 'health']
most_frequent: ['health']
most_specific: health
chosen method: most_specific

---
TYPES : ['dentist', 'health', 'health']

MAPPED: ['health', 'health', 'health']
most_frequen

---
TYPES : ['locality', 'political']

MAPPED: ['political', 'political']
most_frequent: ['political']
most_specific: political
chosen method: most_specific

---
TYPES : ['gas_station']

MAPPED: ['convenience_store']
most_frequent: ['convenience_store']
most_specific: convenience_store
chosen method: most_specific

---
TYPES : ['bus_station', 'transit_station']

MAPPED: ['transit_station', 'transit_station']
most_frequent: ['transit_station']
most_specific: transit_station
chosen method: most_specific

---
TYPES : ['furniture_store', 'laundry', 'home_goods_store', 'store', 'general_contractor']

MAPPED: ['laundry', 'store', 'store', 'general_contractor']
most_frequent: ['store']
most_specific: laundry
chosen method: most_specific

---
TYPES : ['hardware_store', 'furniture_store', 'home_goods_store', 'store', 'atm', 'finance', 'clothing_store', 'bicycle_store', 'store', 'furniture_store', 'shopping_mall', 'home_goods_store', 'store', 'grocery_or_supermarket', 'supermarket', 'food', 'sto

['store'] <<< ['clothing_store', 'store']
['store'] <<< ['furniture_store', 'home_goods_store', 'store']
['health'] <<< ['dentist', 'health', 'health']
['restaurant'] <<< ['restaurant', 'food']
['HOME'] <<< ['HOME']
['health'] <<< ['dentist', 'health', 'health']
['store'] <<< ['car_repair']
['store'] <<< ['locksmith']
['store'] <<< ['clothing_store', 'store']
['store'] <<< ['locksmith']
['store'] <<< ['car_repair']
['transit_station'] <<< ['bus_station', 'transit_station']
NoCategoryMatched <<< ['police']
['health'] <<< ['pharmacy', 'health', 'store']
['health'] <<< ['dentist', 'health']
['cafe'] <<< ['cafe', 'food', 'bakery', 'food', 'store']
['HOME'] <<< ['HOME']
['store'] <<< ['locksmith']
['store'] <<< ['car_repair']
['store'] <<< ['car_repair']
['WORK'] <<< ['WORK']
['HOME'] <<< ['HOME']
['restaurant'] <<< ['restaurant', 'food']
['general_contractor'] <<< ['general_contractor']
['HOME'] <<< ['HOME']
['general_contractor'] <<< ['painter']
['health'] <<< ['pharmacy', 'health', 'stor

In [19]:
categ_mapper.map_categ(['cafe', 'food', 'bakery', 'food', 'store'], logs=True)


---
TYPES : ['cafe', 'food', 'bakery', 'food', 'store']

MAPPED: ['cafe', 'store', 'bakery', 'store', 'store']
most_frequent: ['store']
most_specific: cafe
chosen method: most_specific


'cafe'

In [20]:
len(pd.DataFrame(mapped_types))

41819

In [21]:
"aaabaa".replace("b", "x")

'aaaxaa'

In [22]:
def search_categ_by_types(types, mapped_types=None):
    """Look up the category recorded for an exact list of POI types.

    Parameters
    ----------
    types : list of str
        Type list to search for. It is compared through its string
        representation, so the order of the elements matters.
    mapped_types : list of dict, optional
        Records carrying a ``"types"`` and a ``"category"`` key. When omitted,
        the notebook-level ``mapped_types`` variable is looked up at call time,
        so re-running the mapping cell is picked up without redefining this
        function.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows whose stringified ``types`` equals ``str(types)``
        (double quotes normalised to single quotes on both sides). Empty when
        nothing matches or when there are no records.

    Raises
    ------
    NameError
        If ``mapped_types`` is omitted and no such notebook-level variable exists.
    """
    if mapped_types is None:
        # Resolve the default lazily: a default bound in the signature would
        # keep pointing at the list that existed when this cell was executed.
        try:
            mapped_types = globals()["mapped_types"]
        except KeyError:
            raise NameError(
                "name 'mapped_types' is not defined; run the mapping cell first"
            ) from None

    df = pd.DataFrame(mapped_types)
    if "types" not in df.columns:
        # No records at all: return an empty result instead of raising KeyError.
        return pd.DataFrame(columns=["types", "category"])

    types_str = str(types).replace("\"", "\'")
    # Literal (non-regex) replacement, mirroring the normalisation applied to
    # the query string above.
    df["types"] = df["types"].astype(str).str.replace("\"", "\'", regex=False)

    return df[df["types"] == types_str].drop_duplicates()

In [23]:
search_categ_by_types(['liquor_store', 'food', 'store'])

Unnamed: 0,category,types
479,liquor_store,"['liquor_store', 'food', 'store']"


In [24]:
# Sample tag lists whose recorded category is looked up below.
sample_queries = (
    ['liquor_store', 'food', 'store'],
    ['bus_station', 'transit_station'],
    ['HOME'],
    ['museum', 'insurance_agency', 'finance', 'finance'],
    ['parking'],
    ['airport'],
    ['place_of_worship'],
    ['HOME'],
)

for types in sample_queries:
    print("types:", types)

    if types in (["WORK"], ["HOME"]):
        # WORK/HOME labels are printed as their own category.
        print("category:", types)
    else:
        categ = search_categ_by_types(types)["category"]
        print("category:", "NOT FOUND" if categ.empty else categ)

    print()

types: ['liquor_store', 'food', 'store']
category: 479    liquor_store
Name: category, dtype: object

types: ['bus_station', 'transit_station']
category: 12    transit_station
Name: category, dtype: object

types: ['HOME']
category: ['HOME']

types: ['museum', 'insurance_agency', 'finance', 'finance']
category: NOT FOUND

types: ['parking']
category: 24    bar
Name: category, dtype: object

types: ['airport']
category: 11832    NoCategoryMatched
Name: category, dtype: object

types: ['place_of_worship']
category: 74    place_of_worship
Name: category, dtype: object

types: ['HOME']
category: ['HOME']



In [25]:
search_categ_by_types(['museum', 'insurance_agency', 'finance'])

Unnamed: 0,category,types


In [26]:
srg = objects_dao.load_stop_region_group_object("6015")

In [27]:
srg.sequence_report().head()

Unnamed: 0,delta_t_from_last_sr,distance,last_sr_type,sr_type,last_sr_semantics,sr_semantics,last_sr,sr,tags,last_tags
0,97363.0,27957.2,[[airport]],"[[liquor_store, food, store]]",[WORK],[],agg_6015_0,6015_3,"[liquor_store, food, store]",[WORK]
1,4438.0,1068.4,"[[liquor_store, food, store]]","[[bus_station, transit_station]]",[],[],6015_3,6015_4,"[bus_station, transit_station]","[liquor_store, food, store]"
2,1387462.0,1238.6,"[[bus_station, transit_station]]",[[hair_care]],[],[HOME],6015_4,agg_6015_5,[HOME],"[bus_station, transit_station]"
3,3391.0,1105.0,[[hair_care]],"[[museum], [insurance_agency, finance], [finan...",[HOME],[],agg_6015_5,6015_7,"[museum, insurance_agency, finance, finance]",[HOME]
4,50.0,86.1,"[[museum], [insurance_agency, finance], [finan...",[[parking]],[],[],6015_7,6015_8,[parking],"[museum, insurance_agency, finance, finance]"


In [28]:
srg.search_stop_region_by_id("6015_7").load_close_pois()

Unnamed: 0,name,place_id,plus_code,price_level,rating,types,user_ratings_total,vicinity,latitude,longitude,distance,position,sr_id
22,JPF Instruments,ChIJpUBZuz8fj0cR57dcqc6HOT4,"{'compound_code': '7GVM+CR Sierre, Switzerland...",,,"[museum, point_of_interest, establishment]",,"Avenue Max Huber 2, Sierre",46.293526,7.534571,13.021041,0,6015_7
5,"Zurich, Agence Generale Roger Besse",ChIJLTkT9Q4fj0cRkYAyM5r6tS8,"{'compound_code': '7GVM+CR Sierre, Switzerland...",,3.5,"[insurance_agency, finance, point_of_interest,...",2.0,"Avenue Max Huber 2, Sierre",46.293526,7.534571,13.021041,2,6015_7
28,Zurich Suisse - Agence,ChIJzQvj9A4fj0cRo9o_gZozQzM,"{'compound_code': '7GVM+CR Sierre, Switzerland...",,,"[finance, point_of_interest, establishment]",,"Avenue Max Huber 2, Sierre",46.293526,7.534571,13.021041,1,6015_7
19,Cinémas de Sierre Sàrl,ChIJFzF88Q4fj0cRGLh_baDoGvc,"{'compound_code': '7GVM+GP Sierre, Switzerland...",,5.0,"[movie_theater, point_of_interest, establishment]",1.0,"Avenue Max Huber 1, Sierre",46.293758,7.534314,20.067624,3,6015_7
11,Bourg,ChIJFzF88Q4fj0cRk9ftb643Ygg,"{'compound_code': '7GVM+GP Sierre, Switzerland...",,4.2,"[movie_theater, point_of_interest, establishment]",49.0,"Avenue Max Huber 1, Sierre",46.293758,7.534314,20.067624,4,6015_7
14,Coop Vitality Sierre,ChIJS9p9WA4fj0cRHURY-5mHZ18,"{'compound_code': '7GVM+GQ Sierre, Switzerland...",,4.1,"[pharmacy, health, store, point_of_interest, e...",8.0,"Avenue Max Huber 7, Sierre",46.293837,7.534488,26.602576,5,6015_7
25,Soho Lounge Café Sàrl,ChIJFzF88Q4fj0cRKULITUZaG6o,"{'compound_code': '7GVM+FJ Sierre, Switzerland...",,4.1,"[cafe, food, point_of_interest, establishment]",95.0,"Avenue Max Huber 1, Sierre",46.293642,7.534058,29.802015,6,6015_7
9,Coop Supermarché Sierre,ChIJkSj7Ww4fj0cRLiKQgoGv7co,"{'compound_code': '7GVM+HP Sierre, Switzerland...",,4.2,"[grocery_or_supermarket, food, store, point_of...",214.0,"Avenue Max Huber 6, Sierre",46.293876,7.534335,31.728198,7,6015_7
2,Roland Burgener,ChIJ8zSC7Q4fj0cRwU3lhjzrPZA,"{'compound_code': '7GVM+GJ Sierre, Switzerland...",,3.7,"[doctor, health, point_of_interest, establishm...",3.0,"Avenue du Château 4, Sierre",46.293752,7.534079,32.545304,10,6015_7
17,"CSS Assurance, Agence Sierre",ChIJJer97Q4fj0cRDVeR1oY_Ack,"{'compound_code': '7GVM+GJ Sierre, Switzerland...",,5.0,"[insurance_agency, health, point_of_interest, ...",4.0,"Avenue du Château 4, Sierre",46.293752,7.534079,32.545304,8,6015_7
