# Using CategoryMapper (Google Places)

In [12]:
import os

# Run from the project root so the relative `outputs/...` paths used below resolve
# (presumably the `src.*` imports rely on it too — TODO confirm).
# The location can be overridden per machine through the MDC_ANALYSIS_ROOT
# environment variable; the original path remains the default.
PROJECT_ROOT = os.environ.get("MDC_ANALYSIS_ROOT", "/home/tales/dev/master/mdc_analysis/")
os.chdir(PROJECT_ROOT)
print("working dir", os.getcwd())

import pandas as pd
import matplotlib.pyplot as plt
import json
import urllib.request

from src.poi_grabber import google_places
from src.dao import csv_dao
from src.entity.stop_region import StopRegionGroup

from src.utils.type_hierarchy_analysis import parent, children, relations_freq, occurrences_in_visited_pois

working dir /home/tales/dev/master/mdc_analysis


ImportError: cannot import name 'occurrences_in_visited_pois'

In [2]:
# Fetch the stop-region groups from the service running on localhost and parse the JSON payload.
url = "http://127.0.0.1:5000/stop_regions_group"
# The context manager closes the HTTP response (and its socket) even if reading or parsing fails.
with urllib.request.urlopen(url) as response:
    users_tags_sequence = json.loads(response.read())

In [3]:
# users = os.listdir("outputs/stop_regions/")
# users.reverse()

# for user_id in users:
#     print("Loading user {} data".format(user_id))
#     users_tags_sequence[user_id] = StopRegionGroup(csv_dao.stop_region_sequence(user_id), 
#                                                     agglutinate_stop_regions=True).sequence_stop_region_tags()["tag"].tolist()

In [4]:
# POI table from the Google Places grabber; later cells iterate its rows and read its "types" column.
pois = google_places.load_all_google_places_data(valid_pois=True)

In [28]:
# Per-type POI frequencies read from CSV and indexed by the "type" column.
# The path is relative, so it depends on the working directory set in the first cell.
pois_freq = pd.read_csv("outputs/taxonomy/google_places/pois_frequencies.csv").set_index("type")
pois_freq.head()

Unnamed: 0_level_0,index,freq_all,freq_visited,prop_all,prop_visited
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
store,0,11788,9864,0.282,0.201
food,1,7858,8970,0.188,0.183
restaurant,2,4830,5327,0.116,0.109
health,3,8062,4132,0.193,0.084
transit_station,4,1688,3407,0.04,0.07


# Automatic Category Hierarchy

### Relations

In [16]:
# Relation table computed from all POIs (the preview below shows parent/category/freq columns);
# it is passed to parent() and children() further down.
relations_all_freq = relations_freq(pois)

In [17]:
# Preview the first 12 rows of the relation table.
relations_all_freq.head(12)

Unnamed: 0,parent,category,freq
0,.,establishment,40794
213,establishment,point_of_interest,40517
570,point_of_interest,store,11486
571,point_of_interest,health,6778
572,point_of_interest,food,4740
239,food,restaurant,4501
663,restaurant,,3779
738,store,food,2874
573,point_of_interest,finance,2865
339,health,,2780


# Using CategoryMapper

In [7]:
from src.taxonomy.category_mapping import CategoryMapper
from src.exceptions.exceptions import NoCategoryMatched
from src.utils.others import remove_list_elements

In [8]:
# Mapper instance (default construction); its map_categ() is called on each POI's type list below.
categ_mapper = CategoryMapper()

In [9]:
# Number of POIs loaded; the mapping loop below uses the same count for its progress percentages.
len(pois)

41739

## Mapping each POI from <font color="red">all</font> POIs

In [10]:
# Outcome buckets: POIs that received a category, and the type lists that matched none.
mapped_types, categ_not_found = [], []

total_pois = len(pois)
counter = 0  # stays 0 when there are no POIs at all

for counter, (i, poi) in enumerate(pois.iterrows(), start=1):
    # Progress report every 5000 POIs and once more on the final one.
    if counter == total_pois or counter % 5000 == 0:
        print("{}%".format(round(counter * 100 / total_pois, 2)))

    # Presumably strips these three entries from the POI's type list before mapping.
    types = remove_list_elements(poi["types"], elements=['premise', 'point_of_interest', 'establishment'])

    try:
        category = categ_mapper.map_categ(types)
    except NoCategoryMatched:
        # Keep the unmatched type list for the inspection further down.
        categ_not_found.append(types)
    else:
        mapped_types.append({"types": types, "category": category})

16.77%
33.54%
50.31%
67.08%
83.85%
100.0%


In [11]:
print("Types not mapped to any category")
# Stringify each unmatched type list so identical combinations are counted together.
pd.Series(categ_not_found).astype(str).value_counts()

Types not mapped to any category


['park']                            211
['laundry']                         165
['library']                         143
['shopping_mall']                   134
['car_rental']                      101
['movie_theater']                    93
['cemetery']                         53
['police']                           45
['car_wash']                         36
['moving_company']                   20
['roofing_contractor']               16
['amusement_park']                   14
['stadium']                          13
['bowling_alley']                     6
['courthouse']                        5
['natural_feature']                   4
['car_rental', 'moving_company']      2
['park', 'laundry']                   2
['movie_rental']                      1
['route']                             1
['aquarium']                          1
['moving_company', 'laundry']         1
dtype: int64

In [24]:
import ast 

# For every distinct unmatched type combination, print each type's occurrence stats
# (visited vs. all POIs) followed by its parent and children rows from the relation table.
# NOTE(review): occurrences_in_all_pois is defined in a cell further down this notebook,
# so that cell must be executed before this one. The import of occurrences_in_visited_pois
# in the first cell is shown failing with ImportError — confirm where it comes from.
for types in pd.Series(categ_not_found).astype(str).value_counts().index:
    print("\n-----")
    print("types:", types)
    print()
    
    # The index entries are stringified lists; literal_eval turns each back into a list.
    for a_type in ast.literal_eval(types):
        print(">>>", a_type)
        print("visited :", occurrences_in_visited_pois(a_type))
        print("all     :", occurrences_in_all_pois(a_type))

        print("PARENT")
        print(parent(a_type, relations_all_freq))
        print("\nCHILDREN")
        print(children(a_type, relations_all_freq))
        print()

#parent("point_of_interest")


-----
types: ['park']

>>> park
visited : {'freq': 591, 'prop': 0.012}
all     : {'freq': 260, 'prop': 0.006}
PARENT
                 parent category  freq      prop
595   point_of_interest     park   174  0.669231
648             premise     park    44  0.169231
461             lodging     park    23  0.088462
768               store     park     5  0.019231
300  general_contractor     park     5  0.019231
683          restaurant     park     2  0.007692
424             laundry     park     2  0.007692
283     furniture_store     park     2  0.007692
698             rv_park     park     1  0.003846
267                food     park     1  0.003846
101                cafe     park     1  0.003846

CHILDREN
    parent        category  freq      prop
526   park            NULL   219  0.842308
527   park      campground    21  0.080769
528   park         premise     4  0.015385
529   park   travel_agency     4  0.015385
531   park         florist     2  0.007692
530   park  amusement_park

all     : {'freq': 14, 'prop': 0.0}
PARENT
                parent       category  freq      prop
628  point_of_interest  bowling_alley     6  0.428571
542            parking  bowling_alley     2  0.142857
682         restaurant  bowling_alley     2  0.142857
52                 bar  bowling_alley     2  0.142857
284    furniture_store  bowling_alley     1  0.071429
487      meal_takeaway  bowling_alley     1  0.071429

CHILDREN
           parent category  freq  prop
79  bowling_alley     NULL    14   1.0


-----
types: ['courthouse']

>>> courthouse
visited : {'freq': 2, 'prop': 0.0}
all     : {'freq': 7, 'prop': 0.0}
PARENT
                parent    category  freq  prop
626  point_of_interest  courthouse     7   1.0

CHILDREN
         parent category  freq  prop
168  courthouse     NULL     7   1.0


-----
types: ['natural_feature']

>>> natural_feature
visited : {'freq': 11, 'prop': 0.0}
all     : {'freq': 7, 'prop': 0.0}
PARENT
                parent         category  freq  prop
627 

In [None]:
# Parent rows of "liquor_store"; the relation table is passed explicitly, matching
# the parent(a_type, relations_all_freq) call used in the inspection loop.
parent("liquor_store", relations_all_freq)

In [None]:
# NOTE(review): `categories` is not defined in any visible cell — TODO confirm where it is loaded.
# The next cell assigns this same key again.
categories["level_extra"] = ["natural_feature"]

In [None]:
# Types stored under the "level_extra" key, written as one literal.
categories["level_extra"] = ["natural_feature", "laundry", "library"]

categories

In [None]:
# Persist `categories` with json.dump.
# NOTE(review): the content written is JSON even though the file name ends in .csv.
with open('outputs/taxonomy/google_places/categories.csv', 'w') as outfile:
    json.dump(categories, outfile)

In [21]:
def occurrences_in_all_pois(term, pois_types=None, round_n=3):
    """Count how many entries of ``pois_types`` contain ``term``.

    Parameters
    ----------
    term : str
        Value looked up with ``term in entry`` for every entry.
    pois_types : pandas.Series, optional
        One entry per POI (the notebook passes lists of type names). When
        omitted, the notebook-level ``pois["types"]`` is used. It is resolved
        at call time, so a re-loaded ``pois`` is picked up and the function can
        be defined before ``pois`` exists.
    round_n : int or None
        Number of decimals kept in the proportion; any falsy value (``None``
        or ``0``) leaves it unrounded.

    Returns
    -------
    dict
        ``{"freq": int, "prop": float}`` where ``freq`` is the number of
        matching entries and ``prop`` is ``freq / len(pois_types)``. An empty
        series yields ``{"freq": 0, "prop": 0.0}`` instead of dividing by zero.
    """
    if pois_types is None:
        # Late binding: a default evaluated at definition time would freeze a stale snapshot.
        pois_types = pois["types"]

    total = len(pois_types)
    if total == 0:
        return {"freq": 0, "prop": 0.0}

    # Summing the boolean mask counts the matches; int() keeps the result a plain
    # Python int whether or not there are matches.
    freq = int(pois_types.apply(lambda types: term in types).sum())
    prop = freq / total

    if round_n:
        prop = round(prop, round_n)

    return {"freq": freq, "prop": prop}