# Preparation of category reduction lists

In [None]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import pickle

### Collect all categories from Foursquare

In [3]:
# Load the credentials for the APIs that will be used
# FOURSQUARE
# NOTE(review): pickle.load can execute arbitrary code while loading; only use a
# credentials file that you created yourself.
# The context manager closes the file even if unpickling fails.
with open('Credentials_Foursquare_API.pckl', 'rb') as f:
    cred = pickle.load(f)
CLIENT_ID = cred['CLIENT_ID']
CLIENT_SECRET = cred['CLIENT_SECRET']
# Sent as the `v` query parameter of the Foursquare request
VERSION = '20180604'
# Not referenced by the cells visible in this notebook; kept for compatibility
LIMIT = 2


In [4]:
# Download the complete category tree from Foursquare
url = 'https://api.foursquare.com/v2/venues/categories?client_id={}&client_secret={}&v={}'.format(CLIENT_ID, CLIENT_SECRET,VERSION)
# A timeout prevents the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)
# Fail here, with the HTTP status, instead of later with a KeyError on results['response']
response.raise_for_status()
results = response.json()

In [29]:
# Columns of the dataframe where all the categories will be stored for clarity
columns_name=['cat_id','cat','subcat1','subcat2','subcat3','subcat4']

# Names of the nesting levels, from main category (depth 0) to the deepest level handled (depth 4)
LEVEL_COLUMNS = columns_name[1:]
# The user is asked about the children of depth 0, 1 and 2 categories only;
# anything deeper is always merged into the closest category that owns a list
DEEPEST_ASKED_DEPTH = 2

# Dictionary whose keys are the categories to be considered. Each value is the list of
# category names merged into that key; the key itself is always the first element.
cat_lists_dic={}

# One record per Foursquare category. The dataframe is built once at the end:
# growing a dataframe row by row is quadratic, and DataFrame.append was removed in pandas 2.0.
cat_records=[]

# Instructions to the user to generate the lists of categories
print('The {} main categories are always considered.'.format(len(results['response']['categories'])))
print("You'll be asked wether to consider further subcategories. Please answer (y/n, n by default)")
print('Case you do not wish to consider further subcategories, those will be merged to their parent category.')


def _ask_consider_children(question, children):
    """Ask the user whether the children of a category get lists of their own.

    Answering 'l' prints the raw list of children and asks one more time.
    Returns True only when the final answer is 'y' (case-insensitive);
    any other answer counts as 'n'.
    """
    answer = input(question)
    if answer.lower()=='l':
        print(children)
        answer = input("Do you wand to consider them? (y/n, n by default)")
    return answer.lower()=='y'


def _collect_category(node, index_path, name_path, owner, own_list):
    """Record one category and, depth first, all of its descendants.

    node       -- category dictionary with the keys 'id', 'name' and 'categories'
    index_path -- positions leading to `node` (its own position included);
                  only used in the warning message
    name_path  -- names of the ancestors of `node`, main category first
    owner      -- key of cat_lists_dic that receives `node` when it is merged
    own_list   -- True when `node` starts a list of its own

    Appends to cat_records and updates cat_lists_dic.
    """
    depth = len(name_path)
    name = node['name']
    children = node['categories']
    names = name_path + [name]
    is_deepest = depth == len(LEVEL_COLUMNS) - 1

    # A check whether further levels of categories have been developed is performed
    if is_deepest and len(children) != 0:
        print('WARNING!')
        print('subcat2name {}.{}.{}.{}.{} is: {} - has {} childs,'.format(*index_path, name, len(children)))
        print("and won't be added to any list!, please review code or disrigard any category reduction.")

    # Store the category in the records; the levels below `depth` stay empty (NaN in the dataframe)
    record = {'cat_id': node['id']}
    record.update(zip(LEVEL_COLUMNS, names))
    cat_records.append(record)

    # Either the category starts its own list, or it is merged into the closest considered ancestor
    if own_list:
        cat_lists_dic[name] = [name]
        owner = name
    else:
        cat_lists_dic[owner].append(name)

    if is_deepest:
        return

    # Decide whether the children get lists of their own.
    # When no question is asked the answer is 'n', so no answer given for a previous
    # category leaks in and no undefined answer is ever read.
    if depth == 0:
        split_children = _ask_consider_children(
            "Main category {} has {} childs. Do you wand to consider them? (y/n/l, l to se list, n by default)".format(name, len(children)),
            children)
    elif own_list and depth <= DEEPEST_ASKED_DEPTH and len(children) != 0:
        split_children = _ask_consider_children(
            "Subcategory {} {} has {} childs. Do you wand to consider them? (y/n/l, l to se list, n by default)".format(depth, name, len(children)),
            children)
    else:
        split_children = False

    for position, child in enumerate(children):
        _collect_category(child, index_path + [position], names, owner, split_children)


# Walk the whole tree; the main categories always get a list of their own
for position, main_category in enumerate(results['response']['categories']):
    _collect_category(main_category, [position], [], None, True)

cat_df=pd.DataFrame(cat_records, columns=columns_name)
# Total number of categories declared by Foursquare that were visited
total_cat=len(cat_records)

print('Total categories listed: ',total_cat)
print('Dataframe generated shape',cat_df.shape)
print('The lists of categories created are {}, and their names are:'.format(len(cat_lists_dic.keys())))
cat_lists_dic.keys()

The 10 main categories are always considered.
You'll be asked wether to consider further subcategories. Please answer (y/n, n by default)
Case you do not wish to consider further subcategories, those will be merged to their parent category.


Main category Arts & Entertainment has 36 childs. Do you wand to consider them? (y/n/l, l to se list, n by default) n
Main category College & University has 23 childs. Do you wand to consider them? (y/n/l, l to se list, n by default) n
Main category Event has 12 childs. Do you wand to consider them? (y/n/l, l to se list, n by default) n
Main category Food has 92 childs. Do you wand to consider them? (y/n/l, l to se list, n by default) n
Main category Nightlife Spot has 7 childs. Do you wand to consider them? (y/n/l, l to se list, n by default) n
Main category Outdoors & Recreation has 62 childs. Do you wand to consider them? (y/n/l, l to se list, n by default) n
Main category Professional & Other Places has 43 childs. Do you wand to consider them? (y/n/l, l to se list, n by default) n
Main category Residence has 5 childs. Do you wand to consider them? (y/n/l, l to se list, n by default) n
Main category Shop & Service has 145 childs. Do you wand to consider them? (y/n/l, l to se list, n

Total categories listed:  941
Dataframe generated shape (941, 6)
The lists of categories created are 10, and their names are:


dict_keys(['Arts & Entertainment', 'College & University', 'Event', 'Food', 'Nightlife Spot', 'Outdoors & Recreation', 'Professional & Other Places', 'Residence', 'Shop & Service', 'Travel & Transport'])

## Split some of the main categories

In [30]:
# Split some of the lists
# Split the Food list into 3: Restaurants, Cafe, Food (the remaining)
food_names = cat_lists_dic['Food']
print('Food list length: ',len(food_names))

Restaurants = [venue for venue in food_names if ('Restaurant' in venue or 'Joint' in venue or 'Steakhouse' in venue)]
Cafe = [venue for venue in food_names if ('Café' in venue or 'Coffee' in venue or 'Tea ' in venue)]
# Work on a copy: otherwise Food_remaining is the very same list object stored in the
# dictionary, and trimming it would silently change cat_lists_dic['Food'] as well
Food_remaining = list(food_names)
print('new lists length: ',len(Restaurants),len(Cafe),len(Food_remaining))


Food list length:  349
new lists length:  257 6 349


In [31]:
# Drop every name that was moved to Restaurants or Cafe.
# Filtering (instead of list.remove) does not fail when a name belongs to both lists
# or when this cell is run a second time.
moved_names = set(Restaurants) | set(Cafe)
Food_remaining = [venue for venue in Food_remaining if venue not in moved_names]

print('new lists length: ',len(Restaurants),len(Cafe),len(Food_remaining))

# Store the three resulting lists in the dictionary
cat_lists_dic['Food']=Food_remaining
cat_lists_dic['Restaurants']=Restaurants
cat_lists_dic['Cafe']=Cafe

cat_lists_dic.keys()

new lists length:  257 6 86


dict_keys(['Arts & Entertainment', 'College & University', 'Event', 'Food', 'Nightlife Spot', 'Outdoors & Recreation', 'Professional & Other Places', 'Residence', 'Shop & Service', 'Travel & Transport', 'Restaurants', 'Cafe'])

## Save dictionary and df of lists in two separate files

In [34]:
# Save the lists in a pickle file
file_name = input('Provide the file name where the dictionary will be saved, DO NOT include the extension: ')
file_name=file_name+'.pckl'
# The context manager flushes and closes the file even if pickling fails
with open(file_name, 'wb') as f:
    pickle.dump(cat_lists_dic, f)
print('File saved')

Provide the file name where the dictionary will be saved, DO NOT include the extension:  Cat_red_13


File saved


In [7]:
# Save the dataframe in a pickle file
file_name = 'Cat_all_df.pckl'
# The context manager flushes and closes the file even if pickling fails
with open(file_name, 'wb') as f:
    pickle.dump(cat_df, f)
print('File saved')

File saved


In [9]:
# Export the categories dataframe to a spreadsheet too (single sheet, index column left out)
# NOTE(review): writing the legacy .xls format depends on the installed pandas/engine
# versions — confirm it is still supported in the target environment.
excel_path = "cat_df.xls"
cat_df.to_excel(excel_path, sheet_name='Categories', index=False)

#### Load files to review

In [3]:
# Load the category reduction criteria
# NOTE(review): pickle.load can execute arbitrary code while loading; only open
# files produced by this notebook.
# The context manager closes the file even if unpickling fails.
with open('Cat_red_13.pckl', 'rb') as f:
    cat_lists_dic = pickle.load(f)

In [7]:
# Display the names of the category lists held in the loaded dictionary
cat_lists_dic.keys()

dict_keys(['Arts & Entertainment', 'College & University', 'Event', 'Food', 'Nightlife Spot', 'Outdoors & Recreation', 'Professional & Other Places', 'Residence', 'Shop & Service', 'Travel & Transport', 'Restaurants', 'Cafe'])