# Feature Engineering for Geo Model

This notebook creates geographic features from the Foursquare data and should therefore be run __after__ ``download_foursquare.ipynb``.

### Import packages

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import math
import os
import warnings
from math import log2
warnings.filterwarnings(action="ignore")

from tqdm.notebook import tqdm
import pandas as pd

import multiprocessor_wiki

Define constants.

- ``PATH``: Path to the base data folder
- ``MAX_DIST``: Maximum distance in m around each venue to consider (half the side length of the square search window)
- ``R_EARTH``: Approximate radius of the earth in m
- ``CITY_CENTER``: Coordinates (latitude, longitude) of the city centre of the chosen city.

In [3]:
# Base data folder. Set the GEO_DATA_PATH environment variable to point the
# notebook at another location; the default is the original author's folder.
PATH = os.environ.get("GEO_DATA_PATH", "C:/Users/Tim/.keras/datasets/wikipedia_rss/")
if not PATH.endswith(("/", "\\")):
    # File names are appended with plain string concatenation (PATH + "<file>").
    PATH += "/"

# Half side length of the square search window around each venue, in the same
# length unit as R_EARTH (metres).
MAX_DIST = 200

# Approximate radius of the earth in m.
R_EARTH = 6371000.0

# (latitude, longitude) of the city centre, in degrees.
CITY_CENTER = (40.730610, -73.935242)

Load venue and target category data.

In [4]:
# Venue table that carries the target ``category`` column.
structured_file = PATH + "structured_preprocessed.csv"
structured = pd.read_csv(structured_file)
print(structured.shape)
structured.head()

(2836, 7)


Unnamed: 0,venue_id,lat,long,borough,category,org_category,total_visits
0,3fd66200f964a52001e51ee3,40.726961,-73.980039,Manhattan,Bar,Dive Bar,1
1,3fd66200f964a52003e51ee3,40.724822,-73.981456,Manhattan,Bar,Dive Bar,15
2,3fd66200f964a52010e51ee3,40.727027,-73.982702,Manhattan,Bar,Dive Bar,14
3,3fd66200f964a52011e81ee3,40.762812,-73.967519,Manhattan,Bar,Dive Bar,18
4,3fd66200f964a52018e51ee3,40.725112,-73.981278,Manhattan,Bar,Dive Bar,29


In [5]:
# All venues of the city; these are the neighbours used to build the geo features.
venues_file = PATH + "venue_visits_city.csv"
venues = pd.read_csv(venues_file)
print(venues.shape)
venues.head()

(71554, 5)


Unnamed: 0,venue_id,lat,long,category,total_visits
0,3fd66200f964a52000e71ee3,40.733596,-74.003139,Jazz Club,86
1,3fd66200f964a52000e81ee3,40.758102,-73.975734,Gym,14
2,3fd66200f964a52000ea1ee3,40.732456,-74.003755,Indian Restaurant,2
3,3fd66200f964a52000f11ee3,40.652766,-74.003092,Bowling Alley,9
4,3fd66200f964a52001e51ee3,40.726961,-73.980039,Dive Bar,1


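Merge the bus-related venue categories (``Bus Station`` and ``Bus Line``) into a single ``Bus`` category; it is used for the traffic accessibility feature later.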

In [6]:
# Show every venue category whose name contains "bus" (case-insensitive).
bus_like = [cat for cat in venues["category"].unique() if "bus" in cat.lower()]
print("\n".join(bus_like))

Bus Station
Bus Line


In [7]:
# Show every venue category whose name contains "sub" (case-insensitive).
sub_like = [cat for cat in venues["category"].unique() if "sub" in cat.lower()]
print("\n".join(sub_like))

Subway


In [8]:
# Collapse both bus-related categories into the single label "Bus",
# then list the matching categories again to confirm the recoding.
is_bus = venues["category"].isin(["Bus Station", "Bus Line"])
venues.loc[is_bus, "category"] = "Bus"
bus_like = [cat for cat in venues["category"].unique() if "bus" in cat.lower()]
print("\n".join(bus_like))

Bus


Define all necessary geographic features.

In [9]:
def calc_rect_dist(lat, long, dy, dx):
    """Return the bounds of a box centred on ``(lat, long)``.

    ``dy`` and ``dx`` are offsets expressed in the same length unit as
    ``R_EARTH``. They are converted to angular offsets in degrees; the
    longitude offset is additionally divided by ``cos(lat)``.

    Returns:
        ``((lat_min, lat_max), (long_min, long_max))`` in degrees.
    """
    deg_per_rad = 180 / math.pi
    lat_offset = (dy / R_EARTH) * deg_per_rad
    long_offset = (dx / R_EARTH) * deg_per_rad / math.cos(lat * math.pi / 180)

    lat_bounds = (lat - lat_offset, lat + lat_offset)
    long_bounds = (long - long_offset, long + long_offset)
    return (lat_bounds, long_bounds)

def eucl_dist(vec_1, vec_2):
    """Return the Euclidean distance between two coordinate sequences.

    Components are paired with ``zip``, so surplus entries of the longer
    sequence are ignored.
    """
    squared_diffs = ((a - b) ** 2 for a, b in zip(vec_1, vec_2))
    return math.sqrt(sum(squared_diffs))

def get_closest(lat, long, _df):
    """Return the smallest Euclidean distance from ``(lat, long)`` to a row of ``_df``.

    The distance is computed directly on the ``lat`` / ``long`` column values,
    i.e. it is expressed in coordinate units.

    Args:
        lat: Latitude of the reference point.
        long: Longitude of the reference point.
        _df: DataFrame with numeric ``lat`` and ``long`` columns.

    Returns:
        The minimum distance as a ``float``.

    Raises:
        ValueError: If ``_df`` has no rows.
    """
    if _df.shape[0] == 0:
        raise ValueError("get_closest() needs at least one row to measure a distance to")

    # Column arithmetic replaces the row-wise ``apply`` with positional ``x[0]``
    # lookups, which pandas deprecates for label-indexed Series.
    squared = (_df["lat"] - lat) ** 2 + (_df["long"] - long) ** 2
    # sqrt is monotonic, so the minimum can be taken on the squared distances.
    return math.sqrt(squared.min())
    

def _access_fraction(lat, long, stops):
    """Return ``log2(len(stops) + 1) / log2(distance to the nearest stop)`` with degenerate cases guarded."""
    n_stops = stops.shape[0]
    if n_stops == 0:
        # log2(0 + 1) == 0, so the numerator vanishes; there is also no stop
        # to measure a distance to.
        return 0.0
    nearest = get_closest(lat, long, stops)
    if nearest <= 0:
        # log2(0) is undefined; the fraction tends to 0 as the distance goes to 0.
        return 0.0
    return log2(n_stops + 1) / log2(nearest)


def get_features(row, venues, categories, max_dist):
    """Add the geographic features of one venue to ``row`` and return it.

    Features written to ``row``:

    - ``<category>_count`` for every entry of ``categories`` (lower-cased,
      spaces replaced by underscores): number of venues of that category
      inside the search box.
    - ``area_density``: number of venues inside the search box.
    - ``area_entropy``: entropy (base 2) of the category shares inside the box.
    - ``competition``: negative share of venues in the box whose category
      equals ``row["category"]``; ``0.0`` when the box is empty.
    - ``traffic_access``: sum of the bus and the subway access fractions.
    - ``dist_center``: Euclidean distance of the venue to ``CITY_CENTER`` in
      coordinate units.

    Args:
        row: Series with at least ``lat``, ``long`` and ``category``.
        venues: DataFrame with ``lat``, ``long`` and ``category`` columns.
        categories: Iterable of category names to count.
        max_dist: Half side length of the search box, passed to
            ``calc_rect_dist`` for both directions.

    Returns:
        ``row`` with the feature entries added.

    Raises:
        ZeroDivisionError: If the nearest bus or subway venue is exactly one
            coordinate unit away (``log2(1) == 0``).
    """
    lat, long = row["lat"], row["long"]

    # Restrict the venues to the (open) box around the current venue.
    (lat_1, lat_2), (long_1, long_2) = calc_rect_dist(lat, long, max_dist, max_dist)
    in_box = (
        (lat_1 < venues["lat"]) & (venues["lat"] < lat_2)
        & (long_1 < venues["long"]) & (venues["long"] < long_2)
    )
    _df = venues[in_box]
    density = _df.shape[0]

    # Number of nearby venues per category: count once instead of building one
    # boolean mask per category.
    nearby_counts = _df["category"].value_counts()
    counts = []
    for cat in categories:
        cat_form = cat.lower().replace(" ", "_")
        count = int(nearby_counts.get(cat, 0))
        row[f"{cat_form}_count"] = count
        counts.append(count)

    # area_density
    row["area_density"] = density

    # area entropy; categories without nearby venues contribute nothing
    # (a positive count implies density > 0).
    row["area_entropy"] = -sum(
        (count / density) * log2(count / density) for count in counts if count > 0
    )

    # competition
    # NOTE(review): the previews show ``structured["category"]`` holding coarse
    # labels (e.g. "Bar") while ``venues["category"]`` matches
    # ``org_category`` (e.g. "Dive Bar") -- confirm which column was intended.
    n_same = _df[_df["category"] == row["category"]].shape[0]
    row["competition"] = -(n_same / density) if density > 0 else 0.0

    # traffic accessibility
    # NOTE(review): the stop counts are taken over all of ``venues``, not only
    # the search box -- confirm this is intended.
    bus_frac = _access_fraction(lat, long, venues[venues["category"] == "Bus"])
    sub_frac = _access_fraction(lat, long, venues[venues["category"] == "Subway"])
    row["traffic_access"] = bus_frac + sub_frac

    # distance to center
    row["dist_center"] = eucl_dist((lat, long), CITY_CENTER)

    return row

Process all venues and calculate the features.

In [10]:
# Register ``progress_apply`` on pandas objects so the row-wise pass shows a progress bar.
tqdm.pandas()

# Every distinct venue category yields one ``<category>_count`` feature column.
categories = venues["category"].unique()

venues_added = structured.progress_apply(
    get_features,
    axis=1,
    result_type="expand",
    args=(venues, categories, MAX_DIST),
)

  0%|          | 0/2836 [00:00<?, ?it/s]

Save structured + geo features.

In [11]:
# Use the long coordinate names in the exported file.
venues_added = venues_added.rename(columns={"lat": "latitude", "long": "longitude"})

# Write the structured data together with the geo features (without the index).
venues_added.to_csv(PATH + "structured_geo_features.csv", index=False)
print(venues_added.shape)
venues_added.head()

(2836, 434)


Unnamed: 0,venue_id,latitude,longitude,borough,category,org_category,total_visits,jazz_club_count,gym_count,indian_restaurant_count,...,cricket_ground_count,campaign_office_count,rock_climbing_spot_count,yogurt_count,volcano_count,area_density,area_entropy,competition,traffic_access,dist_center
0,3fd66200f964a52001e51ee3,40.726961,-73.980039,Manhattan,Bar,Dive Bar,1,0,0,0,...,0,0,0,0,0,79,5.301005,-0.101266,-2.349061,0.044945
1,3fd66200f964a52003e51ee3,40.724822,-73.981456,Manhattan,Bar,Dive Bar,15,0,1,0,...,0,0,0,0,0,102,5.620873,-0.04902,-2.443636,0.046575
2,3fd66200f964a52010e51ee3,40.727027,-73.982702,Manhattan,Bar,Dive Bar,14,0,0,0,...,0,0,0,0,0,184,5.954679,-0.076087,-2.05185,0.047595
3,3fd66200f964a52011e81ee3,40.762812,-73.967519,Manhattan,Bar,Dive Bar,18,0,3,1,...,0,0,0,0,0,229,6.029996,-0.0,-1.77675,0.045594
4,3fd66200f964a52018e51ee3,40.725112,-73.981278,Manhattan,Bar,Dive Bar,29,0,1,0,...,0,0,0,0,0,107,5.547164,-0.065421,-2.432629,0.046363
