# Visualizing Swarm Check-in Data

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import foursquare
import json
import csv
from datetime import timedelta 
import googlemaps #https://github.com/googlemaps/google-maps-services-python
gmaps = googlemaps.Client(key='')

## Connect to Foursquare API

In [2]:
# Authentication
# Application credentials and OAuth endpoints (fill in before running)
CLIENT_ID = ''
CLIENT_SECRET = ''
AUTHORIZATION_BASE_URL = 'https://foursquare.com/oauth2/authenticate'
TOKEN_URL = 'https://foursquare.com/oauth2/access_token'

# Construct the client object used by the following cells
client = foursquare.Foursquare(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    redirect_uri='https://liwilli.am/oauth/authorize',
)

# Build and display the authorization url for your app
auth_uri = client.oauth.auth_url()
print(auth_uri)

https://foursquare.com/oauth2/authenticate?client_id=VYWRXMFHL2YE1VLF1XORHPSW2AGEPCQ5C31VUIRSICKHMZ4D&response_type=code&redirect_uri=https%3A%2F%2Fliwilli.am%2Foauth%2Fauthorize


In [3]:
# Paste the value obtained after visiting the authorization url printed above
# NOTE(review): this looks like the OAuth `code` from the redirect rather than an access token — confirm
token = ''
access_token = client.oauth.get_token(token)

# Apply the returned access token to the client
client.set_access_token(access_token)

# Fetch the authenticated user's most recent data and profile information
# (`user` is not referenced again in the visible cells)
user = client.users()

## Data

In [4]:
# Fetch check-ins with a single `client.users.checkins()` call
# NOTE(review): presumably this returns only one page of results, unlike the
# generator-based call in the next cell — confirm against the foursquare client docs.
# The `all_checkins` variable is not referenced again in the visible cells.
all_checkins = client.users.checkins()

In [5]:
# Grab all checkins since account creation (returned lazily as a generator)
x = client.users.all_checkins()

# Convert generator object to list so the check-ins can be iterated in the parsing cell
# (this consumes the generator `x`)
lst = list(x)

In [6]:
# Parse each check-in for specific fields
def _dig(record, *path, default=''):
    """Return the value found by following ``path`` through ``record``.

    Key:item pairs don't consistently appear for each check-in, so a missing
    key, an out-of-range index, or a non-subscriptable intermediate value
    yields ``default`` instead of raising.

    Args:
        record: Nested dict/list structure describing one check-in.
        *path: Keys and/or list indexes to follow, outermost first.
        default: Value returned when any step of ``path`` cannot be resolved.

    Returns:
        The value at the end of ``path``, or ``default``.
    """
    current = record
    for step in path:
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            return default
    return current


# Lookup path for each output column, in the order expected by the DataFrame header
_CHECKIN_FIELD_PATHS = (
    ('createdAt',),                         # created_at
    ('timeZoneOffset',),                    # timezone_offset
    ('venue', 'name'),                      # venue
    ('venue', 'location', 'lat'),           # venue_lat
    ('venue', 'location', 'lng'),           # venue_lon
    ('venue', 'location', 'postalCode'),    # venue_postalcode
    ('venue', 'location', 'city'),          # venue_city
    ('venue', 'location', 'state'),         # venue_state
    ('venue', 'location', 'country'),       # venue_country
    ('venue', 'categories', 0, 'name'),     # venue_category (first category only)
)

# Aggregate to single list: one ten-element row per check-in
checkins = []
for checkin in lst:
    # Every unresolved field falls back to '' so a row never inherits a value
    # from the previous check-in and never fails on an unassigned name.
    checkins.append([_dig(checkin, *path) for path in _CHECKIN_FIELD_PATHS])

In [7]:
# Build the pandas dataframe: one row per parsed check-in, one column per field
header = [
    'created_at',
    'timezone_offset',
    'venue',
    'venue_lat',
    'venue_lon',
    'venue_postalcode',
    'venue_city',
    'venue_state',
    'venue_country',
    'venue_category',
]
df = pd.DataFrame(data=checkins, columns=header)

In [8]:
# Clean-up check-in created_at time
# Convert epoch time (interpreted as seconds via unit='s') to pandas datetimes, in place
df['created_at'] = pd.to_datetime(df['created_at'], unit='s')

# Shift a check-in timestamp so it is consistently expressed in local time
def fix_timezone(row):
    """Return the row's ``created_at`` shifted by its ``timezone_offset``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``'created_at'`` (a datetime-like value) and
            ``'timezone_offset'`` (an offset expressed in minutes).

    Returns:
        ``created_at`` plus ``timezone_offset`` minutes.
    """
    checkin_time = row['created_at']
    offset = timedelta(minutes=row['timezone_offset'])
    return checkin_time + offset

# Adjust timezones: apply fix_timezone row by row (axis = 1) and store the
# shifted timestamp in a new column, leaving created_at untouched
df['created_at_adjusted'] = df.apply(fix_timezone, axis = 1)

In [9]:
# Fill in blank venue_city with Google Maps geocoding
# City data doesn't appear in the same spot of the reverse_geocode result for
# every country, so the position of the city entry is recorded country by country.
_CITY_COMPONENT_INDEX = {
    'United States': 2,
    'South Korea': 4,
    'France': 2,
}


def fix_cityname(row):
    """Return the city for a check-in, reverse geocoding it only when blank.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``'venue_city'``, ``'venue_lat'``, ``'venue_lon'`` and
            ``'venue_country'``.

    Returns:
        The existing ``venue_city`` when it is neither ``''`` nor ``None``.
        Otherwise the ``long_name`` of the city component of the first
        reverse-geocode result: taken at the country-specific index when one
        is known, else from the first component typed ``'locality'``. When
        coordinates are missing or no city can be determined, the original
        (blank) ``venue_city`` value is returned unchanged.
    """
    existing_city = row['venue_city']
    if existing_city != '' and existing_city is not None:
        # Nothing to fix, so skip the (billable, slow) geocoding request entirely.
        return existing_city

    lat, lon = row['venue_lat'], row['venue_lon']
    if lat in ('', None) or lon in ('', None):
        # No coordinates to look up.
        return existing_city

    reverse_geocode_result = gmaps.reverse_geocode(str(lat) + ', ' + str(lon))
    if not reverse_geocode_result:
        return existing_city

    components = reverse_geocode_result[0].get('address_components', [])

    # Prefer the hand-picked position for countries that have one.
    index = _CITY_COMPONENT_INDEX.get(row['venue_country'])
    if index is not None and index < len(components):
        return components[index]['long_name']

    # Any other country (or a shorter-than-expected result): use the component
    # that Google tags as the locality, if there is one.
    for component in components:
        if 'locality' in component.get('types', ()):
            return component['long_name']

    return existing_city

In [10]:
# Replace venue_city with the result of fix_cityname, evaluated row by row (axis = 1)
df['venue_city'] = df.apply(fix_cityname, axis = 1)

In [11]:
# Parse out check-in times for hour and day of week for more granular charting
# Both columns are derived from the timezone-adjusted timestamp, not the raw created_at
df['created_at_adjusted_hour'] = df['created_at_adjusted'].dt.hour
# pandas numbers weekdays from Monday = 0 through Sunday = 6
df['created_at_adjusted_weekday'] = df['created_at_adjusted'].dt.weekday

In [12]:
# Count check-ins per venue, per city, and per venue category for charting.
# Each result is a Series of created_at counts, most frequent group first.
df_venue = df.groupby(['venue'])['created_at'].count().sort_values(ascending=False)
df_city = df.groupby(['venue_city'])['created_at'].count().sort_values(ascending=False)
df_category = df.groupby(['venue_category'])['created_at'].count().sort_values(ascending=False)

In [13]:
# Create csv for Tableau dashboard
# Writes the full check-in dataframe (including its index) to 'swarm data.csv'
# in the current working directory
df.to_csv('swarm data.csv')

## Visualizing the check-ins

In [49]:
# Plot the 10 venues with the most check-ins as a bar chart
# (df_venue is already sorted in descending order, so the first 10 entries are the top 10)
fig = go.Figure()
fig.add_trace(go.Bar(x = df_venue[:10].index, y = df_venue[:10]))
fig.update_layout(title = 'Top 10 venues', xaxis_title = 'Venue', yaxis_title = 'Number of check-ins')
fig.show()

In [50]:
# Plot the 10 cities with the most check-ins as a bar chart
# (df_city is already sorted in descending order, so the first 10 entries are the top 10)
fig = go.Figure()
fig.add_trace(go.Bar(x = df_city[:10].index, y = df_city[:10]))
fig.update_layout(title = 'Top 10 cities', xaxis_title = 'City', yaxis_title = 'Number of check-ins')
fig.show()

In [53]:
# Plot the 10 venue categories with the most check-ins as a bar chart
# (df_category is already sorted in descending order, so the first 10 entries are the top 10)
fig = go.Figure()
fig.add_trace(go.Bar(x = df_category[:10].index, y = df_category[:10]))
# The title names venue categories so it matches the data and the x-axis label
fig.update_layout(title = 'Top 10 venue categories', xaxis_title = 'Venue category', yaxis_title = 'Number of check-ins')
fig.show()

In [40]:
# Plot distribution by day of week
# Count rows per weekday; as_index=False keeps the weekday as a regular column for the x axis
by_day = df.groupby(by='created_at_adjusted_weekday', as_index=False).count()
fig = go.Figure()
# The created_at count column is used as the number of check-ins for each weekday
fig.add_trace(go.Scatter(y = by_day['created_at'], x = by_day['created_at_adjusted_weekday']))
fig.update_layout(title = 'Check-ins by day of week', xaxis_title = 'Weekday', yaxis_title = 'Number of check-ins')
fig.show()

In [35]:
# Plot distribution by hour
# Count rows per (timezone-adjusted) hour; as_index=False keeps the hour as a regular column for the x axis
by_hour = df.groupby(by='created_at_adjusted_hour', as_index=False).count()
fig = go.Figure()
# The created_at count column is used as the number of check-ins for each hour
fig.add_trace(go.Scatter(y = by_hour['created_at'], x = by_hour['created_at_adjusted_hour']))
fig.update_layout(title = 'Check-ins by time of day', xaxis_title = 'Hour', yaxis_title = 'Number of check-ins')
fig.show()