In [11]:
import pandas as pd
from timezonefinder import TimezoneFinder
import pytz
from geopy.distance import geodesic
import hashlib

In [None]:
# Raw pokestop spin data arrives as several CSV batches; read each batch and
# stack them into a single frame with a fresh 0..n-1 index.
spin_batch_files = (
    'Pokestop_spin1.csv',
    'Pokestop_spin2.csv',
    'Pokestop_spin3.csv',
    'Pokestop_spin4.csv',
)
dfs = [pd.read_csv(batch_path) for batch_path in spin_batch_files]
pokestops_all = pd.concat(dfs, ignore_index=True)

# Parse the Timestamp column into UTC-aware datetimes. errors='coerce' turns
# values that cannot be parsed into NaT instead of raising.
parsed_timestamps = pd.to_datetime(pokestops_all['Timestamp'], utc=True, errors='coerce')
pokestops_all['Timestamp'] = parsed_timestamps

# Round fort coordinates to 6 decimals: they are used in the map viz, and the
# rounded values are also what the deduplication below compares.
for fort_coord in ('Fort_Latitude', 'Fort_Longitude'):
    pokestops_all[fort_coord] = pokestops_all[fort_coord].round(6)

# Deduplication step 1: keep one row per (timestamp, fort position).
dedup_keys = ['Timestamp', 'Fort_Latitude', 'Fort_Longitude']
pokestops = pokestops_all.drop_duplicates(subset=dedup_keys)



In [41]:
# Deduplication step 2: identify the double/multi-spins that happened at the
# same time. keep=False flags every row whose Timestamp occurs more than once,
# not just the later repeats.
shares_timestamp = pokestops.duplicated(subset=['Timestamp'], keep=False)
duplicates = pokestops.loc[shares_timestamp].sort_values('Timestamp')

# Report how many rows are involved and how many distinct timestamps they cover.
print(f"Found {len(duplicates)} entries with duplicate timestamps")
print(f"Unique timestamps affected: {duplicates['Timestamp'].nunique()}")

Found 2988 entries with duplicate timestamps
Unique timestamps affected: 1492


In [42]:
# Remove rows that share a Timestamp, keeping only the first occurrence in the
# current row order. Not ideal (the other spins at that instant are discarded)
# but it is the easiest way.
is_repeat_timestamp = pokestops.duplicated(subset=['Timestamp'], keep='first')
pokestops = pokestops.loc[~is_repeat_timestamp]


In [43]:

# Take an independent copy so the column assignments below write to this frame
# rather than to a slice of an earlier one.
pokestops = pokestops.copy()

# Determine the timezone per row from the player coordinates.
# NOTE(review): timezone_at presumably returns a timezone name string, or None
# when nothing matches; convert_to_local treats a missing value as "stay in UTC".
tf = TimezoneFinder()

player_positions = pokestops[['Player_Latitude', 'Player_Longitude']]
pokestops['player_timezone'] = player_positions.apply(
    lambda position: tf.timezone_at(
        lat=position['Player_Latitude'],
        lng=position['Player_Longitude'],
    ),
    axis=1,
)

#Convert UTC Timestamp to local time using the timezone

def convert_to_local(row):
    """Return the row's timestamp expressed in the player's timezone.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'player_timezone' (a timezone name, or a missing value)
        and 'Timestamp' (a tz-aware timestamp).

    Returns
    -------
    The 'Timestamp' value converted to the named timezone, or the unchanged
    'Timestamp' when 'player_timezone' is missing.
    """
    zone_name = row['player_timezone']
    utc_stamp = row['Timestamp']
    if not pd.isna(zone_name):
        return utc_stamp.tz_convert(pytz.timezone(zone_name))
    # No timezone resolved for this row: fall back to the original timestamp.
    return utc_stamp

# Local timestamp per row (rows without a resolved timezone keep their
# original timestamp, see convert_to_local).
pokestops['Timestamp_local'] = pokestops.apply(convert_to_local, axis=1)

# Extract local hour and weekday from each datetime object directly; missing
# timestamps yield None for both features.
local_stamps = pokestops['Timestamp_local']
pokestops['hour_local'] = local_stamps.apply(
    lambda stamp: None if pd.isna(stamp) else stamp.hour
)
pokestops['weekday_local'] = local_stamps.apply(
    lambda stamp: None if pd.isna(stamp) else stamp.strftime('%A')
)



In [45]:
# Compute distance between player and fort
def _spin_distance_m(row):
    """Return the geodesic distance, in meters, between the row's player and fort positions."""
    player_position = (row['Player_Latitude'], row['Player_Longitude'])
    fort_position = (row['Fort_Latitude'], row['Fort_Longitude'])
    return geodesic(player_position, fort_position).meters


pokestops['player_fort_distance_m'] = pokestops.apply(_spin_distance_m, axis=1)


# Create unique fort_id
def _fort_hash(row):
    """Return the MD5 hex digest of the row's "<latitude>_<longitude>" fort coordinate string."""
    coordinate_key = f"{row['Fort_Latitude']}_{row['Fort_Longitude']}"
    return hashlib.md5(coordinate_key.encode()).hexdigest()


# Rows with identical fort coordinates get the same fort_id.
pokestops['fort_id'] = pokestops.apply(_fort_hash, axis=1)

In [None]:
# Pull location information based on fort coordinates.
# Nominatim only supports 1 request per second, so the results are cached.

from geopy.geocoders import Nominatim
from time import sleep
import os

# Initialize the geocoder: a single Nominatim client shared by every
# get_location_info call below; it is constructed with the
# "pokemon_go_app" user_agent string.
geolocator = Nominatim(user_agent="pokemon_go_app")

def get_location_info(lat, lon):
    """Reverse-geocode one coordinate pair into city / country / address.

    Waits 1.1 seconds before every request, then asks the module-level
    ``geolocator`` for the English-language result at "<lat>, <lon>".

    Parameters
    ----------
    lat, lon
        Coordinates; they are formatted into the query string as-is.

    Returns
    -------
    dict
        Keys 'city', 'country' and 'address'. 'city' is the first non-empty
        value among the address fields city, town and village, otherwise
        whatever the municipality field holds. All three values are None when
        the geocoder returns nothing, the result carries no address, or any
        exception occurs (the exception is printed, not raised).
    """
    nothing_found = {'city': None, 'country': None, 'address': None}
    try:
        sleep(1.1)  # Respect rate limit
        location = geolocator.reverse(f"{lat}, {lon}", language='en')
        if not location:
            return nothing_found
        address = location.raw.get('address')
        if not address:
            return nothing_found

        # First populated settlement field wins; municipality is the last resort.
        settlement_fields = ('city', 'town', 'village')
        city = next(
            (address[field] for field in settlement_fields if address.get(field)),
            address.get('municipality'),
        )
        return {'city': city, 'country': address.get('country'), 'address': address}
    except Exception as e:
        print(f"Error geocoding {lat}, {lon}: {e}")
        return nothing_found

# Build the fort-coordinate -> city/country/address lookup, reusing the CSV
# cache and geocoding only the coordinates the cache does not contain yet.
cache_file = 'pokestop_locations_cache.csv'
coord_cols = ['Fort_Latitude', 'Fort_Longitude']

# Distinct fort positions present in the data. Rows with a missing coordinate
# are skipped because they cannot be geocoded.
unique_coords = pokestops[coord_cols].dropna().drop_duplicates()

if os.path.exists(cache_file):
    print("Loading cached location data...")
    # Coordinates are merge keys, so they must compare exactly equal to the
    # in-memory values: parse floats with the round-trip converter and
    # re-apply the same 6-decimal rounding used on the fort coordinates.
    cached_lookup = pd.read_csv(cache_file, float_precision='round_trip')
    cached_lookup[coord_cols] = cached_lookup[coord_cols].round(6)
    cached_lookup = cached_lookup.drop_duplicates(subset=coord_cols)
else:
    # No cache yet: start from an empty lookup with the expected columns.
    cached_lookup = pd.DataFrame(
        columns=['city', 'country', 'address'] + coord_cols
    ).astype({'Fort_Latitude': float, 'Fort_Longitude': float})

# Only coordinates absent from the cache need a request. Previously an existing
# cache file was trusted as complete, so new coordinates were never geocoded.
cached_keys = set(zip(cached_lookup['Fort_Latitude'], cached_lookup['Fort_Longitude']))
pending_coords = [
    (lat, lon)
    for lat, lon in zip(unique_coords['Fort_Latitude'], unique_coords['Fort_Longitude'])
    if (lat, lon) not in cached_keys
]

location_data = []
if pending_coords:
    print(f"Geocoding {len(pending_coords)} unique locations...")

try:
    for lat, lon in pending_coords:
        result = get_location_info(lat, lon)
        result['Fort_Latitude'] = lat
        result['Fort_Longitude'] = lon
        location_data.append(result)

        # Optional: Show progress
        if len(location_data) % 10 == 0:
            print(f"Processed {len(location_data)}/{len(pending_coords)} locations...")
finally:
    # Runs on normal completion and on interruption/error alike, so the
    # lookups finished so far (about one second each) are never thrown away.
    if location_data:
        new_lookup = pd.DataFrame(location_data)
        if len(cached_lookup):
            location_lookup = pd.concat([cached_lookup, new_lookup], ignore_index=True)
        else:
            location_lookup = new_lookup
        # NOTE: failed lookups are stored with empty city/country/address and
        # are not retried on later runs; delete those rows (or the cache file)
        # to retry them. 'address' is a dict for fresh results and a string
        # for rows loaded from the CSV.
        location_lookup.to_csv(cache_file, index=False)
        print(f"Saved {len(location_lookup)} locations to {cache_file}")
    else:
        location_lookup = cached_lookup

# Merge the location lookup back onto the spin records by fort coordinates.
fort_keys = ['Fort_Latitude', 'Fort_Longitude']

# Keep one lookup row per coordinate pair so the left merge can never
# multiply spin rows.
lookup_unique = location_lookup.drop_duplicates(subset=fort_keys)

# Drop location columns left behind by an earlier run of this cell. Without
# this, re-running the cell produces suffixed columns (city_x / city_y, ...)
# and later code that reads pokestops['country'] fails.
# NOTE(review): assumes the raw spin data has no columns of its own with the
# same names as the lookup columns (city, country, address) - confirm.
location_cols = [col for col in lookup_unique.columns if col not in fort_keys]
pokestops = pokestops.drop(columns=location_cols, errors='ignore').merge(
    lookup_unique,
    on=fort_keys,
    how='left'
)

print(f"Added city and country to {len(pokestops)} pokestop records")

In [None]:
# Taking a slice of data for the map viz demo: spins geocoded to Japan whose
# local timestamp lies strictly inside the demo window (both bounds exclusive).
demo_window_start = pd.Timestamp('2022-12-26', tz='Asia/Tokyo')
demo_window_end = pd.Timestamp('2022-12-29 16:00:00', tz='Asia/Tokyo')

in_japan = pokestops['country'] == 'Japan'
before_window_end = pokestops['Timestamp_local'] < demo_window_end
after_window_start = pokestops['Timestamp_local'] > demo_window_start

poke_jp = pokestops[in_japan & before_window_end & after_window_start].sort_values('Timestamp_local')


In [54]:
# Saving the processed data: the full processed frame and the Japan demo
# slice, each written as UTF-8 CSV without the index column.
csv_outputs = (
    (pokestops, 'pokestops.csv'),
    (poke_jp, 'hokkaido.csv'),
)
for frame, out_path in csv_outputs:
    frame.to_csv(out_path, index=False, encoding='utf-8')