In [1]:
#countries geojson source = https://datahub.io/core/geo-countries#data
#states geojson source = https://public.opendatasoft.com/explore/dataset/us-state-boundaries/table/

In [2]:
import pandas as pd
import json
import geocoder
import geopandas as gpd
from datetime import datetime
from shapely.geometry import LineString, MultiPoint
from perpetual_functions import rangeSummary
from perpetual_functions import placeCounts

In [3]:
#READ IN DATA, FORMAT, AND SAVE TO CORRECT PLACES
#only columns 2-5 of the headerless raw csv are used; rows with none of them filled are dropped
log = pd.read_csv('lifelog_current.csv', header=None, usecols=[2,3,4,5], skip_blank_lines=True)
log.dropna(how="all", inplace=True)
log.columns = ['date','home','region','city']
#normalise dates to ISO strings so they can be written to json and re-parsed with '%Y-%m-%d'
log['date'] = pd.to_datetime(log['date']).dt.strftime('%Y-%m-%d')
#city labels become "city, region"; pairing the columns with zip avoids indexing a
#label-indexed row Series by integer position (deprecated in recent pandas)
log['city'] = [city + ', ' + region for region, city in zip(log['region'], log['city'])]
log.to_csv('lifelog_current_clean.csv', index=False)
#one json entry per row, keyed by the row's index label: (date, home, region, city)
log_dict = {row_label: tuple(row) for row_label, row in log.iterrows()}
with open('formatted_website/data/current_clean_json.json', 'w') as out_file:
    json.dump(log_dict, out_file)

In [4]:
#GENERATE CITY GEOJSON
#generate counts for city and create dataframe
#each value of city_counts is indexed as [0] -> 'days' and [1] -> 'prop' below
city_counts = placeCounts('1997-06-01', 'city', log)
del city_counts['n/a']
city_count_list = [(name, stats[0], stats[1]) for name, stats in city_counts.items()]
city_df = pd.DataFrame(city_count_list, columns=['city','days', 'prop'])

#geocode
longitude = []
latitude = []

for city in city_df['city']:
    query = city
    if 'Georgia' in city:
        #because using Georgia(State), Georgia(Country): the label is cut back to
        #"<city>, Georgia (" before it is sent to the geocoder
        query = city.split(',')[0] + ', Georgia ('
    result = geocoder.osm(query)
    if not result.ok:
        #fail with the offending query instead of an opaque "'NoneType' is not subscriptable"
        raise ValueError('geocoding failed for: ' + query)
    longitude.append(result.json['lng'])
    latitude.append(result.json['lat'])
city_df['longitude'] = longitude
city_df['latitude'] = latitude

#turn the coordinates into point geometry; the raw lon/lat columns are not exported
city_gdf = gpd.GeoDataFrame(city_df, geometry=gpd.points_from_xy(x=city_df['longitude'], y=city_df['latitude']))
city_gdf = city_gdf.drop(['longitude', 'latitude'], axis=1)

city_gdf.to_file('for_mapbox/current_points.geojson', driver='GeoJSON')

In [5]:
#GENERATE REGION GEOJSON
#send totals for each area to respective json files
region_counts = placeCounts('1997-06-01', 'region', log)
del region_counts['n/a']
#explicit name -> data mapping rather than eval() on a constructed variable name
totals_by_area = {'region': region_counts, 'city': city_counts}
for area, totals in totals_by_area.items():
    with open('formatted_website/data/'+area+'_totals.json', 'w') as file:
        json.dump(totals, file)

#prepare region data: stay data as well as polygons
region_stay_df = pd.DataFrame([(name, stats[0], stats[1]) for name, stats in region_counts.items()],
                              columns = ['region', 'days', 'prop'])
#per region: {city name: days} for every "city, region" label whose region part matches
region_stay_df['cities'] = [{label.split(', ')[0]: stats[0]
                             for label, stats in city_counts.items()
                             if label.split(', ')[1] == region}
                            for region in region_stay_df['region']]

#read in polygon data
countries = gpd.read_file('countries.geojson')
countries = countries[['ADMIN', 'geometry']].rename(columns={'ADMIN': 'name'})
states = gpd.read_file('us-state-boundaries.geojson')
states = states[['name', 'geometry']].copy()

#resolve the name issues
#value-based replacement: does not depend on the frames' index labels, and a name that is
#absent from the source file is simply left alone instead of raising ValueError
states['name'] = states['name'].replace({'Georgia': 'Georgia (State)'})
countries['name'] = countries['name'].replace({
    'Georgia': 'Georgia (Country)',
    'Macedonia': 'North Macedonia',
    'Republic of Serbia': 'Serbia',
})

#get rid of us territories and us, to replace with us polygons
#a country row is dropped when it is the US itself or shares its name with a row in states
state_names = set(states['name'])
duplicates = ['United States of America'] + [name for name in countries['name'] if name in state_names]
#mask by name rather than drop by list position, so this works for any index and for repeated names
countries = countries[~countries['name'].isin(duplicates)]

#combine countries, states, and personal stay data
#pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
regions = pd.concat([countries, states])
regions = regions.merge(region_stay_df, how='left', left_on='name', right_on='region')
regions = regions.drop('region', axis=1)
#regions with no stays get 0 in every stay column (days, prop, cities)
regions = regions.fillna(0)

#used to check for region name issues
known_names = set(regions['name'])
for j in region_stay_df['region']:
    if j not in known_names:
        print('uh oh: '+j)

#simplify so mapbox can handle the polygons and send to file
#note: regions_out is the same object as regions, so regions ends up simplified too
regions_out = regions
regions_out['geometry'] = regions['geometry'].simplify(tolerance=0.01)
regions_out.to_file('for_mapbox/current_regions.geojson', driver='GeoJSON')

In [6]:
#GENERATE LINE DATA
#keep only necessary columns and dates where there is a move
line_log = log[['city', 'date']].values.tolist()
#key days: the first and last log entries plus every day whose city differs from a neighbouring day
key_days = [line_log[0]]
for idx in range(1, len(line_log)-1):
    here = line_log[idx][0]
    if here != line_log[idx-1][0] or here != line_log[idx+1][0]:
        key_days.append(line_log[idx])
key_days.append(line_log[-1])

#calculate length of duration
#one entry per move: ((from, to), days between the first key day and the departure day, (departure, arrival))
origin_date = datetime.strptime(key_days[0][1], '%Y-%m-%d')
line_list = []
for (from_city, from_date), (to_city, to_date) in zip(key_days, key_days[1:]):
    if from_city != to_city:
        days_in = (datetime.strptime(from_date, '%Y-%m-%d') - origin_date).days
        line_list.append(((from_city, to_city), days_in, (from_date, to_date)))

#so locations can be easily indexed (guarded so re-running the cell does not fail)
if 'city' in city_df.columns:
    city_df = city_df.set_index('city')
#take point geometry from city_gdf: city_df is a plain DataFrame and only carries a 'geometry'
#column if the GeoDataFrame constructor happened to share storage with it
city_points = (city_gdf.set_index('city') if 'city' in city_gdf.columns else city_gdf)['geometry']

#generate gdf
line_rows = [(start, end, days_in, start_date, end_date,
              LineString([city_points.loc[start], city_points.loc[end]]))
             for (start, end), days_in, (start_date, end_date) in line_list]
line_gdf = gpd.GeoDataFrame(line_rows, columns = ['start_loc', 'end_loc', 'time_int', 'start_date',
                                                  'end_date', 'geometry'])
line_gdf.to_file('for_mapbox/current_line.geojson', driver='GeoJSON')

In [7]:
#GENERATE POINTS FOR EACH KEY DAY AND ALL CENTROIDS
#index by city for geometry lookups; guarded so re-running the cell does not raise KeyError
if 'city' in city_gdf.columns:
    city_gdf = city_gdf.set_index('city')

log_list = log[['date', 'city']].values.tolist()
first_day = datetime.strptime(log_list[0][0], '%Y-%m-%d')
#one (city, days since first log entry, point) tuple per logged day
points_days = [(city,
                (datetime.strptime(date, '%Y-%m-%d') - first_day).days,
                city_gdf.loc[city]['geometry'])
               for date, city in log_list]
point_geoseries = [x[2] for x in points_days]#for centroids
#keep the first and last day plus every day that is not surrounded by the same city on both sides
points_days = [points_days[0]] + [points_days[x] for x in range(1,len(points_days)-1) \
    if not(points_days[x][0] == points_days[x-1][0] and points_days[x][0] == points_days[x+1][0])] \
    + [points_days[-1]]

#send key points to file
points_days_gdf = gpd.GeoDataFrame(points_days, columns=['city', 'time_int', 'geometry'])
points_days_gdf.to_file('for_mapbox/current_all_points.geojson', driver='GeoJSON')

#running centroid: entry k is the centroid of the points for days 0..k (entry 0 is the first point itself)
centroid_list = [point_geoseries[0]]+[MultiPoint(point_geoseries[0:x+2]).centroid for x in range(len(point_geoseries)-1)]
centroid_gdf = gpd.GeoDataFrame(zip(log['date'], centroid_list), columns=['date', 'geometry'])
centroid_gdf.to_file('for_mapbox/current_centroids.geojson', driver='GeoJSON')

In [8]:
#GENERATE STAYS FOR PLACE BAR
#collapse consecutive days in the same city into (city, number of days, first date, last date)
stay_log = log[['date','city']]
stay_rows = stay_log.values.tolist()
compressed_list = []
same_count = 0
#take the first row by position: the log's index labels need not start at 0 after dropna
first_date = stay_rows[0][0]
last_date = ''
current_city = stay_rows[0][1]
for date, city in stay_rows:
    if city == current_city:
        same_count += 1
        last_date = date
    else:
        #city changed: close the previous stay and start a new one on this day
        compressed_list.append((current_city, same_count, first_date, last_date))
        current_city = city
        first_date = date
        last_date = date
        same_count = 1
compressed_list.append((current_city, same_count, first_date, stay_rows[-1][0]))#add last row
#json object keyed by stay order (0, 1, 2, ...)
compressed_dict = dict(enumerate(compressed_list))
with open('formatted_website/data/current_stays.json', 'w') as out_file:
    json.dump(compressed_dict, out_file)