In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(context='notebook', font='Fira Sans', style='white', palette='plasma')

In [2]:
import pandas as pd
import geopandas as gpd

In [4]:
checkins = pd.read_csv('output/relevant_check_ins.csv.gz')
checkins.head()

Unnamed: 0.1,Unnamed: 0,user_id,venue_id,datetime,utc_offset
0,130,1597907,4d9b7366b2aaa093259f7082,Tue Apr 03 18:00:43 +0000 2012,120
1,278,106998,4adcda60f964a520934421e3,Tue Apr 03 18:01:23 +0000 2012,120
2,377,1101077,4bac7462f964a520d4f53ae3,Tue Apr 03 18:01:47 +0000 2012,120
3,382,1218854,4adcda60f964a520934421e3,Tue Apr 03 18:01:49 +0000 2012,120
4,433,1648478,4dfc7683227185f38b94d41d,Tue Apr 03 18:02:02 +0000 2012,120


In [5]:
venues = gpd.read_file('output/relevant_pois.json')
venues.head()

Unnamed: 0,venue_id,category,index_right,geometry
0,4ad0e340f964a520c7da20e3,Southern / Soul Food Restaurant,2192,POINT (2.15161 41.39223)
1,4adcda4bf964a520873f21e3,Hotel,2192,POINT (2.15006 41.39436)
2,4adcda4cf964a520903f21e3,Hotel,2192,POINT (2.15351 41.39404)
3,4adcda4df964a520084021e3,Cocktail Bar,2192,POINT (2.15427 41.39277)
4,4adcda4df964a520164021e3,Beer Garden,2192,POINT (2.15316 41.39503)


In [7]:
checkins_grid = checkins.merge(venues[['venue_id', 'index_right']]).sort_values('user_id').drop('utc_offset', axis=1)
checkins_grid.head()

Unnamed: 0.1,Unnamed: 0,user_id,venue_id,datetime,index_right
169362,321008,11,4bf24abb99d02d7feb0eca48,Wed Feb 27 19:04:51 +0000 2013,2177
169387,337406,19,512bb246e4b0447bba28bce6,Wed Feb 27 20:49:55 +0000 2013,2257
55499,369298,19,4b76f383f964a520ce6d2ee3,Thu Feb 28 00:23:42 +0000 2013,2236
124467,369286,19,4bcc52eb3740b713efc46365,Mon Feb 25 08:10:09 +0000 2013,2699
43811,36624,19,4b241c53f964a5204e6124e3,Mon Feb 25 20:05:33 +0000 2013,2236


In [9]:
def shift(df):
    origin = df.rename({'venue_id': 'origin'}, axis=1)[['origin', 'user_id']]
    destination = df.rename({'venue_id': 'destination', 'user_id': 'user_id_d'}, axis=1)[['destination', 'user_id_d']].shift()
    trips = (origin.join(destination)
             .dropna()
             .pipe(lambda x: x[x.user_id == x.user_id_d])
             .groupby(['user_id', 'origin', 'destination'])
             .size()
            )#.dropna().groupby(['origin', 'destination']).size()
    trips.name = 'n_trips'
    return trips#.reset_index()

shift(checkins_grid.head(100))

user_id  origin                    destination             
19       4adcda60f964a520994421e3  4b241c53f964a5204e6124e3    1
         4b241c53f964a5204e6124e3  4bcc52eb3740b713efc46365    1
         4b76f383f964a520ce6d2ee3  512bb246e4b0447bba28bce6    1
         4b7980a2f964a5202bfd2ee3  4adcda60f964a520994421e3    1
         4bcc52eb3740b713efc46365  4b76f383f964a520ce6d2ee3    1
                                                              ..
1180     4b80be31f964a5209a8930e3  4adcda50f964a520544121e3    1
1287     4adcda4df964a520174021e3  4cea74b8e888f04d743b4d6b    1
         4ba8b467f964a5206ce839e3  4cea74b8e888f04d743b4d6b    1
         4cea74b8e888f04d743b4d6b  4ba8b467f964a5206ce839e3    1
1379     4c34c4b6213c2d7f680b395d  4f98899de4b0d3232985d4b5    1
Name: n_trips, Length: 71, dtype: int64

In [11]:
user_trip_counts = shift(checkins_grid).reset_index()
user_trip_counts.head()

Unnamed: 0,user_id,origin,destination,n_trips
0,19,4adcda60f964a520994421e3,4b241c53f964a5204e6124e3,1
1,19,4b241c53f964a5204e6124e3,4bcc52eb3740b713efc46365,1
2,19,4b76f383f964a520ce6d2ee3,512bb246e4b0447bba28bce6,1
3,19,4b7980a2f964a5202bfd2ee3,4adcda60f964a520994421e3,1
4,19,4bcc52eb3740b713efc46365,4b76f383f964a520ce6d2ee3,1


In [12]:
user_trip_counts.shape

(129340, 4)

In [14]:
user_trips_grid = (user_trip_counts
.join(venues[['venue_id', 'index_right']].set_index('venue_id'), on='origin')
.rename({'index_right': 'origin_cell_id'}, axis=1)
.join(venues[['venue_id', 'index_right']].set_index('venue_id'), on='destination')
.rename({'index_right': 'destination_cell_id'}, axis=1))
user_trips_grid.head()

Unnamed: 0,user_id,origin,destination,n_trips,origin_cell_id,destination_cell_id
0,19,4adcda60f964a520994421e3,4b241c53f964a5204e6124e3,1,2176,2236
1,19,4b241c53f964a5204e6124e3,4bcc52eb3740b713efc46365,1,2236,2699
2,19,4b76f383f964a520ce6d2ee3,512bb246e4b0447bba28bce6,1,2236,2257
3,19,4b7980a2f964a5202bfd2ee3,4adcda60f964a520994421e3,1,2910,2176
4,19,4bcc52eb3740b713efc46365,4b76f383f964a520ce6d2ee3,1,2699,2236


In [17]:
flows = (user_trips_grid
         [user_trips_grid.origin_cell_id != user_trips_grid.destination_cell_id]
         .groupby(['origin_cell_id', 'destination_cell_id'])
         ['n_trips']
         .sum()
         .reset_index()
         .pipe(lambda x: x[x.n_trips > 5])
         .rename({'origin_cell_id': 'origin', 'destination_cell_id': 'dest', 'n_trips': 'count'}, axis=1)
        )
flows.head()

Unnamed: 0,origin,dest,count
468,45,48,6
559,45,2821,7
562,45,2857,7
588,46,45,7
806,68,476,9


In [19]:
flows['count'].describe()

count    4712.000000
mean       17.432725
std        44.162166
min         6.000000
25%         7.000000
50%         9.000000
75%        15.000000
max      1673.000000
Name: count, dtype: float64

In [32]:
flows.to_csv('output/flow_magnitudes.csv', index=False)

In [27]:
grid = gpd.read_file('output/relevant_grid.geo.json')
grid.head()

Unnamed: 0,s2_cellid,index_right,geometry
0,1343054926502166528,9,"POLYGON ((1.97837 41.25890, 1.98326 41.25882, ..."
1,1343054935092101120,9,"POLYGON ((1.97347 41.25899, 1.97837 41.25890, ..."
2,1343054960861904896,9,"POLYGON ((1.96858 41.25907, 1.97347 41.25899, ..."
3,1343054969451839488,9,"POLYGON ((1.96369 41.25915, 1.96858 41.25907, ..."
4,1343054978041774080,9,"POLYGON ((1.95880 41.25923, 1.96369 41.25915, ..."


In [36]:
flow_grid = grid[grid.index.isin(flows.origin) | grid.index.isin(flows.dest)]
flow_grid.shape, grid.shape

((587, 3), (3846, 3))

In [41]:
centroids = pd.DataFrame({'x': flow_grid.geometry.centroid.x, 'y': flow_grid.geometry.centroid.y})

In [42]:
centroids.index.name = 'id'

In [43]:
centroids['name'] = centroids.index.values

In [44]:
centroids[['name', 'y', 'x']].rename({'x': 'lon', 'y': 'lat'}, axis=1).to_csv('output/flow_locations.csv')