In [1]:
import json
import pandas as pd
import numpy as np
from argparse import Namespace
from collections import Counter
import pickle
import os
import urllib
import networkx as nx

## Venezia

### Importing Useful Data

In [2]:
# Per-record metadata table (tab-separated). Later cells read its 'GEO_node',
# 'owner' and 'adjusted_week' columns.
GEO_metadata = pd.read_csv('Venezia/data_storage/GEO_metadata.csv', sep='\t', index_col='Unnamed: 0')

In [6]:
# Node table indexed by 'osmid'.
# NOTE(review): the name GEO_nodes is rebound further down to the array of unique
# GEO_metadata['GEO_node'] values; after that this DataFrame is no longer reachable.
GEO_nodes = pd.read_csv('Venezia/data_storage/GEO_nodes.csv', sep='\t').set_index('osmid')
GEO_nodes

Unnamed: 0_level_0,y,x,street_count,pr,bc,dg,cl,highway,geometry
osmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27178184,45.438197,12.335686,4,0.000197,0.208517,0.001368,0.005999,,POINT (12.3356863 45.4381974)
764403528,45.438128,12.335605,3,0.000175,0.025767,0.001026,0.005985,,POINT (12.3356054 45.4381284)
1675825096,45.438233,12.335728,3,0.000186,0.048636,0.001026,0.005990,,POINT (12.3357281 45.4382329)
8670969688,45.438255,12.335601,3,0.000111,0.207535,0.001026,0.005998,,POINT (12.3356013 45.4382549)
5395065019,45.438053,12.335932,4,0.000236,0.196443,0.001368,0.006001,,POINT (12.3359319 45.4380532)
...,...,...,...,...,...,...,...,...,...
9196476006,45.441902,12.307559,3,0.000119,0.000691,0.001026,0.004071,,POINT (12.3075592 45.4419018)
9196521176,45.438670,12.326683,1,0.000077,0.000000,0.000342,0.005886,,POINT (12.326683 45.4386701)
9221655954,45.467211,12.279144,1,0.000030,0.000000,0.000342,0.001992,,POINT (12.2791441 45.467211)
9379177140,45.432392,12.331032,1,0.000073,0.000000,0.000342,0.005220,,POINT (12.3310316 45.4323915)


In [4]:
# Pairwise user table. The two unnamed leading columns of the TSV are renamed to
# User0 / User1 (the two members of each pair).
social_links = (
    pd.read_csv('Venezia/data_storage/social_links.csv', sep='\t')
    .rename(columns={'Unnamed: 0': 'User0', 'Unnamed: 1': 'User1'})
)

In [5]:
social_links

Unnamed: 0,User0,User1,self,friend,interest,mutual
0,47954272@N06,47954272@N06,1,1,1.000000,True
1,47954272@N06,64148082@N02,0,False,0.000000,False
2,47954272@N06,186704588@N07,0,False,0.000000,False
3,47954272@N06,61033692@N00,0,False,0.000000,False
4,47954272@N06,69376724@N06,0,False,0.021277,False
...,...,...,...,...,...,...
108895,85941441@N04,144576996@N06,0,False,0.000000,False
108896,85941441@N04,127547040@N07,0,False,0.000000,False
108897,85941441@N04,87413031@N03,0,False,0.000000,False
108898,85941441@N04,112273443@N02,0,False,0.000000,False


In [11]:
# Node-to-node table of pairwise values (presumably distances, given the name and
# the zero diagonal). Rows are labelled by the 'Unnamed: 0' column, columns by the header.
# Units are not stated in this notebook — TODO confirm (a cut-off of 20 is applied later).
GEO_node_dist = pd.read_csv('Venezia/data_storage/GEO_node_dist.csv', sep='\t', index_col='Unnamed: 0')

In [12]:
GEO_node_dist

Unnamed: 0,7636418620,1851468091,7887753052,4437784469,27231707,1996193041,1863759288,245128972,3586376601,251147479,...,1833539390,248119073,248776130,1920433297,3586674306,944189412,1327380181,944189121,4471751053,271343269
7636418620,0.0,24.0,206.8,213.1,179.3,310.0,202.0,156.1,122.4,143.3,...,77.2,227.7,203.4,214.4,230.4,327.8,372.2,322.1,69.9,178.1
1851468091,24.0,0.0,190.0,190.6,160.3,287.5,179.5,133.6,116.8,120.8,...,71.6,208.7,180.9,191.9,207.9,305.3,380.3,299.6,62.0,155.6
7887753052,206.8,190.0,0.0,43.5,117.3,168.2,32.4,60.4,88.8,75.4,...,154.4,102.0,71.3,78.0,62.0,159.4,570.3,153.7,149.3,53.9
4437784469,213.1,190.6,43.5,0.0,106.1,135.0,11.1,60.5,100.4,76.0,...,166.0,73.2,48.2,45.2,21.7,119.1,570.9,113.4,160.9,71.2
27231707,179.3,160.3,117.3,106.1,0.0,152.3,106.3,60.0,142.3,79.0,...,207.9,49.2,60.9,77.1,116.2,202.9,535.6,197.2,177.4,88.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944189412,327.8,305.3,159.4,119.1,202.9,185.8,127.0,175.2,216.3,190.7,...,281.9,153.7,150.4,132.4,105.4,0.0,685.6,5.7,276.8,187.1
1327380181,366.8,373.3,563.3,563.9,528.6,660.8,552.8,506.9,485.1,494.1,...,439.9,577.0,554.2,565.2,581.2,678.6,0.0,672.9,432.6,528.9
944189121,322.1,299.6,153.7,113.4,197.2,180.1,121.3,169.5,210.6,185.0,...,276.2,148.0,144.7,126.7,99.7,5.7,679.9,0.0,271.1,181.4
4471751053,69.9,62.0,149.3,160.9,177.4,271.3,149.8,117.4,64.9,104.6,...,51.1,195.4,164.7,175.7,179.4,276.8,439.6,271.1,0.0,121.5


## Sparse Matrix Version

In [8]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to expose
# `scipy.sparse`, which the cells below use. This statement also binds the name `scipy`.
import scipy.sparse

In [9]:
len(GEO_metadata['GEO_node'].unique())

935

In [10]:
# Distinct node ids referenced by the metadata, in order of first appearance.
# NOTE(review): this rebinds GEO_nodes, which until here held the node DataFrame
# loaded from GEO_nodes.csv.
GEO_nodes = GEO_metadata['GEO_node'].unique()

In [78]:
# Node id -> its position along the node axis of the node-level matrices.
GEO_dict = {node_id: position for position, node_id in enumerate(GEO_nodes)}

In [12]:
# Restrict the table to the nodes that occur in the metadata, in GEO_nodes order.
# Row labels are matched on the ids as they are; column labels on their string form.
GEO_node_dist_s = GEO_node_dist.loc[GEO_nodes, GEO_nodes.astype(str)]

In [71]:
# Force symmetry by averaging every entry with its mirror entry; the raw table is
# not exactly symmetric (e.g. 372.2 one way vs 366.8 the other in the preview above).
_dist = GEO_node_dist_s.to_numpy()
GEO_node_dist_sym = (_dist + _dist.T) / 2

In [72]:
GEO_node_dist_sym

array([[  0. ,  24. , 206.8, ..., 322.1,  69.9, 178.1],
       [ 24. ,   0. , 190. , ..., 299.6,  62. , 155.6],
       [206.8, 190. ,   0. , ..., 153.7, 149.3,  53.9],
       ...,
       [322.1, 299.6, 153.7, ...,   0. , 271.1, 181.4],
       [ 69.9,  62. , 149.3, ..., 271.1,   0. , 121.5],
       [178.1, 155.6,  53.9, ..., 181.4, 121.5,   0. ]])

In [74]:
# Linear decay kernel on the symmetrised table: weight 1 at value 0, falling
# linearly to 0 at the cut-off, and 0 beyond it.
SPA_RADIUS = 20  # cut-off, in the units of GEO_node_dist (not stated here — TODO confirm)
# np.where selects 0.0 outright for entries that are not below the cut-off. The
# earlier multiply-by-mask form produced NaN for NaN/inf entries (NaN * 0, -inf * 0),
# and those NaNs would then be stored in the sparse matrix built from this array.
SPA_dense = np.where(
    GEO_node_dist_sym < SPA_RADIUS,
    (SPA_RADIUS - GEO_node_dist_sym) / SPA_RADIUS,
    0.0,
)

In [75]:
# Sparse (CSR) copy of the node-by-node kernel; only the non-zero weights are stored.
SPA_sparse = scipy.sparse.csr_matrix(SPA_dense)

In [76]:
SPA_sparse.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [16]:
# Distinct owner ids in order of first appearance; their positions define the
# user axis of the user-level matrices built below.
owners = GEO_metadata['owner'].unique()

In [17]:
# Relationship score per user pair: a self pair contributes 2, the friend flag 1,
# and an interest score above 0.05 contributes 1; the total is divided by 3.
is_self = social_links['self'].astype(int)
# Compare on the string form so the flag is recognised whether read_csv delivered
# the text 'True' or a real boolean. A plain `== 'True'` silently misses boolean
# True, which can appear when the column's type is inferred chunk by chunk.
is_friend = (social_links['friend'].astype(str) == 'True').astype(int)
has_interest = (social_links['interest'] > 0.05).astype(int)
social_links['relationship'] = (2 * is_self + is_friend + has_interest) / 3

In [18]:
social_links['relationship'].value_counts()

0.000000    106358
0.333333      2126
1.000000       330
0.666667        86
Name: relationship, dtype: int64

In [19]:
# Keep only the pairs with a positive relationship score.
has_link = social_links['relationship'] > 0
social_links = social_links.loc[has_link]

In [20]:
# Owner id -> its position along the user axis.
user_dict = {owner: position for position, owner in enumerate(owners)}

In [21]:
# Coordinate triplets for the user-by-user matrix: the positions of both users of
# each pair and the pair's score. Looking ids up through user_dict.__getitem__
# raises KeyError for any user that is missing from `owners`.
row = social_links['User0'].map(user_dict.__getitem__).to_numpy()
col = social_links['User1'].map(user_dict.__getitem__).to_numpy()
data = social_links['relationship'].to_numpy(copy=True)

In [22]:
# Assemble the (users x users) score matrix from the triplets, then convert to CSR.
n_users = len(owners)
SOC_coo = scipy.sparse.coo_matrix((data, (row, col)), shape=(n_users, n_users))
SOC_sparse = SOC_coo.tocsr()

In [23]:
SOC_sparse.toarray()

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.33333333,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.33333333, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [24]:
# Distinct 'adjusted_week' values in ascending order; a value's position in this
# list is used as its time index below.
times = sorted(GEO_metadata['adjusted_week'].unique())

In [25]:
# 'adjusted_week' value -> its position along the time axis.
time_dict = {week: position for position, week in enumerate(times)}

In [26]:
from scipy.sparse import diags

In [27]:
# Kernel over time indices: weight 1 for the same index, 0.5 for the index
# immediately before or after, 0 otherwise.
# NOTE(review): "neighbouring" means adjacent entries of the sorted `times` list, so
# two weeks separated by a stretch with no records still get 0.5 — confirm this is intended.
n_times = len(times)
TEM_sparse = diags(
    [0.5, 1, 0.5],
    offsets=[-1, 0, 1],
    shape=(n_times, n_times),
).tocsr()

In [28]:
TEM_sparse.toarray()

array([[1. , 0.5, 0. , ..., 0. , 0. , 0. ],
       [0.5, 1. , 0.5, ..., 0. , 0. , 0. ],
       [0. , 0.5, 1. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 1. , 0.5, 0. ],
       [0. , 0. , 0. , ..., 0.5, 1. , 0.5],
       [0. , 0. , 0. , ..., 0. , 0.5, 1. ]])

In [79]:
# Node-axis position of the node attached to each metadata record
# (KeyError if a record's node is missing from GEO_dict).
SPA = GEO_metadata['GEO_node'].map(GEO_dict.__getitem__)
SPA

0         0
1         0
2         1
3         2
4         3
       ... 
2946    278
2947    461
2948    459
2949    932
2950    930
Name: GEO_node, Length: 2951, dtype: int64

In [80]:
# Indicator matrix (nodes x records): entry (k, j) is 1 when record j is attached to node k.
n_records = len(SPA)
SPA_enc = scipy.sparse.coo_matrix(
    (np.ones(n_records), (SPA.to_numpy(), np.arange(n_records))),
    shape=(len(GEO_nodes), n_records),
).tocsr()

In [31]:
SPA_enc.toarray()

array([[1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
# User-axis position of the owner of each metadata record
# (KeyError if an owner is missing from user_dict).
SOC = GEO_metadata['owner'].map(user_dict.__getitem__)
SOC

0         0
1         0
2         0
3         1
4         2
       ... 
2946    136
2947    136
2948    136
2949    136
2950    136
Name: owner, Length: 2951, dtype: int64

In [33]:
# Indicator matrix (users x records): entry (u, j) is 1 when record j belongs to user u.
n_records = len(SOC)
SOC_enc = scipy.sparse.coo_matrix(
    (np.ones(n_records), (SOC.to_numpy(), np.arange(n_records))),
    shape=(len(owners), n_records),
).tocsr()

In [34]:
# Time-axis position of each metadata record's 'adjusted_week'
# (KeyError if a value is missing from time_dict).
TEM = GEO_metadata['adjusted_week'].map(time_dict.__getitem__)
TEM

0       254
1       254
2       254
3       164
4       268
       ... 
2946    234
2947    234
2948    234
2949    234
2950    234
Name: adjusted_week, Length: 2951, dtype: int64

In [35]:
# Indicator matrix (time indices x records): entry (t, j) is 1 when record j falls at time index t.
n_records = len(TEM)
TEM_enc = scipy.sparse.coo_matrix(
    (np.ones(n_records), (TEM.to_numpy(), np.arange(n_records))),
    shape=(len(times), n_records),
).tocsr()

In [81]:
# Lift the node-level kernel to record level:
# (records x nodes) @ (nodes x nodes) @ (nodes x records) -> (records x records).
# `@` is the explicit matrix product (what `*` means for these sparse matrices).
A_SPA = SPA_enc.T @ SPA_sparse @ SPA_enc

In [82]:
A_SPA

<2951x2951 sparse matrix of type '<class 'numpy.float64'>'
	with 445779 stored elements in Compressed Sparse Column format>

In [83]:
A_SPA.toarray()

array([[1.   , 1.   , 0.   , ..., 0.   , 0.   , 0.   ],
       [1.   , 1.   , 0.   , ..., 0.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   , ..., 0.   , 0.   , 0.   ],
       ...,
       [0.   , 0.   , 0.   , ..., 1.   , 0.3  , 0.405],
       [0.   , 0.   , 0.   , ..., 0.3  , 1.   , 0.715],
       [0.   , 0.   , 0.   , ..., 0.405, 0.715, 1.   ]])

In [39]:
# Lift the user-level scores to record level:
# (records x users) @ (users x users) @ (users x records) -> (records x records).
# `@` is the explicit matrix product (what `*` means for these sparse matrices).
A_SOC = SOC_enc.T @ SOC_sparse @ SOC_enc

In [89]:
A_SOC

<2951x2951 sparse matrix of type '<class 'numpy.float64'>'
	with 488103 stored elements in Compressed Sparse Column format>

In [41]:
# Lift the time-index kernel to record level:
# (records x times) @ (times x times) @ (times x records) -> (records x records).
# `@` is the explicit matrix product (what `*` means for these sparse matrices).
A_TEM = TEM_enc.T @ TEM_sparse @ TEM_enc

In [42]:
A_TEM

<2951x2951 sparse matrix of type '<class 'numpy.float64'>'
	with 501191 stored elements in Compressed Sparse Column format>

In [85]:
# Binary union of the three record-level matrices: 1 wherever at least one of the
# social, temporal or spatial weights is positive, 0 elsewhere.
any_positive = (A_SOC > 0) + (A_TEM > 0) + (A_SPA > 0)
A_simp = (any_positive > 0).astype(int)

In [86]:
A_simp

<2951x2951 sparse matrix of type '<class 'numpy.int64'>'
	with 1071977 stored elements in Compressed Sparse Column format>

In [49]:
A_simp.toarray()

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1]])

In [90]:
# Persist the social record-level matrix. The target folder is created first so
# that this save (and the three that follow it) does not fail on a fresh checkout.
os.makedirs('dataset_np/Venice', exist_ok=True)
scipy.sparse.save_npz('dataset_np/Venice/A_SOC.npz', A_SOC)

In [91]:
# Persist the spatial record-level matrix in .npz format.
scipy.sparse.save_npz('dataset_np/Venice/A_SPA.npz', A_SPA)

In [92]:
# Persist the temporal record-level matrix in .npz format.
scipy.sparse.save_npz('dataset_np/Venice/A_TEM.npz', A_TEM)

In [93]:
# Persist the binary union matrix in .npz format.
scipy.sparse.save_npz('dataset_np/Venice/A_simp.npz', A_simp)