In [1]:
import numpy as np
from scipy.io import loadmat
from tqdm import tqdm
from scipy.io import savemat

In [2]:
data_names = ['hongzhi', 'Istanbul', 'Jakarta', 'KualaLampur', 'NYC', 'SaoPaulo', 'TKY']
data_inprefix = 'dataset/cleaned_'
data_insufix = '.mat'

data_outprefix = 'input/'
data_outsufix = '_friendPOI.csv'

In [10]:
locations = set()
for ele in data_names:
    in_path = '{}{}{}'.format(data_inprefix, ele, data_insufix)
    print(in_path)
    in_data = loadmat(in_path)
    old_friends = in_data['friendship_old']
    selected_checkins = in_data['selected_checkins']
    new_friends0 = selected_checkins[:,0].reshape(-1, 1)
    new_friends1 = selected_checkins[:,2].reshape(-1, 1)
    location = np.unique(new_friends1)
    
    new_location = [max(np.max(old_friends), np.max(new_friends0)) + i + 1 for i in range(len(location))]
    new_location = np.array(new_location)
    location_dict = {location[i]: new_location[i] for i in range(len(location))}
    new_friends1 = np.array([location_dict[new_friends1[i][0]] for i in range(len(new_friends1))]).reshape(-1, 1)
    
    new_friends = np.concatenate((new_friends0, new_friends1), axis=1)
    print("before: {}".format(len(new_friends)))
    new_friends = np.unique(new_friends, axis=0)
    print("after: {}".format(len(new_friends)))
    
    with open('{}location_{}'.format(data_outprefix, ele), 'w', encoding='utf-8') as file:
        for elee in new_location:
            file.write('{}\n'.format(elee))
    print(old_friends.shape, new_friends.shape)
    final_friends = np.concatenate((old_friends, new_friends))
    print("Num location: {}, num friend: {}".format(len(new_location), len(final_friends)))
    
    with open('{}{}{}'.format(data_outprefix, ele, data_outsufix), 'w', encoding='utf-8') as file:
        file.write('id1,id2\n')
        for i in range(len(final_friends)):
            this = final_friends[i]
            source = this[0]
            target = this[1]
            file.write('{},{}\n'.format(source, target))
            
    with open('Suhi_output/location_dict_{}'.format(ele), 'w', encoding='utf-8') as file:
        for key, value in location_dict.items():
            file.write('{}\t{}\n'.format(key, value))

dataset/cleaned_hongzhi.mat
before: 255557
after: 105327
(7316, 2) (105327, 2)
Num location: 13105, num friend: 112643
dataset/cleaned_Istanbul.mat
before: 908162
after: 399689
(21354, 2) (399689, 2)
Num location: 12693, num friend: 421043
dataset/cleaned_Jakarta.mat
before: 378559
after: 152917
(11207, 2) (152917, 2)
Num location: 8826, num friend: 164124
dataset/cleaned_KualaLampur.mat
before: 526405
after: 233500
(16161, 2) (233500, 2)
Num location: 10817, num friend: 249661
dataset/cleaned_NYC.mat
before: 105961
after: 48840
(8723, 2) (48840, 2)
Num location: 3628, num friend: 57563
dataset/cleaned_SaoPaulo.mat
before: 249839
after: 74443
(9655, 2) (74443, 2)
Num location: 6286, num friend: 84098
dataset/cleaned_TKY.mat
before: 699324
after: 245275
(37480, 2) (245275, 2)
Num location: 10856, num friend: 282755


In [6]:
NYC_data_path = 'dataset/dataset_connected_NYC.mat'

In [7]:
data = loadmat(NYC_data_path)

In [8]:
selected_checkins = data['selected_checkins']

In [9]:
selected_checkins

array([[   1801,      39,   17807,      70],
       [    105,      39,    5708,      33],
       [    571,      39,    3968,     300],
       ...,
       [    718,      59,   12596,      70],
       [   1446,      59,  104201,     279],
       [   2305,      60, 1692889,     190]], dtype=int32)

In [5]:
old_friendship = data['friendship_old']
new_friendship = data['friendship_new']
friendship = np.concatenate((old_friendship, new_friendship), axis=0)

In [6]:
friendship

array([[   1,    2],
       [   1,   24],
       [   1,   30],
       ...,
       [3959, 3965],
       [3970, 4000],
       [3981, 3991]], dtype=uint16)

In [9]:
def colocation_friendship_statistics(friends, checkins):
    def get_checkin_from_user(user, checkins):
        locations = set()
        for i in range(len(checkins)):
            if checkins[i][0] == user:
                locations.add(checkins[i][2])
        return locations
    
    def jaccard_similarity(set1, set2):
        inter = set1.intersection(set2)
        uni = set1.union(set2)
        if len(uni) == 0:
            return 0
        return len(inter) / len(uni)
    
    simis = []
    
    for i in tqdm(range(len(friends))):
        source, target = friends[i][0], friends[i][1]
        source_checkins = get_checkin_from_user(source, checkins)
        target_checkins = get_checkin_from_user(target, checkins)        
        simis.append(jaccard_similarity(source_checkins, target_checkins))
    return np.mean(simis)

In [10]:
colocation_friendship_statistics(friendship, selected_checkins)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19268/19268 [56:47<00:00,  5.65it/s]


0.02626099581260058

In [11]:
map_dict = 'Suhi_output/ego_net_NYC.txt'
edge_list = 'Suhi_output/edgelist_NYC'

In [12]:
maps = dict()
with open(map_dict, 'r', encoding='utf-8') as file:
    for line in file:
        data_line = line.strip().split(',')
        maps[data_line[0]] = data_line[1]


In [45]:
user_checkins_dict = dict()
for i in range(len(selected_checkins)):
    user = selected_checkins[i][0]
    venue = selected_checkins[i][2]
    if user not in user_checkins_dict:
        user_checkins_dict[user] = set([venue])
    else:
        user_checkins_dict[user].add(venue)

In [46]:
len(user_checkins_dict.keys())

3785

In [47]:
max(list(user_checkins_dict))

4024

In [15]:
import networkx as nx
ego_net =nx.read_edgelist(edge_list)

In [48]:
def get_checkin_from_user(user, checkins):
    try:
        return user_checkins_dict[int(user)]
    except:
#         print(user)
        return set()

def jaccard_similarity(set1, set2):
    inter = set1.intersection(set2)
    uni = set1.union(set2)
    if len(uni) == 0:
        return 0
    return len(inter) / len(uni)


def colocation_friendship_statistics_for_ego(ego_net, checkins, maps):
    scores = []
    count = 0
    for node in tqdm(ego_net.nodes()):
        neighbors = ego_net.neighbors(node)
#         print(neighbors)
        if len(neighbors) > 1:
            for i in range(len(neighbors) - 1):
                for j in range(i + 1, len(neighbors)):
                    if ego_net.has_edge(neighbors[i], neighbors[j]):
                        node_i = int(maps[neighbors[i]])
                        node_j = int(maps[neighbors[j]])
                        location_i = get_checkin_from_user(node_i, checkins)
                        location_j = get_checkin_from_user(node_j, checkins)
                        score = jaccard_similarity(location_i, location_j)
                        scores.append(score)
        count += 1
        
    return np.mean(scores)

In [49]:
colocation_friendship_statistics_for_ego(ego_net, selected_checkins, maps)




  0%|                                                                                                                                                              | 0/11412 [00:00<?, ?it/s][A[A[A


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11412/11412 [00:00<00:00, 70632.73it/s][A[A[A


0.026604663944231395

In [2]:
friendship_old = np.load('CA Dataset/old_friendship_new.npy')
friendship_new = np.load('CA Dataset/new_friendship_new.npy')
selected_checkins = np.load('CA Dataset/selected_checkins_new.npy')

dictt = {'friendship_old': friendship_old, 'friendship_new': friendship_new, 'selected_checkins': selected_checkins}

In [3]:
savemat('dataset/dataset_connected_hongzhi.mat', dictt)