In [2]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm


In [None]:
checkins = pd.read_csv('dataset_WWW_Checkins_anonymized.txt', sep='\t', header=None, names=['user_id', 'venue_id', 'utc', 'timestamp'])

In [12]:
checkins = checkins[['user_id', 'venue_id']]    

In [13]:
checkins

Unnamed: 0,user_id,venue_id
0,822121,4b4b87b5f964a5204a9f26e3
1,208842,4b4606f2f964a520751426e3
2,113817,4b4bade2f964a520cfa326e3
3,14732,4c143cada5eb76b0dc7dc1b7
4,1397630,4e88cf4ed22d53877981fdab
...,...,...
22809619,1568872,502bb1cee4b0190fdc74767b
22809620,689856,4d701b20b73bb1f70867b472
22809621,567861,4b51b5ecf964a5203e5327e3
22809622,44698,504a21c3e4b0568d3c5e6794


In [14]:
raw_checkins = pd.read_csv('foursquare/raw_POIs.txt', sep='\t', header=None, names=['venue_id', 'Longitude', 'Latitude', 'Category', 'Country'])

In [15]:
raw_checkins = raw_checkins[['venue_id', 'Longitude', 'Latitude']]

In [16]:
# 使用 merge 进行列合并，以 checkins 为主表，根据 venue_id 进行左连接
merged_data = checkins.merge(raw_checkins, on='venue_id', how='left')

In [17]:
merged_data

Unnamed: 0,user_id,venue_id,Longitude,Latitude
0,822121,4b4b87b5f964a5204a9f26e3,41.029717,28.974420
1,208842,4b4606f2f964a520751426e3,30.270786,-97.753153
2,113817,4b4bade2f964a520cfa326e3,40.436712,-79.990132
3,14732,4c143cada5eb76b0dc7dc1b7,31.188807,-81.376461
4,1397630,4e88cf4ed22d53877981fdab,19.399745,-99.102595
...,...,...,...,...
22809619,1568872,502bb1cee4b0190fdc74767b,13.781635,100.574026
22809620,689856,4d701b20b73bb1f70867b472,-6.459068,-37.097998
22809621,567861,4b51b5ecf964a5203e5327e3,-23.533691,-46.634529
22809622,44698,504a21c3e4b0568d3c5e6794,21.165096,-86.824743


In [19]:
# 保存合并后的数据到 CSV 文件
merged_data.to_csv('processed_foursquare.csv', index=False)

In [20]:
# 另一种方式：使用 value_counts
checkin_counts = merged_data[['user_id', 'venue_id']].value_counts().reset_index(name='checkin_count')

In [21]:
checkin_counts

Unnamed: 0,user_id,venue_id,checkin_count
0,424779,4b46d195f964a5209e2826e3,657
1,1869114,4d519fa89b27721e14b2b946,633
2,447575,4d5f52f0039188bff0d57123,581
3,1401236,4f7acb1fe4b03318f5ea2e36,521
4,1098387,4efea9f4490182a1a699288e,493
...,...,...,...
10549668,561725,4c39e8fcdfb0e21ec225b1a8,1
10549669,561725,4c091fdb6071a593ed4fdd32,1
10549670,561725,4bfae024bbb7c928b5600743,1
10549671,561725,4bf7026c5317a5939a43fd7e,1


In [22]:
# 保存结果到CSV文件
checkin_counts.to_csv('user_venue_checkin_counts.csv', index=False)

In [None]:
for l in [1, 5, 10, 15, 20]:
    # 从checkin_counts中选择l个场所ID
    rows = np.arange(l)
    selected_venue_ids = checkin_counts[['venue_id']].iloc[rows]
    
    # 从merged_data中提取这些场所ID对应的经纬度信息
    venue_info = merged_data[['Longitude', 'Latitude', 'venue_id']]
    selected_venues = venue_info[venue_info['venue_id'].isin(selected_venue_ids['venue_id'])]
    
    # 去重（因为一个场所可能被多个用户签到过）
    selected_venues = selected_venues.drop_duplicates(subset=['venue_id'])
    
    # 保存到文件
    selected_venues[['Longitude', 'Latitude', 'venue_id']].to_csv(f'../MLBSN/foursquare/foursquare_text_{l}.txt', 
                                                                  index=False, header=False, sep='\t')

In [28]:
merged_data[['Longitude', 'Latitude', 'venue_id']].iloc[[2, 5 ,3]]

Unnamed: 0,Longitude,Latitude,venue_id
2,40.436712,-79.990132,4b4bade2f964a520cfa326e3
5,52.405847,16.99497,4e298ed0c65ba11f4c67a615
3,31.188807,-81.376461,4c143cada5eb76b0dc7dc1b7
