In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from datetime import datetime, timedelta
import math
from keras.layers import Embedding, Dense, Input, Concatenate, BatchNormalization, Layer, Dropout
from keras.models import Model
import keras.backend as K
import tensorflow as tf

np.random.seed(555)

**数据处理**

数据集来源为FourSquare - NYC and Tokyo Check-ins

选择dataset_TSMC2014_NYC.csv中打卡数最多的100个用户和500个地点作为数据集。

通过时间戳和偏移时间得到正确时间，并归类于用户在一周中第几天在某地Check-in，将时间信息记录在df[’dayweek‘]中，范围为0-6（周一-周天）。之后对数据进行分组，将用户和时间作为分组基准，在每个组中对包含地点的Check-in次数进行计数。在提取出隐式特征后，对次数进行排序并对其二值化（将序列前一半对应地点为1，后一半为0），

以‘uid’和’dayweek‘作为基准来划分训练集和测试集，其中训练集占比为0.8，之后进行训练得到结果

In [2]:
#
def get_ids(k_data,n):
    k_dict=dict()
    for k in k_data:
        if k_dict.get(k)==None:
            k_dict[k]=0
        k_dict[k]+=1
    rank=sorted(k_dict.items(),key=lambda kv:kv[1],reverse=True)
    k_ids=[]
    for i in range(min(n,len(rank))):
        k_ids.append(rank[i][0])
    return k_ids

In [3]:
def date_convert(date_to_convert):
    pru = datetime.datetime.strptime(date_to_convert, '%a %b %d %H:%M:%S +0000 %Y')+ datetime.timedelta(hours=-240//60)
    return pru.strftime('%b %d %Y %H:%M:%S')

def min2hour(m):
    return  timedelta(hours=m/60)

In [4]:
class DataProcess:
    def __init__(self, data_path, n_user=100, n_venue=500, n_venuecat=40, alpha=0.8):
        print("start load data")
        custom_date_parser = lambda x: datetime.strptime(x, "%a %b %d %H:%M:%S +0000 %Y")
        df = pd.read_csv(data_path,parse_dates=['utcTimestamp'],date_parser=custom_date_parser)
        print("start process data")
        df["realtime"] = df['utcTimestamp'] + df['timezoneOffset'].apply(min2hour)
        df["hour"] = df["realtime"].dt.hour 
        #df["work"] = df['hour'].apply(lambda x: 'Not' if (17 <= x < 24) or (0 <= x < 9)else 'Yes')
        df["dayweek"] = df["realtime"].dt.dayofweek 
        #df["workday"] = df['dayweek'].apply(lambda x: 'True' if x <= 5 else 'False')
        
        user_ids=get_ids(df['userId'].values,n_user)
        df = df[(df['userId'].isin(user_ids))]
        venue_ids=get_ids(df['venueId'].values,n_venue)
        df = df[(df['venueId'].isin(venue_ids))]
        venuecat_ids=get_ids(df['venueCategoryId'].values,n_venuecat)
        df = df[(df['venueCategoryId'].isin(venuecat_ids))]
        user_ids=get_ids(df['userId'].values,n_user)
        venue_ids=get_ids(df['venueId'].values,n_venue)
        venuecat_ids=get_ids(df['venueCategoryId'].values,n_venuecat)
        
        self.n_user=len(user_ids)
        self.n_venue=len(venue_ids)
        self.n_venuecat=len(venuecat_ids)
        print("Number of users = {} , Number of venues = {} , Number of venuecategorys = {}".format(self.n_user,self.n_venue,self.n_venuecat))
        
        u_id2idx = dict(zip(user_ids, range(self.n_user)))
        v_id2idx = dict(zip(venue_ids, range(self.n_venue)))
        vc_id2idx = dict(zip(venuecat_ids, range(self.n_venuecat)))
        
        df = df[(df['userId'].isin(user_ids))&(df['venueId'].isin(venue_ids))&(df['venueCategoryId'].isin(venuecat_ids))]
        df = df.assign(userId=[u_id2idx[uid] for uid in df.userId])
        df = df.assign(venueId=[v_id2idx[vid] for vid in df.venueId])
        df = df.assign(venueCategoryId=[vc_id2idx[vcid] for vcid in df.venueCategoryId])
        headers = ['userId','venueId','venueCategoryId','hour','dayweek']
        df = pd.DataFrame(df,columns = headers)
        df = df.reset_index(drop=True)
        self.df = df
        self.v2vc=dict()
        for i in range(len(df)):
            self.v2vc[df.loc[i,'venueId']]=df.loc[i,'venueCategoryId']

# FM model

In [5]:
class bi_interaction_pooling(Layer):
    def __init__(self):
        super(bi_interaction_pooling, self).__init__()

    def call(self, inputs):
        x = inputs
        x = 0.5 * (tf.square(tf.reduce_sum(x, axis=1)) - tf.reduce_sum(tf.square(x), axis=1))
        return x

def NFM(feature_dict,embedding_size=32,num_hidden_layers=1,if_bn=False,if_dropout=True,dropout_factor=[0.5]):
    inputs_dict = {}
    for fea in feature_dict:
        inputs_dict[fea] = Input(shape=(1,), name=fea)
    embedded = []
    for fea in inputs_dict:
        embedded.append(Embedding(feature_dict[fea], embedding_size)(inputs_dict[fea]))
    x = Concatenate(axis=1)(embedded)
    x = bi_interaction_pooling()(x)
    if if_bn:
        x = BatchNormalization()(x)
    for i in range(num_hidden_layers):
        x = Dense(32, activation='relu')(x)
        if if_dropout:
            x = Dropout(dropout_factor[i])(x)
    output = Dense(1, activation='sigmoid')(x)
    return Model(inputs_dict.values(), output)


# Train & Predict

In [6]:
data_path="../input/foursquare-nyc-and-tokyo-checkin-dataset/dataset_TSMC2014_NYC.csv"
dp = DataProcess(data_path)

start load data
start process data
Number of users = 100 , Number of venues = 392 , Number of venuecategorys = 40


In [7]:
user_day_venue_mx=np.zeros((dp.n_user*7,dp.n_venue))
for i in range(len(dp.df)):
    u=dp.df.loc[i,'userId']
    d=dp.df.loc[i,'dayweek']
    v=dp.df.loc[i,'venueId']
    user_day_venue_mx[u*7+d][v]+=1
user_day_venue_mx=user_day_venue_mx[np.sum(user_day_venue_mx,axis=-1)>10]

In [8]:
alpha=0.8
train_data=[]
test_data=[]
for i in range(user_day_venue_mx.shape[0]):
    if np.random.rand()<alpha:
        for j in range(user_day_venue_mx.shape[1]):
            train_data.append([i//7,j,dp.v2vc[j],i&7,user_day_venue_mx[i][j]])
    else:
        for j in range(user_day_venue_mx.shape[1]):
            test_data.append([i//7,j,dp.v2vc[j],i&7,user_day_venue_mx[i][j]])
train_data=np.array(train_data)
test_data=np.array(test_data)

In [9]:
df_train=pd.DataFrame(train_data,columns=['uid','vid','vcid','day','times'])
df_test=pd.DataFrame(test_data,columns=['uid','vid','vcid','day','times'])
df_train=df_train[df_train['times']>0]
df_test=df_test[df_test['times']>0]

In [10]:
df=pd.concat([df_train,df_test])
df = df.reset_index(drop=True)
feature_dict={}
feature_dict['uid']=len(set(df.uid.values))
feature_dict['vid']=len(set(df.vid.values))
feature_dict['vcid']=len(set(df.vcid.values))
feature_dict['day']=len(set(df.day.values))

In [11]:
t=df.loc[0,'uid']*7+df.loc[0,'day']
venue_dict=dict()
venue_dict[df.loc[0,'vid']]=df.loc[0,'times']
cnt=0
for i in range(1,len(df)):
    u=df.loc[i,'uid']
    d=df.loc[i,'day']
    v=df.loc[i,'vid']
    t_=u*7+d
    if t!=t_:
        rank=sorted(venue_dict.items(),key=lambda kv:kv[1],reverse=True)
        m=min((len(rank)+1)//2,10)
        for k in range(len(rank)):
            if k<m:
                venue_dict[rank[k][0]]=1
            else:
                venue_dict[rank[k][0]]=0
        for j in range(cnt,i):
            df.loc[j,'times']=venue_dict[df.loc[j,'vid']]
        t=t_
        cnt=i
        venue_dict=dict()
    venue_dict[v]=df.loc[i,'times']
rank=sorted(venue_dict.items(),key=lambda kv:kv[1],reverse=True)
m=min((len(rank)+1)//2,10)
for k in range(len(rank)):
    if k<m:
        venue_dict[rank[k][0]]=1
    else:
        venue_dict[rank[k][0]]=0
for j in range(cnt,len(df)):
    df.loc[j,'times']=venue_dict[df.loc[j,'vid']]

In [12]:
df

Unnamed: 0,uid,vid,vcid,day,times
0,0.0,3.0,9.0,0.0,1.0
1,0.0,4.0,7.0,0.0,0.0
2,0.0,7.0,20.0,0.0,0.0
3,0.0,16.0,1.0,0.0,1.0
4,0.0,18.0,11.0,0.0,1.0
...,...,...,...,...,...
3646,81.0,171.0,28.0,7.0,0.0
3647,81.0,217.0,8.0,7.0,1.0
3648,81.0,323.0,15.0,7.0,1.0
3649,81.0,209.0,5.0,0.0,0.0


In [13]:
df_train = df.iloc[:len(df_train),:]
df_test = df.iloc[len(df_train):,:]

In [14]:
X1 = df_train['uid'].values
X2 = df_train['vid'].values
X3 = df_train['vcid'].values
X4 = df_train['day'].values
X_train = [X1,X2,X3,X4]
Y_train = df_train['times'].values
X1 = df_test['uid'].values
X2 = df_test['vid'].values
X3 = df_test['vcid'].values
X4 = df_test['day'].values
X_test = [X1,X2,X3,X4]
Y_test = df_test['times'].values

In [15]:
md = NFM(feature_dict)
md.compile(optimizer='adam', loss='mse')
# 模型训练
md.fit(x=X_train, y=Y_train, epochs=10, batch_size=64)
# 模型预测

2022-07-03 14:46:58.302560: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-07-03 14:46:58.559308: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9ab7ae1050>

In [16]:
def get_list(p,r):
    p_=[]
    r_=[]
    for i in range(len(p)):
        p_.append(p[i][0])
        r_.append(r[i][0])
    return p_,r_
def Recall(p,r):
    num=0
    for i in range(len(p)):
        if p[i] in r:
            num+=1
    return num/len(p)

In [17]:
def AP(p,r,k):
    sum_ap=num=0
    n=len(p)
    for i in range(k):
        if p[i] in r:
            num+=1
            sum_ap+=num/(i+1)
    return sum_ap/(min(k,n))

In [18]:
def nDCG(p,r,k):
    sum_dcg=0
    sum_idcg=0
    for i in range(k):
        sum_idcg+=1/math.log(i+2,2)
        if p[i] in r:
            sum_dcg+=1/math.log(i+2,2)
    return sum_dcg/sum_idcg    

In [19]:
X_test

[array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,
         2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  5.,  5.,  5.,  5.,  5.,
         5.,  5.,  5.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,
         6.,  6.,  8.,  8.,  8.,  8.,  8.,  9.,  9.,  9.,  9.,  9.,  9.,
         9.,  9.,  9.,  9.,  9.,  9.,  9., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 11.,
        11., 11., 11., 11., 11., 11., 11., 12., 12., 12., 12., 12., 12.,
        12., 12., 12., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13.,
        13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 13., 14., 14.,
        14., 14., 14., 14., 14., 14., 14., 14., 15.

In [20]:
Y_predict=md.predict(x=X_test)
n=10
m=len(Y_predict)
t=X_test[0][0]*7+X_test[-1][0]
cnt=0
map5=0
map10=0
num=0
for i in range(1,m):
    t_=X_test[0][i]*7+X_test[-1][i]
    if t!=t_:
        predict = dict(zip(range(i-cnt),Y_predict[cnt:i]))
        real = dict(zip(range(i-cnt),Y_test[cnt:i]))
        p=sorted(predict.items(),key=lambda kv:kv[1],reverse=True)
        r=sorted(real.items(),key=lambda kv:kv[1],reverse=True)
        p,r=get_list(p,r)
        cnt=i
        t=t_
        if len(p)<10:
            continue
        num+=1
        p=p[0:n]
        r=r[0:n]
        recall=Recall(p,r)
        ap5=AP(p,r,5)
        map5+=ap5
        ap10=AP(p,r,10)
        map10+=ap10
        ndcg5=nDCG(p,r,5)
        ndcg10=nDCG(p,r,10)
        print('Recall :{:.4f} |AP@5 :{:.4f} |AP@10 :{:.4f} |nDCG@5 :{:.4f} |nDCG@10 :{:.4f}'.format(recall,ap5,ap10,ndcg5,ndcg10))
map5/=num
map10/=num
print('MAP5 :{:.4f} |MAP10 :{:.4f}'.format(map5,map10))

Recall :0.8000 |AP@5 :1.0000 |AP@10 :0.8000 |nDCG@5 :1.0000 |nDCG@10 :0.8701
Recall :0.6000 |AP@5 :0.8000 |AP@10 :0.5314 |nDCG@5 :0.8688 |nDCG@10 :0.7008
Recall :0.8000 |AP@5 :1.0000 |AP@10 :0.7764 |nDCG@5 :1.0000 |nDCG@10 :0.8630
Recall :1.0000 |AP@5 :1.0000 |AP@10 :1.0000 |nDCG@5 :1.0000 |nDCG@10 :1.0000
Recall :0.8000 |AP@5 :0.6433 |AP@10 :0.6485 |nDCG@5 :0.7860 |nDCG@10 :0.7917
Recall :1.0000 |AP@5 :1.0000 |AP@10 :1.0000 |nDCG@5 :1.0000 |nDCG@10 :1.0000
Recall :0.9000 |AP@5 :1.0000 |AP@10 :0.9000 |nDCG@5 :1.0000 |nDCG@10 :0.9364
Recall :0.9000 |AP@5 :1.0000 |AP@10 :0.9000 |nDCG@5 :1.0000 |nDCG@10 :0.9364
Recall :1.0000 |AP@5 :1.0000 |AP@10 :1.0000 |nDCG@5 :1.0000 |nDCG@10 :1.0000
Recall :0.9000 |AP@5 :1.0000 |AP@10 :0.8521 |nDCG@5 :1.0000 |nDCG@10 :0.9216
Recall :0.9000 |AP@5 :0.7600 |AP@10 :0.8154 |nDCG@5 :0.8539 |nDCG@10 :0.9052
Recall :0.7000 |AP@5 :0.7600 |AP@10 :0.6268 |nDCG@5 :0.8539 |nDCG@10 :0.7722
Recall :0.8000 |AP@5 :0.5200 |AP@10 :0.6309 |nDCG@5 :0.6844 |nDCG@10 :0.7952