In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from datetime import datetime, timedelta
import math

# Seed NumPy's global RNG: the factorization machine draws its initial weights with
# np.random.uniform and the train/test split uses np.random.rand, so both depend on it.
np.random.seed(555)

#### **数据处理**
##### 数据集来源为FourSquare - NYC and Tokyo Check-ins
##### 选择dataset_TSMC2014_NYC.csv中打卡数最多的100个用户和500个地点作为数据集。
##### 通过UTC时间戳加上时区偏移（分钟）得到当地时间，并据此确定用户是在一周中的第几天在某地Check-in，将该信息记录在df['dayweek']中，范围为0-6（周一至周日）。
##### 之后对数据进行分组，以用户和星期几作为分组基准，在每个组中对各地点的Check-in次数进行计数，提取出隐式特征。
##### 以'uid'和'dayweek'作为基准来划分训练集和测试集，其中训练集占比为0.8，之后进行训练得到结果。

In [None]:
data_path = "../input/foursquare-nyc-and-tokyo-checkin-dataset/dataset_TSMC2014_NYC.csv"
# Kept for backward compatibility with any cell that still calls it; the load below
# no longer passes it to read_csv because `date_parser=` is deprecated in pandas 2.x
# and falls back to a slow element-by-element parse.
custom_date_parser = lambda x: datetime.strptime(x, "%a %b %d %H:%M:%S +0000 %Y")
df = pd.read_csv(data_path)
# Vectorised parse with the same layout; "+0000" is matched literally, so the result
# stays timezone-naive exactly as with the strptime-based parser.
df['utcTimestamp'] = pd.to_datetime(df['utcTimestamp'], format="%a %b %d %H:%M:%S +0000 %Y")

In [None]:
def get_ids(k_data,n):
    """Return at most ``n`` distinct values of ``k_data``, most frequent first.

    Values with equal frequency keep the order in which they first appear in
    ``k_data``. A non-positive ``n`` yields an empty list.
    """
    counts = {}
    for key in k_data:
        counts[key] = counts.get(key, 0) + 1
    # sorted() is stable, so ties stay in first-seen (dict insertion) order.
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:max(n, 0)]

In [None]:
def date_convert(date_to_convert, offset_minutes=-240):
    """Shift a UTC timestamp string by a fixed offset and reformat it.

    Args:
        date_to_convert: timestamp text laid out as ``'%a %b %d %H:%M:%S +0000 %Y'``.
        offset_minutes: offset to add, in minutes. Defaults to -240, the value
            that was previously hard-coded.

    Returns:
        The shifted time formatted as ``'%b %d %Y %H:%M:%S'``.

    Raises:
        ValueError: if ``date_to_convert`` does not match the expected layout.
    """
    # The file imports the `datetime` class and `timedelta` directly, so they are
    # used unqualified here (the old `datetime.datetime.…` spelling raised AttributeError).
    # Dividing by 60 yields a float, which also lets NumPy integer offsets through.
    pru = datetime.strptime(date_to_convert, '%a %b %d %H:%M:%S +0000 %Y') + timedelta(hours=offset_minutes / 60)
    return pru.strftime('%b %d %Y %H:%M:%S')

def min2hour(m):
    """Convert an offset given in minutes into a ``timedelta``."""
    # Keep the true division: it turns integer inputs into floats before they
    # reach timedelta.
    hours = m / 60
    return timedelta(hours=hours)

In [None]:
class DataProcess:
    """Load the check-in CSV and reduce it to a compact, zero-based indexed subset.

    The constructor reads the file, derives the local hour and weekday of every
    check-in, keeps only the most frequent users, venues and venue categories
    (filtered in that order, each on the result of the previous step), and
    replaces the raw ids by consecutive integer indices ranked by frequency.

    Args:
        data_path: path of the CSV file; it must provide the columns ``userId``,
            ``venueId``, ``venueCategoryId``, ``timezoneOffset`` and ``utcTimestamp``.
        n_user: maximum number of users to keep.
        n_venue: maximum number of venues to keep.
        n_venuecat: maximum number of venue categories to keep.
        alpha: accepted for backward compatibility; not used by the constructor.

    Attributes:
        n_user, n_venue, n_venuecat: number of users / venues / categories that
            actually remain after filtering.
        df: DataFrame with columns ``userId``, ``venueId``, ``venueCategoryId``
            (all re-indexed from 0), ``hour`` and ``dayweek``.
        v2vc: dict mapping a venue index to its venue-category index.
    """

    # Layout of the ``utcTimestamp`` column; "+0000" is matched literally so the
    # parsed values stay timezone-naive.
    _TIMESTAMP_FORMAT = "%a %b %d %H:%M:%S +0000 %Y"

    def __init__(self, data_path, n_user=100, n_venue=500, n_venuecat=40, alpha=0.8):
        print("start load data")
        df = pd.read_csv(data_path)
        # `date_parser=` is deprecated in pandas 2.x; parse the column in one
        # vectorised call instead.
        df['utcTimestamp'] = pd.to_datetime(df['utcTimestamp'], format=self._TIMESTAMP_FORMAT)
        print("start process data")
        # timezoneOffset is treated as minutes, as in min2hour().
        df["realtime"] = df['utcTimestamp'] + pd.to_timedelta(df['timezoneOffset'], unit='min')
        df["hour"] = df["realtime"].dt.hour
        df["dayweek"] = df["realtime"].dt.dayofweek

        # Cascade of filters: each ranking is computed on what survived the previous one.
        df = df[df['userId'].isin(get_ids(df['userId'].values, n_user))]
        df = df[df['venueId'].isin(get_ids(df['venueId'].values, n_venue))]
        df = df[df['venueCategoryId'].isin(get_ids(df['venueCategoryId'].values, n_venuecat))]

        # Re-rank on the final subset. Every surviving id is returned because the
        # filters above already capped the number of distinct values at n_*.
        user_ids = get_ids(df['userId'].values, n_user)
        venue_ids = get_ids(df['venueId'].values, n_venue)
        venuecat_ids = get_ids(df['venueCategoryId'].values, n_venuecat)

        self.n_user = len(user_ids)
        self.n_venue = len(venue_ids)
        self.n_venuecat = len(venuecat_ids)
        print("Number of users = {} , Number of venues = {} , Number of venuecategorys = {}".format(self.n_user,self.n_venue,self.n_venuecat))

        # Raw id -> index, index 0 being the most frequent id.
        u_id2idx = dict(zip(user_ids, range(self.n_user)))
        v_id2idx = dict(zip(venue_ids, range(self.n_venue)))
        vc_id2idx = dict(zip(venuecat_ids, range(self.n_venuecat)))

        df = df.assign(
            userId=df['userId'].map(u_id2idx),
            venueId=df['venueId'].map(v_id2idx),
            venueCategoryId=df['venueCategoryId'].map(vc_id2idx),
        )
        headers = ['userId','venueId','venueCategoryId','hour','dayweek']
        df = df[headers].reset_index(drop=True)
        self.df = df
        # If a venue appears with several categories the last row wins, as it did
        # with the former row-by-row loop.
        self.v2vc = dict(zip(df['venueId'], df['venueCategoryId']))

#### **FM**
##### 纯手写，没调keras库函数

In [None]:
class FactorizationMachine():
    """Second-order factorization machine for regression, trained by per-sample SGD.

    The prediction for one sample is

        y = w0 + x . w + 0.5 * sum_f [ (x_iter . v[:, f])**2 - (x_iter**2) . (v[:, f]**2) ]

    where ``x`` feeds the linear part and ``x_iter`` (possibly a different
    feature set) feeds the pairwise-interaction part.

    Args:
        k: number of latent factors per interaction feature.
        lr: SGD learning rate.
        iterations: number of passes over the training set.

    Attributes:
        w0: bias (``None`` until ``train`` is called).
        w: linear weights, shape ``(n_features, 1)``.
        v: latent factors, shape ``(n_interaction_features, k)``.
        losses: mean squared error of each pass of the most recent ``train`` call.
    """

    def __init__(self, k=5, lr=0.001, iterations=100):
        self.w0 = None
        self.w = None
        self.v = None
        self.k = k
        self.lr = lr
        self.iterations = iterations
        self.losses = []

    def FM_cul(self, x, x_iter):
        """Return the model output for one sample.

        Args:
            x: 1-D feature vector for the linear term.
            x_iter: 1-D feature vector for the interaction term.
        """
        inter2 = np.sum((x_iter.dot(self.v)) ** 2 - (x_iter ** 2).dot(self.v ** 2))
        y = self.w0 + x.dot(self.w) + inter2 / 2
        # x.dot(w) has shape (1,) because w is a column vector; unwrap the scalar.
        return np.ravel(y)[0]

    def train(self, X, X_iter, Y):
        """Fit the model with stochastic gradient descent on the squared error.

        Weights are re-initialised from NumPy's global RNG and ``self.losses`` is
        cleared on every call, so calling ``train`` again starts from scratch.

        Args:
            X: 2-D array, one row of linear features per sample.
            X_iter: 2-D array, one row of interaction features per sample.
            Y: 1-D array of targets.

        Raises:
            ValueError: if ``X`` contains no samples.
        """
        # get_dummies may hand over bool (or object) arrays; work on floats.
        X = np.asarray(X, dtype=float)
        X_iter = np.asarray(X_iter, dtype=float)
        m, n = X.shape
        if m == 0:
            raise ValueError("cannot train on an empty data set")
        n1 = X_iter.shape[1]
        self.w0 = 0
        self.w = np.random.uniform(size=(n, 1))
        self.v = np.random.uniform(size=(n1, self.k))
        self.losses = []

        for it in range(self.iterations):
            loss = 0
            for i in range(m):
                x = X[i]
                x_it = X_iter[i]
                y = self.FM_cul(x=x, x_iter=x_it)
                err = Y[i] - y
                loss += err ** 2
                dloss_w0 = -2 * err  # d(loss)/d(w0)
                self.w0 = self.w0 - self.lr * dloss_w0
                # Features equal to zero have a zero gradient, so only the non-zero
                # columns are visited (this includes negative values).
                for j in np.flatnonzero(x):
                    dloss_wj = dloss_w0 * x[j]  # d(loss)/d(w_j)
                    self.w[j, 0] -= self.lr * dloss_wj
                for j in np.flatnonzero(x_it):
                    xj = x_it[j]
                    for f in range(self.k):
                        # d(loss)/d(v_jf); the dot product is recomputed on purpose so
                        # that it sees the factors already updated for this sample.
                        dloss_vjf = dloss_w0 * (xj * (x_it.dot(self.v[:, f])) - self.v[j, f] * xj ** 2)
                        self.v[j, f] -= self.lr * dloss_vjf

            self.losses.append(loss/m)
            if it % 10==0:
                print("iter :{}| loss :{:.4f}".format(it, loss/m))

    def evaluate(self, X, X_iter, Y):
        """Return the list of predictions for every row of ``X`` / ``X_iter``.

        ``Y`` is accepted for signature compatibility and is not used.
        """
        X = np.asarray(X, dtype=float)
        X_iter = np.asarray(X_iter, dtype=float)
        return [self.FM_cul(x=X[i], x_iter=X_iter[i]) for i in range(X.shape[0])]
    

#### **数据导入及二次处理**
##### 这些本应放在DataProcess里面，但是由于不方便测试而且DataProcess中内容已经很多，所以我把部分数据处理放到后面

In [None]:
# Run the full load / filter / re-index pipeline on the NYC check-in file.
data_path = "../input/foursquare-nyc-and-tokyo-checkin-dataset/dataset_TSMC2014_NYC.csv"
dp = DataProcess(data_path=data_path)

In [None]:
# Check-in count matrix: row u*7+d holds (user u, weekday d), column v is venue v.
user_day_venue_mx = np.zeros((dp.n_user * 7, dp.n_venue))
row_idx = dp.df['userId'].to_numpy() * 7 + dp.df['dayweek'].to_numpy()
col_idx = dp.df['venueId'].to_numpy()
# np.add.at accumulates repeated (row, column) pairs, which a plain
# `mx[row_idx, col_idx] += 1` would count only once.
np.add.at(user_day_venue_mx, (row_idx, col_idx), 1)
# Discard (user, weekday) rows with 10 or fewer check-ins by zeroing them rather
# than deleting them: the following cell decodes the user and the weekday from the
# row number, so the row layout must stay intact. Zero counts are dropped later by
# the `times > 0` filter.
user_day_venue_mx[np.sum(user_day_venue_mx, axis=-1) <= 10] = 0

In [None]:
alpha = 0.8  # probability that a (user, weekday) row goes to the training set
train_data = []
test_data = []
n_rows, n_cols = user_day_venue_mx.shape
for i in range(n_rows):
    # Rows were filled as user*7 + weekday, so decode with divmod.
    # (The previous `i & 7` was a bitwise AND and produced values 0..7.)
    uid, day = divmod(i, 7)
    # One random draw per row: a whole (user, weekday) group lands on one side.
    target = train_data if np.random.rand() < alpha else test_data
    for j in range(n_cols):
        target.append([uid, j, dp.v2vc[j], day, user_day_venue_mx[i][j]])
train_data = np.array(train_data)
test_data = np.array(test_data)

In [None]:
# Label the split arrays and keep only the entries with at least one check-in.
df_train = pd.DataFrame(train_data, columns=['uid','vid','vcid','day','times']).loc[lambda frame: frame['times'] > 0]
df_test = pd.DataFrame(test_data, columns=['uid','vid','vcid','day','times']).loc[lambda frame: frame['times'] > 0]

In [None]:
# Encode train and test together so both share the same dummy columns, then cut
# the result back at the original train/test boundary.
n_train = len(df_train)
df = pd.concat([df_train, df_test]).reset_index(drop=True)

# Interaction features: one-hot user and venue only.
df_iter = pd.get_dummies(df[['uid','vid']], columns=['uid','vid'])
df_train_iter = df_iter.iloc[:n_train, :]
df_test_iter = df_iter.iloc[n_train:, :]

# Linear features: one-hot user, venue, venue category and weekday ('times' is kept as is).
df = pd.get_dummies(df, columns=['uid','vid','vcid','day'])
df_train = df.iloc[:n_train, :]
df_test = df.iloc[n_train:, :]

In [None]:
# Training arrays: linear features, interaction features, target counts.
Y_train = df_train['times'].to_numpy()
X_train = df_train.drop(columns='times').to_numpy()
X_train_iter = df_train_iter.to_numpy()
# Test arrays, same layout.
Y_test = df_test['times'].to_numpy()
X_test = df_test.drop(columns='times').to_numpy()
X_test_iter = df_test_iter.to_numpy()

#### **开始训练**
##### k=10, lr=0.001, iterations=100

In [None]:
# Fit the factorization machine on the training split.
md = FactorizationMachine(k=10, lr=0.001, iterations=100)
md.train(X=X_train, X_iter=X_train_iter, Y=Y_train)

#### **绘制训练过程中loss变化**

In [None]:
# Mean squared error recorded after each training pass.
ax = plt.gca()
ax.plot(md.losses)
ax.set_xlabel('Iteration')
ax.set_ylabel('loss')

#### **构建Recall、MAP、nDCG等相关函数**

In [None]:
def get_list(p,r):
    """Strip two ranked lists of ``(item, score)`` pairs down to their items.

    Only the first ``len(p)`` entries of ``r`` are read, so ``r`` must be at
    least as long as ``p``.
    """
    size = len(p)
    predicted_items = [p[i][0] for i in range(size)]
    real_items = [r[i][0] for i in range(size)]
    return predicted_items, real_items
def Recall(p,r):
    """Fraction of the items in ``p`` that also occur in ``r``.

    ``p`` must not be empty (the result is divided by ``len(p)``).
    """
    hits = sum(1 for item in p if item in r)
    return hits / len(p)

In [None]:
def AP(p,r,k):
    """Average precision at ``k`` of the ranked list ``p`` against the relevant set ``r``.

    Precision is accumulated at every rank (up to ``k``) whose item is relevant
    and the sum is divided by ``min(k, len(r))``, the largest number of hits
    achievable in the first ``k`` positions, so a perfect ranking scores 1.0.
    (The former divisor ``max(k, len(p))`` capped AP@5 at 0.5 for 10 predictions.)

    Args:
        p: predicted items, best first.
        r: relevant items.
        k: cut-off rank; lists shorter than ``k`` are scored on what they contain.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``r`` is empty or ``k <= 0``.
    """
    best_possible = min(k, len(r))
    if best_possible <= 0:
        return 0.0
    sum_ap = 0.0
    num = 0
    for i in range(min(k, len(p))):
        if p[i] in r:
            num += 1
            sum_ap += num / (i + 1)
    return sum_ap / best_possible

In [None]:
def nDCG(p,r,k):
    """Binary-relevance nDCG at ``k`` of the ranked list ``p`` against ``r``.

    Each relevant item found at 0-based rank ``i`` contributes ``1 / log2(i + 2)``.
    The ideal DCG assumes the first ``min(k, len(r))`` positions are all hits, so
    the score reaches 1.0 even when fewer than ``k`` relevant items exist.

    Args:
        p: predicted items, best first.
        r: relevant items.
        k: cut-off rank; a ``p`` shorter than ``k`` is scored on what it contains
            instead of raising IndexError.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``r`` is empty or ``k <= 0``.
    """
    sum_dcg = 0
    for i in range(min(k, len(p))):
        if p[i] in r:
            sum_dcg += 1 / math.log(i + 2, 2)
    sum_idcg = 0
    for i in range(min(k, len(r))):
        sum_idcg += 1 / math.log(i + 2, 2)
    if sum_idcg == 0:
        return 0.0
    return sum_dcg / sum_idcg

#### **开始测试**
##### 由于测试中有多组数据，每组数据都有对应计算的Recall、AP@5、AP@10、nDCG@5、nDCG@10
##### 求平均值得到MAP@5、MAP@10

In [None]:
Y_predict = md.evaluate(X_test, X_test_iter, Y_test)
n = 10  # length of the ranked lists that are compared
m = len(Y_predict)


def _group_key(row):
    """Identify the group of a test row by its first and last non-zero columns."""
    # NOTE(review): relies on the one-hot user column being the first non-zero
    # entry and the one-hot weekday column the last one, which matches the column
    # order used when X_test was built — confirm if the encoding changes.
    hot = row.nonzero()[0]
    return [hot[0], hot[-1]]


cnt = 0  # index of the first row of the group being collected
map5 = 0
map10 = 0
num = 0  # number of groups that were scored
t = _group_key(X_test[0]) if m > 0 else None
# Iterate one step past the end (key None) so that the last group is scored too;
# previously it was silently dropped.
for i in range(1, m + 1):
    t_ = _group_key(X_test[i]) if i < m else None
    if t == t_:
        continue
    group_pred = Y_predict[cnt:i]
    group_real = Y_test[cnt:i]
    size = i - cnt
    # Positions inside the group, ordered by decreasing predicted / real count
    # (stable sort, ties keep their original order).
    p = sorted(range(size), key=group_pred.__getitem__, reverse=True)
    r = sorted(range(size), key=group_real.__getitem__, reverse=True)
    cnt = i
    t = t_
    if len(p) < n:
        continue  # too few candidates to build a top-n list
    num += 1
    p = p[0:n]
    r = r[0:n]
    recall = Recall(p, r)
    ap5 = AP(p, r, 5)
    map5 += ap5
    ap10 = AP(p, r, 10)
    map10 += ap10
    ndcg5 = nDCG(p, r, 5)
    ndcg10 = nDCG(p, r, 10)
    print('Recall :{:.4f} |AP@5 :{:.4f} |AP@10 :{:.4f} |nDCG@5 :{:.4f} |nDCG@10 :{:.4f}'.format(recall,ap5,ap10,ndcg5,ndcg10))
if num > 0:
    map5 /= num
    map10 /= num
    print('MAP5 :{:.4f} |MAP10 :{:.4f}'.format(map5,map10))
else:
    # Avoid a ZeroDivisionError when no group has at least n candidates.
    print('No test group with at least {} candidates; MAP not computed'.format(n))