In [1]:
import pickle
import pandas as pd 
import gc
import os
import pandas as pd
import numpy as np
np.random.seed(42)
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine



# listing embedding

In [3]:
# Load the pickled event sequences and split them 90% / 10% into train / test,
# keeping the original order (the last 10% becomes the test split).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('./datasets/flat_events.txt', 'rb') as events_file:
    events_str = pickle.load(events_file)
split_at = int(len(events_str) * 0.9)
train = events_str[:split_at]
test = events_str[split_at:]

In [4]:
# Word2Vec hyper-parameters, collected in one place.
# Per the gensim API: sg=1 selects skip-gram, and hs=1 with negative=0 trains with
# hierarchical softmax and no negative sampling. workers=1 and a fixed seed are
# presumably chosen for reproducibility -- TODO confirm.
word2vec_params = dict(
    vector_size=50,
    window=8,
    min_count=1,
    sg=1,
    hs=1,
    negative=0,
    epochs=12,
    workers=1,
    seed=0,
)
train2vec = Word2Vec(train, **word2vec_params)

In [5]:
# Number of distinct tokens in the trained vocabulary.
# Recorded sizes (presumably keyed by train split ratio): 0.75 -> 251, 0.9 -> 263, 1 -> 293
vocab_size = len(train2vec.wv.key_to_index)
print('size of char dictionary:', vocab_size)

size of char dictionary: 263


In [6]:
# Persist the trained vectors twice: once via KeyedVectors.save and once in the
# word2vec text format that is parsed back line by line further down.
keyed_vectors = train2vec.wv
keyed_vectors.save("./word_embed.bin")
keyed_vectors.save_word2vec_format("./word_embed.own")

In [7]:
# Dense lookup table: row `word` holds the 50-d vector of token id `word`.
# Rows for ids that never appear in the embedding file stay all-zero.
# (382 rows is presumably max item id + 1 -- TODO confirm.)
word_embeddings_index_train = np.zeros((382, 50))
with open('./word_embed.own', 'r') as f:
    for line in f:
        values = line.strip('\n').split(' ')
        # Skip the two-field header line and any blank line.
        if len(values) == 2 or values[0] == '':
            continue
        word = int(values[0])
        embedding = np.asarray(values[1:], dtype='float32')
        word_embeddings_index_train[word] = embedding
# The original called `.astype(np.float32)` here on every iteration and discarded
# the result, which copied the whole table for nothing. The table stays float64,
# exactly as before.

# item feature embedding

In [8]:
# `sep` must be passed by keyword: pandas 2.0 made every read_csv argument after
# the path keyword-only, so the positional form raises TypeError there.
item_info = pd.read_csv('./datasets/item_info.csv', sep=' ')
# Map item_id -> raw comma-separated item_vec string.
id_dic = item_info.set_index('item_id')['item_vec'].to_dict()

In [15]:
# One row per item id. The first three fields of each item_vec string are parsed
# as int64 into the three single-column "class" arrays; the remaining fields are
# parsed as float32 into the two-column "reg" array.
item_feature_class1 = np.zeros((382, 1))
item_feature_class2 = np.zeros((382, 1))
item_feature_class3 = np.zeros((382, 1))
item_feature_reg = np.zeros((382, 2))
class_arrays = (item_feature_class1, item_feature_class2, item_feature_class3)
for key, raw_vec in id_dic.items():
    fields = raw_vec.split(',')
    for position, target in enumerate(class_arrays):
        target[key] = np.asarray(fields[position], dtype='int64')
    item_feature_reg[key] = np.asarray(fields[3:], dtype='float32')
# Column order: class1, class2, class3, then the two "reg" columns.
item_feature = np.concatenate(
    [item_feature_class1, item_feature_class2, item_feature_class3, item_feature_reg],
    axis=1,
)

# item embedding

In [16]:
# Open question: how to properly scale item_vec to the listing-embedding space.
# Append the item-feature columns to the right of the word-embedding columns.
item_embedding = np.concatenate([word_embeddings_index_train, item_feature], axis=1)
item_embedding.shape

(382, 55)

In [17]:
# Write the combined table with pickle protocol 0, as before. The context manager
# guarantees the file is flushed and closed before it is read back.
with open('item_embedding.txt', 'wb') as embedding_file:
    pickle.dump(item_embedding, embedding_file, 0)

## Feature1: User-item Scores

In [18]:
def user_item_score(n_user,exposed_items_v,item_embedding,events_v):
    """Score every exposed item against a per-user embedding.

    Args:
        n_user: number of rows (users) to produce.
        exposed_items_v: sequence of comma-separated item-id strings, one per
            user, holding at most 9 ids each.
        item_embedding: indexable by item id; only the first 50 components of
            each row are used.
        events_v: DataFrame with ``item_event`` and ``time_weights`` columns,
            each a bracketed comma-separated string (first and last character
            are stripped before splitting).

    Returns:
        DataFrame with 9 columns named ``'scores'`` (dot product of the user
        embedding with each exposed item embedding) followed by 2 columns named
        ``'group_scores'`` (means of scores 0-2 and of scores 3-5).
    """
    emb_dim = 50
    exposed_vecs = np.zeros((n_user, 9, emb_dim))
    user_vecs = np.zeros((n_user, emb_dim))
    scores = np.zeros((n_user, exposed_vecs.shape[1] + 2))  # +2 group-score columns

    # Embedding of every exposed item, slot by slot.
    for user_idx, exposed in enumerate(exposed_items_v):
        for slot, item_id in enumerate(exposed[:].split(',')):
            exposed_vecs[user_idx, slot, :] = item_embedding[int(item_id)][:emb_dim]

    histories = events_v['item_event'].tolist()
    weight_strings = events_v['time_weights'].tolist()

    # User embedding: time-weighted sum of the history items' embeddings,
    # divided by the number of history entries.
    for user_idx, (history, weight_string) in enumerate(zip(histories, weight_strings)):
        item_ids = history[1:-1].split(',')
        time_list = weight_string[1:-1].split(',')
        if item_ids[0] != '0':
            for pos, item_id in enumerate(item_ids):
                user_vecs[user_idx] += item_embedding[int(item_id)][:emb_dim] * float(time_list[pos])
        else:
            # A history starting with '0' uses row 0 of the table with a fixed 0.6 weight.
            user_vecs[user_idx] += item_embedding[0][:emb_dim] * 0.6
        user_vecs[user_idx] /= len(item_ids)

    for user_idx in range(n_user):
        scores[user_idx, :9] = user_vecs[user_idx, :].dot(exposed_vecs[user_idx, :].transpose())
    scores[:, 9] = (scores[:, 0] + scores[:, 1] + scores[:, 2]) / 3
    scores[:, 10] = (scores[:, 3] + scores[:, 4] + scores[:, 5]) / 3
    return pd.DataFrame(scores, columns=['scores'] * 9 + ['group_scores'] * 2)

## Feature2: Item Portrait

In [34]:
def item_protrait(n_user,events_v,exposed_items_v,item_vec_dic):
    """Collect four item-vector components for each exposed item.

    Args:
        n_user: number of rows (users) to produce.
        events_v: unused by this function.
        exposed_items_v: sequence of comma-separated item-id strings, one per
            user, holding at most 9 ids each.
        item_vec_dic: indexable by ``item_id - 1``; each entry must support
            list indexing with ``[0, 1, 3, 4]`` (e.g. a 2-D numpy array).

    Returns:
        DataFrame of shape ``(n_user, 36)`` whose columns repeat
        ``['item_class', 'item_class', 'rare', 'probability']`` nine times.
    """
    picked_components = [0, 1, 3, 4]
    portrait = np.zeros((n_user, 9, 4))
    for user_idx, exposed in enumerate(exposed_items_v):
        for slot, item_id in enumerate(exposed[:].split(',')):
            # Item ids are 1-based; the lookup table is 0-based.
            portrait[user_idx, slot, :] = item_vec_dic[int(item_id) - 1][picked_components]
    flattened = portrait.reshape([n_user, -1])
    column_names = ['item_class', 'item_class', 'rare', 'probability'] * 9
    return pd.DataFrame(flattened, columns=column_names)

## Feature3: User Portrait
user_protrait的各列类别数：[**3**, 1363, **20**, **10**, 195, 49,**3**, **11**, **2**, 2164]（除了1、9都作为类别处理试试）

直接相加类别和为3820，实际总类别为3570，1、4、5、6、9中有重复元素

检查测试集是否与训练集类别一致就可以决定是否能当类别：[3*,1319,19,10,191,47,3*,13,2,2054]

In [20]:
def user_protrait(events_v):    
    """Build the user-portrait feature frame from the ``user_protrait`` column.

    Args:
        events_v: DataFrame with a ``user_protrait`` column of comma-separated
            integer strings and an ``interval`` column.

    Returns:
        DataFrame with ten columns that all carry the label ``'user'``,
        followed by an ``'interval'`` column copied from ``events_v``.
    """
    user_protrait_v = events_v['user_protrait'].tolist()
    # Split every comma-separated string and parse the fields as int64.
    user_protrait_v =np.array([i[:].split(',')for i in user_protrait_v] ).astype('int64')
    b_v = pd.DataFrame(user_protrait_v,columns = ['user']*10)
    # All ten columns share the label 'user', so this selects and reassigns them together.
    # NOTE(review): whether the category dtype survives this duplicate-label
    # assignment may depend on the pandas version -- confirm before relying on it.
    b_v['user']=b_v['user'].astype('category')
    # Assignment from a Series aligns on the index of events_v.
    b_v.loc[:,'interval'] = events_v['interval']
    return b_v

## Feature4: Price

In [21]:
def price(n_user,item_info,exposed_items_v):
    """Look up the price of every exposed item.

    Args:
        n_user: number of rows (users) to produce.
        item_info: DataFrame with a ``price`` column; position ``item_id - 1``
            holds the price of ``item_id``.
        exposed_items_v: sequence of comma-separated item-id strings, one per
            user, holding at most 9 ids each.

    Returns:
        DataFrame of shape ``(n_user, 9)`` with every column named ``'price'``;
        slots without an exposed item stay 0.
    """
    price_by_position = item_info['price'].tolist()
    price_table = np.zeros((n_user, 9))
    for user_idx, exposed in enumerate(exposed_items_v):
        for slot, item_id in enumerate(exposed[:].split(',')):
            # Item ids are 1-based; the price list is 0-based.
            price_table[user_idx, slot] = price_by_position[int(item_id) - 1]
    return pd.DataFrame(price_table, columns=['price'] * 9)

## Feature5: User Class Distance

In [31]:
def user_class_distance(n_user,item_vec_dic,exposed_items_v,events_v):
    """Reciprocal cosine distance between a user's class profile and each exposed item.

    The user profile is the sum of the first two item-vector components over
    the user's history, divided by the history length. Each exposed item is
    represented by the same two components.

    Args:
        n_user: number of rows (users) to produce.
        item_vec_dic: indexable by ``item_id - 1``; the first two components of
            each entry are used.
        exposed_items_v: sequence of comma-separated item-id strings, one per
            user, holding at most 9 ids each.
        events_v: DataFrame with an ``item_event`` column of bracketed
            comma-separated item-id strings (first and last character are
            stripped before splitting).

    Returns:
        DataFrame of shape ``(n_user, 9)`` with every column named
        ``'distance'``, holding ``1 / cosine_distance``. Users whose profile is
        all-zero (e.g. history starting with ``'0'``) get 1.0 in every slot.
        A cosine distance of exactly 0 yields ``inf``.
    """
    item_event_v = events_v['item_event'].tolist()

    # Per-user class profile (time weighting is intentionally not applied).
    user_item_class = np.zeros((n_user, 2))
    for n, history in enumerate(item_event_v):
        item_list = history[1:-1].split(',')
        if item_list[0] != '0':
            for j in item_list:
                user_item_class[n] += item_vec_dic[int(j) - 1][:2]
        user_item_class[n] /= len(item_list)

    # Class vector of every exposed item, slot by slot.
    exposed_item_class = np.zeros((n_user, 9, 2))
    for n, exposed in enumerate(exposed_items_v):
        for m, j in enumerate(exposed.split(',')):
            exposed_item_class[n, m, :] = item_vec_dic[int(j) - 1][:2]

    # Default distance 1.0 (reciprocal 1.0) for users without a usable profile.
    cla = np.ones((n_user, exposed_item_class.shape[1]))
    for n, user_vec in enumerate(user_item_class):
        if not user_vec.any():
            continue
        for k, item_vec in enumerate(exposed_item_class[n]):
            # Compare the full 2-component vectors. The cosine distance between
            # two scalars is degenerate, and recent SciPy versions reject
            # inputs that are not 1-D.
            cla[n, k] = cosine(user_vec, item_vec)
    C_v = pd.DataFrame(1 / cla, columns=['distance'] * 9)
    return C_v

## Concatenate Features

In [37]:
def mydataset(event_name,data_name):
    """Build the full feature table for one event file and write it to CSV.

    Reads ``./datasets/<event_name>.csv``, ``./datasets/item_info.csv`` and the
    pickled ``item_embedding.txt``, computes the five feature groups, and
    concatenates them column-wise in the order: user-item scores, user class
    distance, item portrait, user portrait, price. The result is written to
    ``./datasets/<data_name>.csv`` without the index.

    Args:
        event_name: base name (no extension) of the events CSV to read.
        data_name: base name (no extension) of the feature CSV to write.
    """
    events_v = pd.read_csv('./datasets/'+event_name+'.csv')
    # `sep` must be a keyword argument; pandas 2.0 rejects the positional form.
    item_info = pd.read_csv('./datasets/item_info.csv', sep=' ')
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open('item_embedding.txt', 'rb') as embedding_file:
        item_embedding = pickle.load(embedding_file)
    exposed_items_v = events_v['exposed_items'].tolist()
    item_vec = item_info['item_vec'].tolist()
    # Parse each comma-separated item_vec string into a float row.
    item_vec_dic = np.array([vec.split(',') for vec in item_vec]).astype('float64')
    n_user = len(exposed_items_v)
    c_v = user_item_score(n_user,exposed_items_v,item_embedding,events_v)
    C_v = user_class_distance(n_user,item_vec_dic,exposed_items_v,events_v)
    a_v = item_protrait(n_user,events_v,exposed_items_v,item_vec_dic)
    b_v = user_protrait(events_v)
    p_v = price(n_user,item_info,exposed_items_v)
    data_i_v = pd.concat([c_v,C_v,a_v,b_v,p_v],axis = 1)
    data_i_v.to_csv('./datasets/'+data_name+'.csv',index=False)

In [38]:
# Build the feature CSV for each (events file, output file) pair, in this order.
for event_name, data_name in (('event', 'data'), ('event_v', 'data_v')):
    mydataset(event_name, data_name)

