In [7]:
import pandas as pd
import numpy as np
import random

from keras.layers import Input, Embedding, Dot, Reshape, Dense
from keras.models import Model
# Seed Python's built-in RNG. Only the `random` module is seeded here; NumPy's
# generator (used later through np.random.shuffle) is not seeded in this cell.
random.seed(100)

# Load the dataset; the CSV is decoded as GBK.
user_keywords = pd.read_csv('user_keywords.csv',encoding='gbk')


In [9]:
def date_process(user_item):
    """Tokenise each user's keywords and assign integer ids to users and keywords.

    Parameters
    ----------
    user_item : pandas.DataFrame
        Must contain a ``user_id`` column and a ``keywords`` column whose
        values are ``"|"``-separated strings.

    Returns
    -------
    user_item : pandas.DataFrame
        A copy of the input in which ``keywords`` holds lists of strings and a
        new ``keyword_index`` column holds the matching lists of keyword ids.
    user_index : dict
        Maps each ``user_id`` value to its row label in ``user_item``.
    item_index : dict
        Maps each keyword to its integer id. Ids follow the order produced by
        ``value_counts`` (most frequent keyword gets id 0).
    """
    # Work on a copy: the caller's frame keeps its raw "a|b|c" strings, so the
    # function can be called again on the same input without failing on .split.
    user_item = user_item.copy()
    user_item["keywords"] = user_item["keywords"].apply(lambda x: x.split("|"))

    # Flatten every user's keyword list into one long list for counting.
    keyword_list = [kw for kws in user_item["keywords"] for kw in kws]

    # value_counts() orders keywords by descending frequency; a keyword's
    # position in that ordering is its id.
    ranked_keywords = pd.Series(keyword_list, dtype=object).value_counts().index
    item_index = {kw: idx for idx, kw in enumerate(ranked_keywords)}

    # Translate keywords to ids with a plain dict lookup instead of a pandas
    # label-based selection per row (which was the slow step).
    user_item["keyword_index"] = user_item["keywords"].apply(
        lambda kws: [item_index[kw] for kw in kws])

    # Invert {row label: user_id} into {user_id: row label}.
    user_index = {v: k for k, v in user_item["user_id"].to_dict().items()}
    return user_item, user_index, item_index

# Rebind user_keywords to the processed frame returned by date_process and keep
# the user_id -> index and keyword -> index lookup tables it returns.
user_keywords, user_index, keyword_index = date_process(user_keywords)


In [22]:
keyword_index

{'2018': 14,
 '人工分类': 0,
 '内容推荐': 7,
 '冷启动': 3,
 '分类': 5,
 '声音': 10,
 '文本分类': 6,
 '新年愿望': 19,
 '新闻推荐': 1,
 '机器学习': 11,
 '梦想': 9,
 '父母': 20,
 '睡眠': 18,
 '精神衰弱': 21,
 '网络': 16,
 '聚类': 2,
 '肩头': 13,
 '自然语言处理': 4,
 '蔬菜块': 12,
 '资讯推荐': 8,
 '辞旧迎新': 15,
 '青春叛逆期': 22,
 '饺子': 17}

In [11]:
def create_pairs(user_keywords, user_index):
    """Generate the list of (user index, keyword index) pairs.

    Parameters
    ----------
    user_keywords : pandas.DataFrame
        Must contain a ``user_id`` column and a ``keyword_index`` column whose
        values are iterables of keyword ids.
    user_index : dict
        Maps each ``user_id`` to its integer index.

    Returns
    -------
    list of tuple
        One ``(user_index[user_id], keyword_id)`` tuple per keyword of every
        row, in row order.

    Raises
    ------
    KeyError
        If a ``user_id`` is missing from ``user_index``.
    """
    # Iterate the two columns directly rather than calling DataFrame.apply with
    # a function that appends to an outer list: apply is not meant for side
    # effects and building a Series per row is much slower.
    return [
        (user_index[user_id], keyword_id)
        for user_id, keyword_ids in zip(user_keywords["user_id"],
                                        user_keywords["keyword_index"])
        for keyword_id in keyword_ids
    ]

# Positive training examples: every (user index, keyword index) combination
# present in user_keywords.
pairs = create_pairs(user_keywords, user_index)

In [14]:
user_index

{12: 6, 113: 0, 117: 4, 119: 5, 122: 7, 123: 2, 143: 1, 234: 3}

In [15]:
def build_embedding_model(embedding_size = 50, classification = False):
    """Build and compile a two-tower embedding model with the Keras functional API.

    A user id and a keyword id are each embedded, the two embeddings are
    combined with a normalised dot product, and a single sigmoid unit turns
    that score into the model output.

    Parameters
    ----------
    embedding_size : int
        Width of both embedding tables.
    classification : bool
        Accepted for interface compatibility; the body does not read it, so the
        sigmoid head and binary cross-entropy loss are always used.

    Returns
    -------
    The compiled Keras ``Model`` taking inputs named ``'user'`` and ``'keyword'``.

    Notes
    -----
    The vocabulary sizes come from the module-level ``user_index`` and
    ``keyword_index`` mappings.
    """
    n_users = len(user_index)
    n_keywords = len(keyword_index)

    # One integer id per sample on each side.
    user = Input(shape = [1], name = 'user')
    keyword = Input(shape = [1], name = 'keyword')

    # Look up the embeddings; each is expected to be (None, 1, embedding_size).
    user_vec = Embedding(input_dim = n_users,
                         output_dim = embedding_size,
                         name = 'user_embedding')(user)
    keyword_vec = Embedding(input_dim = n_keywords,
                            output_dim = embedding_size,
                            name = 'keyword_embedding')(keyword)

    # Normalised dot product over the embedding axis, then flatten the
    # remaining singleton axes down to one number per sample.
    similarity = Dot(axes = 2, normalize = True,
                     name = 'dot_product')([user_vec, keyword_vec])
    similarity = Reshape(target_shape = [1])(similarity)

    # Sigmoid head for the clicked / not-clicked target.
    probability = Dense(1, activation = 'sigmoid')(similarity)

    model = Model(inputs = [user, keyword], outputs = probability)
    model.compile(loss = 'binary_crossentropy',
                  optimizer = 'Adam',
                  metrics = ['accuracy'])
    return model

# Instantiate the model with 20-dimensional embeddings. The `classification`
# argument is accepted by build_embedding_model, but its body never reads it.
model = build_embedding_model(embedding_size = 20, classification = False)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [16]:
model

<keras.engine.training.Model at 0x7fac977c15c0>

In [17]:
def generate_batch(pairs, n_positive = 50, negative_ratio = 1.0,
                   n_users = None, n_keywords = None):
    """Endlessly yield training batches of positive and sampled negative pairs.

    Parameters
    ----------
    pairs : list of tuple
        Observed ``(user index, keyword index)`` pairs (the positives).
    n_positive : int
        Number of positives drawn (without replacement) for each batch.
    negative_ratio : float
        Negatives per positive; the batch holds
        ``int(n_positive * (1 + negative_ratio))`` rows.
    n_users, n_keywords : int, optional
        Exclusive upper bounds for the random user / keyword indices used for
        negatives. When omitted they default to the sizes of the module-level
        ``user_index`` and ``keyword_index`` mappings.

    Yields
    ------
    tuple
        ``({'user': user_ids, 'keyword': keyword_ids}, labels)`` where each
        value is a 1-D float array; labels are 1 for positives, 0 for negatives.

    Raises
    ------
    ValueError
        On the first ``next()`` if negatives are requested but every possible
        (user, keyword) combination is already a positive, or (from
        ``random.sample``) if ``n_positive`` exceeds ``len(pairs)``.
    """
    if n_users is None:
        n_users = len(user_index)
    if n_keywords is None:
        n_keywords = len(keyword_index)

    # np.zeros needs an integer shape; the default negative_ratio is a float.
    batch_size = int(n_positive * (1 + negative_ratio))

    # Set membership keeps the "is this a positive?" check O(1) per draw.
    positive_set = set(pairs)

    if batch_size > n_positive:
        # Rejection sampling below would spin forever if nothing can be rejected.
        positives_in_range = sum(
            1 for user_id, keyword_id in positive_set
            if 0 <= user_id < n_users and 0 <= keyword_id < n_keywords)
        if positives_in_range >= n_users * n_keywords:
            raise ValueError(
                "cannot sample negatives: every (user, keyword) pair is positive")

    # Continue to yield samples
    while True:
        # Fresh buffer per batch so arrays already handed to the consumer are
        # never overwritten by the next iteration.
        batch = np.zeros((batch_size, 3))

        # Randomly choose positive examples
        for idx, (user_id, keyword_id) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (user_id, keyword_id, 1)
        idx = n_positive

        # Add negative examples until reach batch size
        while idx < batch_size:
            random_user = random.randrange(n_users)
            random_keyword = random.randrange(n_keywords)

            # Keep the draw only if it is not an observed positive
            if (random_user, random_keyword) not in positive_set:
                batch[idx, :] = (random_user, random_keyword, 0)
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {'user': batch[:, 0], 'keyword': batch[:, 1]}, batch[:, 2]
        
        
# Draw every positive pair in each batch, matched by an equal number of negatives.
n_positive = len(pairs)
gen = generate_batch(pairs, n_positive, negative_ratio = 1)
# Train: n_positive equals len(pairs), so steps_per_epoch is 1 (one batch per epoch).
# NOTE(review): fit_generator is deprecated in later Keras releases in favour of
# fit — confirm against the installed version before upgrading.
h = model.fit_generator(gen, epochs = 100, steps_per_epoch = len(pairs) // n_positive)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 7

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [19]:
# Take the first weight array of the trained user embedding layer.
user_layer = model.get_layer('user_embedding')
user_weights = user_layer.get_weights()[0]


# Same for the keyword embedding layer.
keyword_layer = model.get_layer('keyword_embedding')
keyword_weights = keyword_layer.get_weights()[0]

from sklearn.decomposition import PCA
import seaborn as sns

# PCA visualisation
def pca_show():
    """Plot the user embeddings reduced to two principal components.

    Reads the module-level ``user_weights`` array, projects it with a
    2-component PCA, and draws the two components as a seaborn joint plot.
    Returns nothing.
    """
    projected = PCA(n_components=2).fit_transform(user_weights)
    first_component, second_component = projected[:, 0], projected[:, 1]
    sns.jointplot(x=first_component, y=second_component)
pca_show()


# Calculate cosine similarity between row 0 of user_weights and every row.
# NOTE(review): row 0 is user_id 113 in the user_index shown earlier, whereas the
# write-up below discusses user 119 (row 5) — confirm which query row is intended.
from sklearn.metrics.pairwise import cosine_similarity
cos = cosine_similarity(user_weights[0:1], user_weights)
# Indices of the four highest similarities, most similar first.
recommendations = cos[0].argsort()[-4:][::-1]

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


上面我们假设了每篇doc的keywords就是user对应的keywords，因此，我们可以直接通过计算weights 的cosine相似度进行推荐。

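结果为：[5, 0, 4, 6]（均为用户下标）。排在第一位的下标 5 是用户 119 自身，去掉之后可以得到推荐下标为 0、4、6 的三位用户（对应 user_id 113、117、12）。然后我们可以从这几位用户的 user behavior 里面，筛选出最近点击的或者最喜欢的 doc 推荐给用户 119。（注意：相似度最高的总是查询用户自身，因此要得到上述结果，查询向量应取 user_weights[5:6]，即用户 119 所在的行；上面代码中的 user_weights[0:1] 查询的是下标 0 的用户 113。）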

In [30]:
user_keywords

Unnamed: 0,user_id,keywords,keyword_index
0,113,"[新闻推荐, 资讯推荐, 内容推荐, 文本分类, 人工分类, 自然语言处理, 聚类, 分类,...","[1, 8, 7, 6, 0, 4, 2, 5, 3]"
1,143,"[网络, 睡眠, 精神衰弱, 声音, 人工分类]","[16, 18, 21, 10, 0]"
2,123,"[新年愿望, 梦想, 2018, 辞旧迎新]","[19, 9, 14, 15]"
3,234,"[父母, 肩头, 饺子, 蔬菜块, 青春叛逆期, 声音]","[20, 13, 17, 12, 22, 10]"
4,117,"[新闻推荐, 内容推荐, 文本分类, 人工分类, 自然语言处理, 聚类, 分类, 冷启动]","[1, 7, 6, 0, 4, 2, 5, 3]"
5,119,"[新闻推荐, 资讯推荐, 人工分类, 自然语言处理, 聚类, 分类, 冷启动]","[1, 8, 0, 4, 2, 5, 3]"
6,12,"[新闻推荐, 资讯推荐, 内容推荐, 文本分类, 聚类, 分类, 冷启动]","[1, 8, 7, 6, 2, 5, 3]"
7,122,"[机器学习, 新闻推荐, 梦想, 人工分类, 自然语言处理]","[11, 1, 9, 0, 4]"
