In [1]:
import pandas as pd
import numpy as np
import sklearn
import operator
import os
import sys
import matplotlib.pyplot as plt
%matplotlib inline
from pprint import pprint
from tensorflow import logging
import tqdm

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Path of the comma-separated anchor/item file that is passed to read_data.
input_file = './data/anchor_item_info.txt'
def read_data(input_file):
    '''
    Load the anchor/item matrix from a comma-separated text file.

    The first non-blank line is a header: its first field is ignored and the
    remaining fields are the item ids. Every following non-blank line has the
    form ``anchor_id,flag1,flag2,...`` where each flag is an integer
    (1 = the anchor has already streamed that item, 0 = not yet).

        Args:
            input_file: path of the file listing the items each anchor has streamed
        Return:
            a dict: anchor_id -> [(itemid1, 1), (itemid2, 0), ...];
            an empty dict when the file does not exist
        Raises:
            ValueError: if a flag cannot be converted with int()
    '''
    # A missing file is not treated as an error: return an empty dict.
    if not os.path.exists(input_file):
        return {}
    logging.info("================Loading {0} now!============".format(input_file))
    anchor_item_info = {}
    itemids = None
    with open(input_file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line into memory.
        for line in f:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline) must not create a '' anchor.
            if not stripped:
                continue
            fields = stripped.split(',')
            if itemids is None:
                # Header line: everything after the first column is an item id.
                itemids = fields[1:]
                continue
            # Data line: anchor id followed by one flag per item.
            anchorid = fields[0]
            pairs = anchor_item_info.setdefault(anchorid, [])
            # zip stops at the shorter sequence, so a row with surplus flags
            # no longer raises IndexError.
            pairs.extend((itemid, int(flag)) for itemid, flag in zip(itemids, fields[1:]))
    logging.info("================文件加载完毕===============")
    return anchor_item_info

In [3]:
# Load the anchor/item flags from disk and print the entries of anchor '2'.
anchor_item_info = read_data(input_file)
print(anchor_item_info['2'])

[('itemid1', 1), ('itemid2', 1), ('itemid3', 0), ('itemid4', 0), ('itemid5', 1), ('itemid6', 0), ('itemid7', 1), ('itemid8', 0), ('itemid9', 0)]


In [4]:
def get_anchored_item(anchor_item_info):
    '''
    Collect, for every anchor, the ids of the items whose flag is at least 1
    (the items the anchor has already streamed), so that recommendation can
    be limited to items that have not been streamed yet.
    Args:
        anchor_item_info: dict anchor_id -> list of (itemid, flag) entries
    Return:
        a dict: anchor_id -> list of already-streamed item ids
    '''
    return {
        anchorid: [entry[0] for entry in entries if not entry[1] < 1]
        for anchorid, entries in anchor_item_info.items()
    }
# Build the per-anchor list of already-streamed item ids from the loaded data.
anchored_items = get_anchored_item(anchor_item_info)

In [5]:
def get_pos_neg_data(anchor_item_info):
    '''
    Split every anchor's (itemid, flag) entries into positive and negative samples.

        Args:
            anchor_item_info: dict anchor_id -> list of (itemid, flag) entries
        Return:
            pos_data: dict anchor_id -> [(itemid, 1), ...] for entries with flag >= 1
            neg_data: dict anchor_id -> [(itemid, 0), ...] for entries with flag < 1
            Every anchor appears as a key in both dicts, possibly with an empty list.
    '''
    pos_data = {}
    neg_data = {}
    for anchorId, entries in anchor_item_info.items():
        pos_samples = pos_data.setdefault(anchorId, [])
        neg_samples = neg_data.setdefault(anchorId, [])
        for item in entries:
            # A flag below 1 means the anchor has not streamed this item.
            if item[1] < 1:
                neg_samples.append((item[0], 0))
            else:
                pos_samples.append((item[0], 1))
    logging.info("=================文件已经处理结束================")
    # Return positives first, as documented and as callers unpack it
    # (the previous implementation returned the two dicts swapped).
    return pos_data, neg_data
# Unpack the two per-anchor sample dicts returned by get_pos_neg_data.
pos_data, neg_data = get_pos_neg_data(anchor_item_info)        



In [6]:
def get_train_data(pos_data, neg_data):
    '''
    Build a balanced training set: for each anchor the same number of
    positive and negative samples is taken (the smaller of the two counts).

        Args:
            pos_data: dict anchor_id -> list of positive (itemid, label) samples
            neg_data: dict anchor_id -> list of negative (itemid, label) samples
        Return:
            a list of (anchor_id, itemid, label) tuples: all selected positive
            samples first, followed by all selected negative samples
    '''
    pos = []
    neg = []
    logging.info("====================正在获取训练样本集合=========================")
    for anchorId, pos_samples in pos_data.items():
        # An anchor that is missing from neg_data simply has no negatives.
        neg_samples = neg_data.get(anchorId, [])
        # Keep positives and negatives balanced for this anchor.
        train_num = min(len(pos_samples), len(neg_samples))
        if train_num <= 0:
            continue
        # Positive training samples for this anchor.
        pos += [(anchorId, zuhe[0], zuhe[1]) for zuhe in pos_samples[:train_num]]
        # Negative training samples for this anchor.
        neg += [(anchorId, zuhe[0], zuhe[1]) for zuhe in neg_samples[:train_num]]
    # Concatenate once after the loop instead of rebuilding the list per anchor.
    train_data = pos + neg
    logging.info("==========================获取结束===============================")
    return train_data
# Assemble the flat list of training samples from the two sample dicts.
train_data = get_train_data(pos_data, neg_data)



In [7]:

def lfm_train(train_data, F, alpha, beta, step):
    """
    Main training loop of the LFM (latent factor model).

    Latent vectors are created with init_model the first time an anchor or
    item id is seen; each sample then nudges both vectors by the prediction
    error, with a regularization term.
    Args:
        train_data: iterable of (anchorid, itemid, label) samples
        F: dimension of the latent vectors
        alpha: regularization coefficient
        beta: initial learning rate (multiplied by 0.9 after every pass)
        step: number of passes over train_data
    Return:
        dict: key anchorid, value: np.ndarray
        dict: key itemid, value: np.ndarray
    """
    anchor_vec, item_vec = {}, {}
    learning_rate = beta
    for _ in range(step):
        for anchorid, itemid, label in train_data:
            if anchorid not in anchor_vec:
                anchor_vec[anchorid] = init_model(F)
            if itemid not in item_vec:
                item_vec[itemid] = init_model(F)
            p = anchor_vec[anchorid]
            q = item_vec[itemid]
            delta = label - model_predict(p, q)
            for k in range(F):
                # The item update deliberately sees the anchor value updated just above.
                p[k] += learning_rate * (delta * q[k] - alpha * p[k])
                q[k] += learning_rate * (delta * p[k] - alpha * q[k])
        # Decay the learning rate after each full pass.
        learning_rate = learning_rate * 0.9
    return anchor_vec, item_vec

In [8]:

def init_model(vector_len):
    """
    Create a randomly initialised latent feature vector.
    Args:
        vector_len: number of latent features
    Return:
         a ndarray: vector_len values drawn with np.random.randn
    """
    latent_vector = np.random.randn(vector_len)
    return latent_vector

In [9]:

def model_predict(anchor_vector, item_vector):
    """
    Score how well an item matches an anchor as the cosine similarity of
    their latent vectors.
    Args:
        anchor_vector: latent feature vector of the anchor
        item_vector: latent feature vector of the item
    Return:
         a num: the cosine similarity, or 0.0 when either vector has zero norm
    """
    denominator = np.linalg.norm(anchor_vector) * np.linalg.norm(item_vector)
    # A zero-norm vector has no direction; return a neutral score instead of
    # the nan (and RuntimeWarning) produced by 0/0.
    if denominator == 0:
        return 0.0
    return np.dot(anchor_vector, item_vector) / denominator

In [10]:
# Train the model to obtain the latent vectors of anchors and items.
# Positional arguments to lfm_train: F=50, alpha=0.01, beta=0.1, step=50.
anchor_vec, item_vec = lfm_train(train_data, 50, 0.01, 0.1, 50)
print("主播向量隐特征为：")
print(anchor_vec)
print("商品向量因特征为:")
print(item_vec)

主播向量隐特征为：
{'3': array([ 2.7232606 ,  3.04038177,  0.44095857,  0.66868442,  0.66659542,
       -1.02334486, -0.95594014,  2.48979875,  0.48613039, -0.38276206,
        2.52719102,  0.9440547 , -3.11629883, -3.60948575, -0.18279532,
        3.23115103,  1.88730073,  4.60181855,  2.9057764 , -0.06236504,
        3.27315284, -2.34794206, -1.86797494,  0.95847808, -0.78846583,
        0.88799517, -1.25742982, -3.82590741, -2.94425282, -2.73098472,
       -1.12812299, -0.67208461, -0.47037145, -2.58340915, -0.81105373,
       -0.34203936, -2.91632126, -0.49756034, -1.09935042,  4.94953366,
       -0.5677566 , -0.29161738, -1.49353431,  2.41389084,  3.40341194,
       -1.23844713, -0.71315746,  1.43869377, -1.5794628 , -0.32219318]), '13': array([ 0.22051613,  1.34978274, -1.02856017,  2.86938061, -0.0453452 ,
       -0.19551412, -1.44881248,  3.30706822, -1.64708349, -1.32194975,
        4.35122747,  0.49211122,  1.43654764,  0.45120232,  2.07047228,
        1.77131676,  0.13615968, -0.6926

In [13]:
def recom_result(anchored_items, anchor_vec, item_vec, anchorid, topk):
    """
    Recommend items to one anchor, ranked by the cosine similarity between
    the anchor's latent vector and each item's latent vector.
    Args:
        anchored_items: dict anchor_id -> item ids the anchor has already
            streamed; those items are excluded from the recommendation
        anchor_vec: dict anchor_id -> latent feature vector of the anchor
        item_vec: dict itemid -> latent feature vector of the item
        anchorid: id of the anchor to recommend for
        topk: maximum number of items to recommend
    Return:
        a list: [(itemid, score), (itemid1, score1), ...] sorted by descending
        score, with scores rounded to 3 decimals; an empty list when the
        anchor has no latent vector. The list is also pretty-printed.
    """
    if anchorid not in anchor_vec:
        return []

    anchor_vector = anchor_vec[anchorid]
    # The anchor norm does not change inside the loop: compute it once.
    anchor_norm = np.linalg.norm(anchor_vector)
    # Items the anchor has already streamed; an anchor with no history has none.
    # A set gives O(1) membership tests.
    streamed = set(anchored_items.get(anchorid, ()))

    logging.info("============正在对id为{0}的主播推荐商品中……=====================".format(anchorid))
    record = {}
    for itemid, item_vector in item_vec.items():
        # Skip items that were already streamed.
        if itemid in streamed:
            continue
        denominator = anchor_norm * np.linalg.norm(item_vector)
        # Guard against zero-norm vectors, which would otherwise yield nan.
        if denominator == 0:
            record[itemid] = 0.0
        else:
            record[itemid] = np.dot(anchor_vector, item_vector) / denominator

    ranked = sorted(record.items(), key=lambda pair: pair[1], reverse=True)[:topk]
    recom_list = [(itemid, round(float(score), 3)) for itemid, score in ranked]
    logging.info("============为id为{0}的主播推荐的商品为：……=====================".format(anchorid))
    pprint(recom_list)
    # Return the list as documented (previously the function returned None here).
    return recom_list

In [14]:
# Print the top-2 recommendations for anchor '2'.
recom_result(anchored_items, anchor_vec, item_vec, '2', 2)

[('itemid8', 0.218), ('itemid3', 0.171)]
