In [1]:
# -*- coding: utf-8 -*-

"""
note something here

"""

__author__ = 'Wang Chen'
__time__ = '2019/7/15'

import copy
from queue import PriorityQueue

import pandas as pd
import numpy as np

from Item import Item
from Event import Event
from Hypercube import Hypercube
from static import *

# demo for one day

class Popcache:
    """
    dim: 维度
    item_list: 视频列表，主键为视频的id
    hypercube_array: 立方空间的dim维数组
    max_day: 最大天数
    cur_day: 当天天数
    all_UITs: np.adarray 从pandas转化来的原始值
    event_lastday_list: 前一天的事件缓存列表
    event_count: 事件计数
    cache_size: 缓存列表大小
    cache_list: 缓存列表，存储视频id
    cache_set:

    if_disp: 是否展示
    """
    def __init__(self, dim, max_item_id, max_day, cur_day, all_UITs, cache_size, if_disp):
        """
        dim: 维度
        max_item_id: 最大视频id
        max_day: 最大天数
        cur_day: 当天天数
        all_UITs: np.adarray 从pandas转化来的原始值
        event_lastday_list: 前一天的事件缓存列表
        cache_size: 缓存列表大小

        if_disp: 是否展示
        """
        self.dim = dim
        self.item_list = [Item(item_id) for item_id in range(max_item_id)]
        self.hypercube_array = self.init_hypercube_array(dim)
        self.max_day = max_day
        self.cur_day = cur_day
        self.all_UITs = all_UITs
        self.curday_values = None
        self.event_lastday_list = []
        self.event_count = -1
        self.cache_size = cache_size
        self.cache_list = PriorityQueue()
        self.cache_set = set()

        self.if_disp = if_disp

    def init_hypercube_array(self, dim):
        """
        构建hypercube空间
        :param dim:
        :return:
        """
        hypercube_array = [Hypercube() for _ in range(2)]
        for _ in range(dim - 1):
            hypercube_array = [copy.deepcopy(hypercube_array) for _ in range(2)]
        return hypercube_array

    def enumerate_hypercubes(self):
        hypercubes_list = self.hypercube_array
        for depth in range(self.dim-1):
            tmp_list = []
            for element in hypercubes_list:
                tmp_list.extend(element)
            hypercubes_list = tmp_list

        if self.if_disp:
            print("enumerate {} depth, total length {}".format(dim, hypercubes_list.__len__()))

        return hypercubes_list

    def get_one_day_values(self):
        """
        获取一天的ndarray
        :return:
        """
        filter_UITs = self.all_UITs[self.all_UITs[:, 3] == self.cur_day]

        if self.if_disp:
            print("cur day {}".format(self.cur_day), end=" , ")
            print("total events {}".format(filter_UITs.shape[0]))
        return filter_UITs

    def extract_feature(self, item_id):
        """
        通过item_id获取视频的特征
        :param item_id:
        :return:
        """
        if self.cur_day > self.max_day:
            raise Exception("days exceed")

        cur_item = self.item_list[item_id]
        if np.sum(cur_item.history) == 0:
            return np.zeros((self.dim,), dtype=np.int32)

        feature = [0] * self.dim
        for pos, feature_day in enumerate(feature_day_list):
            if self.cur_day >= feature_day:
                check = np.sum(cur_item.history[self.cur_day - feature_day:self.cur_day])
            else:
                check = np.sum(cur_item.history[0:self.cur_day])
            if check > 0:
                feature[pos] = 1

        if self.if_disp:
            print("get the feature: ", np.array(feature, dtype=np.int32))

        return np.array(feature, dtype=np.int32)

    def update_feature(self):
        """
        根据ndarray更新当天视频的特征值
        :return:
        """
        count = 0
        for value_line in self.curday_values:
            item_id = value_line[2]
            self.item_list[item_id].history[self.cur_day] += 1
            count += 1

        item_count = 0
        for item in self.item_list:
            if item.history[self.cur_day]:
                item.popularity = item.history[self.cur_day]
                item_count += 1

        if self.if_disp:
            print("total deal with {} events, update {} items".format(count, item_count))

        return item_count

    def update_hypercube_estimate_value(self):
        """
        根据event_list列表更新hypercube他们的MN值
        :return:
        """

        total_popularity = 0

        if self.if_disp:
            print("update hypercube M N")

        for event in self.event_lastday_list:
            video_id = event.item
            event_feature = self.extract_feature(video_id)
            cube = self.select_hypercube(event_feature)
            popularity_plus = cube.update_popularity(self.item_list[video_id].popularity)
            total_popularity += popularity_plus

            if self.if_disp:
                print("current hypercube ", event_feature, end=" , ")
                print("popularity plus {}".format(popularity_plus))

        return total_popularity

    def update_cache_set(self):
        tmp_list = PriorityQueue()
        while not self.cache_list.empty():
            (old_priority, old_item) = self.cache_list.get()
            event_feature = self.extract_feature(old_item)
            cube = self.select_hypercube(event_feature)
            update_estimate_popularity = cube.get_popularity()
            tmp_list.put((update_estimate_popularity, old_item))

        if tmp_list.qsize() != self.cache_set.__len__():
            raise Exception("update heap error")

        self.cache_list = tmp_list

        for event in self.event_lastday_list:
            if event.item in self.cache_set:
                continue

            if len(self.cache_set) < self.cache_size:
                self.cache_set.add(event.item)
                self.cache_list.put((event.esti_popularity, event.item))
            else:
                (top_priority, top_item) = self.cache_list.queue[0]

                if event.esti_popularity >= top_priority:  # 替换条件
                    (replace_popularity, replace_item) = self.cache_list.get()
                    self.cache_set.remove(replace_item)
                    self.cache_list.put((event.esti_popularity, event.item))
                    self.cache_set.add(event.item)
                    # print("{} out and {} in".format(replace_popularity, event.esti_popularity))
        return True

    def update_one_day(self):
        """
        天数加1
        :return:
        """
        self.cur_day += 1
        self.event_lastday_list = []
        if self.cur_day > self.max_day:
            raise Exception("days exceed")

        if self.if_disp:
            print("one day adances, current day {}".format(self.cur_day))

        return self.cur_day

    def select_hypercube(self, feature):
        """
        根据视频的特征选择对应的hypercube
        :param feature:
        :return:
        """
        hypercube = self.hypercube_array
        for i in feature:
            hypercube = hypercube[i]

        assert isinstance(hypercube, Hypercube)

        if self.if_disp:
            print("current hypercube: ", feature)

        return hypercube

    def estimate_popularity(self, hypercube, event):
        """
        根据hypercube的总体估计当前事件的流行度
        :param hypercube:
        :param event_id:
        :return:
        """
        hypercube.add_event(event)       # 添加一个事件
        popularity = hypercube.get_popularity()     # 获取popularity
        event.esti_popularity = popularity

        if self.if_disp:
            print("the esti_popularity is {}".format(popularity))

        return popularity

    def event_add_oneday(self):
        """
        添加cur_day当天的event进event_list
        """
        self.curday_values = self.get_one_day_values()
        for value_line in self.curday_values:
            self.event_count += 1
            event = Event(id=value_line[0], user=value_line[1], item=value_line[2], occur_time=value_line[3], esti_popularity=0)
            self.event_lastday_list.append(event)

        if self.if_disp:
            print("today add {} events, current event pos: {}".format(self.curday_values.shape[0], self.event_count))    # for check use

        return self.event_lastday_list

    def curday_event_into_cube(self, is_validate=False):
        count = 0
        hit = 0
        for event in self.event_lastday_list:

            if self.if_disp:
                print("estimate event {}".format(event.id))

            event_feature = self.extract_feature(event.item)
            cube = self.select_hypercube(event_feature)
            _estimate_popularity = self.estimate_popularity(cube, event)

            hit += self.metric(event)
            count += 1

        if self.if_disp:
            print("total estimate {} events".format(count))

        if is_validate:
            print("today hit rate is {}".format(round(hit/count, 3)))

        return count, hit

    def print_cubes(self):
        print("day {} show cubes".format(self.cur_day))
        hypercubes_list = self.enumerate_hypercubes()

        for pos, hypercube in enumerate(hypercubes_list):
            print("current cube {}".format(pos), end=" , ")
            print("M:{} N:{}".format(hypercube.M, hypercube.N), end=" , ")
            print(hypercube.get_popularity())

    def metric(self, event):
        """
        最小堆比较，缓存击中返回1，否则返回0；
        :param event:
        :return:
        """
        hit = 0
        if event.item not in self.cache_set:
            # if len(self.cache_set) < self.cache_size:
            #     self.cache_set.add(event.item)
            #     self.cache_list.put((event.esti_popularity, event.item))
            # else:
            #     (top_priority, top_item) = self.cache_list.queue[0]
            #
            #     if event.esti_popularity >= top_priority:  # 替换条件
            #         (replace_popularity, replace_item) = self.cache_list.get()
            #         self.cache_set.remove(replace_item)
            #         self.cache_list.put((event.esti_popularity, event.item))
            #         self.cache_set.add(event.item)
                    # print("{} out and {} in".format(replace_popularity, event.esti_popularity))
            return hit
        else:
            hit = 1
            return 1

In [6]:
level = 5
# flag = "不按天更新"
#data_path = '/home/zhangxz/workspace/data/Nati1000_U500_V30/'
data_path = '/home/ubuntu/data/dataset/R1584_U50_V2/'
#part_one = pd.read_csv(data_path + 'train.csv', header=None)
#part_two = pd.read_csv(data_path + 'test.csv', header=None)
#UIT = pd.concat([part_one, part_two], axis=0)
UIT = pd.read_csv(data_path + 'UIT.csv', header=None)
UIT = UIT[UIT[2]<18]
UIT[2] = UIT[8]//60 
group_count = 0
result_list = []

trainT = 12*24
testT = 6*24

days = trainT + testT
feature_day_list[-1] = 10*24
print(UIT.shape[0],feature_day_list,days)

item_num = 10000
#list_size = [int(item_num * ratio) for ratio in [0.001,0.0025,0.005,0.0075,0.01,0.025,0.05,0.075,0.1,0.25]]
list_size = [int(item_num * ratio) for ratio in [0.001,0.0025,0.005,0.0075]]
#list_size = [int(item_num * ratio) for ratio in [0.01,0.02,1/30,0.05]]

for cache_size_ratio in list_size:
# for cache_size_ratio in [0.001, 0.0025, 0.005, 0.01]:
    total_each_day_item = np.zeros((testT,), dtype=np.int32)
    total_each_day_hit = np.zeros((testT,), dtype=np.int32)
    for name, group in (UIT.groupby([level])):
        print(name,type(group))
        #print(name, group )
        group_count += 1
        # items_num = group[1].unique().shape[0]
        #group[2] = group[8] // 60 #一个小时
        all_UITs = group.values
        items_seq = max(all_UITs[:, 1] + 1)
        index = np.arange(0, all_UITs.shape[0]).reshape((all_UITs.shape[0], 1))
        all_UITs = np.column_stack((index, all_UITs))

        if cache_size_ratio > 1:
            cache_size = cache_size_ratio
        else:
            cache_size = int(item_num * cache_size_ratio)
        popcache = Popcache(dim, items_seq, days, cur_day=0, all_UITs=all_UITs, cache_size=cache_size,if_disp=False)
        for i in range(trainT):
            event_list = popcache.event_add_oneday()
            item_count, hit = popcache.curday_event_into_cube()
            popcache.update_feature()
            total_popularity = popcache.update_hypercube_estimate_value()
            popcache.update_cache_set()
            # popcache.print_cubes()
            # popcache.enumerate_hypercubes()
            popcache.update_one_day()
        day_k = 1
        count_day = 0
        hit_day = 0
        for i in range(trainT, trainT+testT, 1):
            event_list = popcache.event_add_oneday()
            item_count, hit = popcache.curday_event_into_cube(is_validate=False)
            total_each_day_item[i - trainT] += item_count
            total_each_day_hit[i - trainT] += hit
            popcache.update_feature()
            total_popularity = popcache.update_hypercube_estimate_value()
            popcache.update_cache_set()
            # popcache.print_cubes()
            # popcache.enumerate_hypercubes()
            popcache.update_one_day()
            count_day += item_count
            hit_day += hit
            if i == trainT + day_k*24-1:
                print(hit_day/count_day,end=',')
                day_k +=1
                count_day = 0
                hit_day = 0       
                
    each_day_hit_rate = np.sum(total_each_day_hit)/np.sum(total_each_day_item)
    #print(total_each_day_hit)
    #print(total_each_day_item)
    print()
    print(each_day_hit_rate.round(4), each_day_hit_rate.max().round(4))
    result_list.append(each_day_hit_rate.max().round(4))
print(result_list)

11687829 [5, 30, 120, 240] 432
0 <class 'pandas.core.frame.DataFrame'>


KeyboardInterrupt: 

In [4]:
level = 5
# flag = "不按天更新"
#data_path = '/home/zhangxz/workspace/data/Nati1000_U500_V30/'
data_path = '/home/zhangxz/workspace/data/R1584_U50_V2/'
#part_one = pd.read_csv(data_path + 'train.csv', header=None)
#part_two = pd.read_csv(data_path + 'test.csv', header=None)
#UIT = pd.concat([part_one, part_two], axis=0)
UIT = pd.read_csv(data_path + 'UIT.csv', header=None)
UIT = UIT[UIT[2]<24]
UIT[2] = UIT[8]//60 
group_count = 0
result_list = []

trainT = 18*24
testT = 6*24

days = trainT + testT
feature_day_list[-1] = 15*24
print(UIT.shape[0],feature_day_list,days)

item_num = 20000
#list_size = [int(item_num * ratio) for ratio in [0.001,0.0025,0.005,0.0075,0.01,0.025,0.05,0.075,0.1,0.25]]
list_size = [int(item_num * ratio) for ratio in [0.001,0.0025,0.005,0.0075]]

for cache_size_ratio in list_size:
# for cache_size_ratio in [0.001, 0.0025, 0.005, 0.01]:
    total_each_day_item = np.zeros((testT,), dtype=np.int32)
    total_each_day_hit = np.zeros((testT,), dtype=np.int32)
    for name, group in (UIT.groupby([level])):
        print(name)
        #print(name, group )
        group_count += 1
        # items_num = group[1].unique().shape[0]
        #group[2] = group[8] // 60 #一个小时
        all_UITs = group.values
        items_seq = max(all_UITs[:, 1] + 1)
        index = np.arange(0, all_UITs.shape[0]).reshape((all_UITs.shape[0], 1))
        all_UITs = np.column_stack((index, all_UITs))

        if cache_size_ratio > 1:
            cache_size = cache_size_ratio
        else:
            cache_size = int(item_num * cache_size_ratio)
        popcache = Popcache(dim, items_seq, days, cur_day=0, all_UITs=all_UITs, cache_size=cache_size,if_disp=False)
        for i in range(trainT):
            event_list = popcache.event_add_oneday()
            item_count, hit = popcache.curday_event_into_cube()
            popcache.update_feature()
            total_popularity = popcache.update_hypercube_estimate_value()
            popcache.update_cache_set()
            # popcache.print_cubes()
            # popcache.enumerate_hypercubes()
            popcache.update_one_day()
        day_k = 1
        count_day = 0
        hit_day = 0
        for i in range(trainT, trainT+testT, 1):
            event_list = popcache.event_add_oneday()
            item_count, hit = popcache.curday_event_into_cube(is_validate=False)
            total_each_day_item[i - trainT] += item_count
            total_each_day_hit[i - trainT] += hit
            popcache.update_feature()
            total_popularity = popcache.update_hypercube_estimate_value()
            popcache.update_cache_set()
            # popcache.print_cubes()
            # popcache.enumerate_hypercubes()
            popcache.update_one_day()
            count_day += item_count
            hit_day += hit
            if i == trainT + day_k*24-1:
                print(hit_day/count_day,end=',')
                day_k +=1
                count_day = 0
                hit_day = 0       
                
    each_day_hit_rate = np.sum(total_each_day_hit)/np.sum(total_each_day_item)
    #print(total_each_day_hit)
    #print(total_each_day_item)
    print()
    print(each_day_hit_rate.round(4), each_day_hit_rate.max().round(4))
    result_list.append(each_day_hit_rate.max().round(4))
print(result_list)

13848804 [1, 6, 24, 360] 576
0
0.005822326935424869,0.0060484111369462825,0.005359465263285243,0.00403950993760377,0.003417452113104287,0.006454454534014214,
0.0051 0.0051
0
0.01266279364251015,0.014278900614082156,0.013867465142304086,0.011315232713123163,0.011346913955609607,0.014021631860962638,
0.0128 0.0128
0
0.022649772709249467,0.026149168306206404,0.024544052263859903,0.019575889830616876,0.019395560960778352,0.024658731893417052,
0.0226 0.0226
0
0.03678093878626484,0.039769272044357006,0.036049360312131386,0.03024643824717503,0.029385223472179995,0.036941734390419985,
0.0346 0.0346
[0.0051, 0.0128, 0.0226, 0.0346]


In [5]:
level = 5
# flag = "不按天更新"
#data_path = '/home/zhangxz/workspace/data/Nati1000_U500_V30/'
data_path = '/home/zhangxz/workspace/data/R1584_U50_V2/'
#part_one = pd.read_csv(data_path + 'train.csv', header=None)
#part_two = pd.read_csv(data_path + 'test.csv', header=None)
#UIT = pd.concat([part_one, part_two], axis=0)
UIT = pd.read_csv(data_path + 'UIT.csv', header=None)
UIT = UIT[UIT[2]<30]
UIT[2] = UIT[8]//60 
group_count = 0
result_list = []

trainT = 24*24
testT = 6*24

days = trainT + testT
feature_day_list[-1] = 20*24
print(UIT.shape[0],feature_day_list,days)

item_num = 20000
#list_size = [int(item_num * ratio) for ratio in [0.001,0.0025,0.005,0.0075,0.01,0.025,0.05,0.075,0.1,0.25]]
list_size = [int(item_num * ratio) for ratio in [0.001,0.0025,0.005,0.0075]]
#list_size = [int(item_num * ratio) for ratio in [0.01,0.02,1/30,0.05]]

for cache_size_ratio in list_size:
# for cache_size_ratio in [0.001, 0.0025, 0.005, 0.01]:
    total_each_day_item = np.zeros((testT,), dtype=np.int32)
    total_each_day_hit = np.zeros((testT,), dtype=np.int32)
    for name, group in (UIT.groupby([level])):
        print(name)
        group_count += 1
        # items_num = group[1].unique().shape[0]
        #group[2] = group[8] // 60 #一个小时
        all_UITs = group.values
        items_seq = max(all_UITs[:, 1] + 1)
        index = np.arange(0, all_UITs.shape[0]).reshape((all_UITs.shape[0], 1))
        all_UITs = np.column_stack((index, all_UITs))

        if cache_size_ratio > 1:
            cache_size = cache_size_ratio
        else:
            cache_size = int(item_num * cache_size_ratio)
        popcache = Popcache(dim, items_seq, days, cur_day=0, all_UITs=all_UITs, cache_size=cache_size,if_disp=False)
        for i in range(trainT):
            event_list = popcache.event_add_oneday()
            item_count, hit = popcache.curday_event_into_cube()
            popcache.update_feature()
            total_popularity = popcache.update_hypercube_estimate_value()
            popcache.update_cache_set()
            # popcache.print_cubes()
            # popcache.enumerate_hypercubes()
            popcache.update_one_day()
        day_k = 1
        count_day = 0
        hit_day = 0
        for i in range(trainT, trainT+testT, 1):
            event_list = popcache.event_add_oneday()
            item_count, hit = popcache.curday_event_into_cube(is_validate=False)
            total_each_day_item[i - trainT] += item_count
            total_each_day_hit[i - trainT] += hit
            popcache.update_feature()
            total_popularity = popcache.update_hypercube_estimate_value()
            popcache.update_cache_set()
            # popcache.print_cubes()
            # popcache.enumerate_hypercubes()
            popcache.update_one_day()
            count_day += item_count
            hit_day += hit
            if i == trainT + day_k*24-1:
                print(hit_day/count_day,end=',')
                day_k +=1
                count_day = 0
                hit_day = 0  
        print()       
    each_day_hit_rate = np.sum(total_each_day_hit)/np.sum(total_each_day_item)
    #print(total_each_day_hit)
    #print(total_each_day_item)
    print()
    print(each_day_hit_rate.round(4), each_day_hit_rate.max().round(4))
    result_list.append(each_day_hit_rate.max().round(4))
print(result_list)

15841209 [1, 6, 24, 480] 720
0
0.006472613758497231,0.0067994495850826325,0.005343371083060215,0.005158418770984216,0.004576032143835547,0.004398793104244928,

0.0053 0.0053
0
0.015070818814734694,0.014027155674749716,0.01361441916979445,0.011860504007100866,0.00989021581419889,0.009927318260998137,

0.0121 0.0121
0
0.025869917920001095,0.023806849295833977,0.02483838285506196,0.021231845839175683,0.019635845246912954,0.019330200579420857,

0.0221 0.0221
0
0.04047865167000965,0.035562841376598944,0.03844919651130509,0.032854367932901966,0.03277291313656057,0.03129288476733064,

0.0348 0.0348
[0.0053, 0.0121, 0.0221, 0.0348]


In [6]:
np.set_printoptions(linewidth=170)
hitrate1 = np.array([0.0048, 0.0133, 0.0227, 0.0342, 0.0458, 0.1158, 0.2274,0.3365, 0.4451, 0.8016])
hitrate2 =np.array([0.0051, 0.0128, 0.0226, 0.0346,0.0462, 0.1149, 0.2289, 0.3414, 0.4497, 0.7744])
hitrate3 = np.array([0.0053, 0.0121, 0.0221, 0.0348, 0.047, 0.1159, 0.2621,0.3601, 0.472, 0.7774])
(UIT[(UIT[2]>=12)&(UIT[2]<18)].shape[0]*hitrate1+UIT[(UIT[2]>=18)&(UIT[2]<24)].shape[0]*hitrate2+UIT[(UIT[2]>=24)&(UIT[2]<30)].shape[0]*hitrate3)/UIT[(UIT[2]>=12)&(UIT[2]<30)].shape[0]

array([0.00502221, 0.01286779, 0.02255282, 0.03448503, 0.04618626, 0.11540334, 0.23394396, 0.34273337, 0.45175231, 0.78501887])