# LOAD USER AND DOC INFO

In [23]:
from collections import namedtuple
import pandas as pd
from typing import Dict
import numpy as np
import tensorflow as tf
from tensorflow import keras

DATA_DIR = "/Users/hanshen/work/AI-RecommenderSystem/Dataset/news_data_bigger"

user_info_cols = ["userid", "device", "operating_system", "province", "city", "age", "gender"]
doc_info_cols = ["docid", "title", "create_time", "image_num", "cate1", "cate2", "keywords"]
show_info_cols = ["userid", "docid", "exp_time", "network", "rt", "rit", "click", "reading_time"]

UserInfo = namedtuple("Userinfo", user_info_cols)

DocInfo = namedtuple("DocInfo", doc_info_cols)

In [24]:
def clean_create_time(value):
    if len(value) == 0:
        return np.uint32(0)
    return np.uint32(value)

def clean_image_num(value):
    if len(value) == 0:
        return np.uint8(0)
    return np.uint8(value)

In [25]:
user_info = pd.read_csv(
    "/Users/hanshen/work/AI-RecommenderSystem/Dataset/news_data_bigger/user_info.txt",
    sep="\t", header=None, names=user_info_cols,
    dtype=str,
    keep_default_na=False
)

doc_info = pd.read_csv(
    "/Users/hanshen/work/AI-RecommenderSystem/Dataset/news_data_bigger/doc_info.txt",
    sep="\t", header=None, names=doc_info_cols,
    converters={
        "create_time": clean_create_time,
        "image_num": clean_image_num,
    },
    dtype={
        "docid": str
    }
)

show_info = pd.read_csv(
    "/Users/hanshen/work/AI-RecommenderSystem/Dataset/news_data_bigger/sorted_train_data.txt",
    sep="\t", names=show_info_cols,
    dtype=str,
    keep_default_na=False,
    nrows=1000000,
)

In [26]:
def reduce_proba(row: str):
    '''Suppose row has the following format: key1:[float],key2:[float]
    '''
    if not isinstance(row, str):
        return "UNK"
    if len(row) == 0:
        return "UNK"
    classes = row.split(",")
    assert len(classes) >= 1, "unkown format: [{}]".format(row)
    max_proba = 0
    max_class = "UNK"
    for cls_pair in classes:
        cls, proba = cls_pair.split(":")
        if float(proba) > max_proba:
            max_class = cls
            max_proba = float(proba)
    return max_class

In [27]:
user_info["age"] = user_info["age"].apply(reduce_proba)
user_info["gender"] = user_info["gender"].apply(reduce_proba)

In [28]:
user_info_dict: Dict[int, UserInfo] = {}
doc_info_dict: Dict[int, DocInfo] = {}

for row in user_info.iterrows():
    user_info_dict[row[1]["userid"]] = UserInfo(*(row[1]))

for row in doc_info.iterrows():
    doc_info_dict[row[1]["docid"]] = DocInfo(*(row[1]))

In [29]:
doc_info_dict["349635709"]

DocInfo(docid='349635709', title='拿到c1驾照后,实习期扣分了会怎样?扣12分驾照会吊销么?', create_time=561940664, image_num=9, cate1='汽车', cate2='汽车/用车', keywords='上班族:8.469502,买车:8.137443,二手车:9.022247,副页:11.218712,国人:5.104467,大学生:7.731338,家庭:6.529803,家用车:7.034796,标志:9.054356,汽车:7.582007,注意^^事项:7.826521,独立:7.015873,行业:6.600394,车主:9.498086,车尾:5.677475,违章行为:9.028495,驾照^^实习期:7.906046,驾考:11.347330,驾驶证:9.018305,驾驶证^^副页:7.525696')

# MODIFY DATA

## basic data

In [183]:
import os, pickle

In [184]:
vocabs = {
    "userid": user_info.userid.unique(),
    "device": user_info.device.unique(),
    "operating_system": user_info.operating_system.unique(),
    "province": user_info.province.unique(),
    "city": user_info.city.unique(),
    "age": user_info.age.unique(),
    "gender": user_info.gender.unique(),
    "docid": doc_info.docid.unique(),
    "network": show_info.network.unique(),
    "rt": show_info.rt.unique(),
    "rit": show_info.rit.unique(),
}

In [185]:
converter_layers: Dict[str, keras.layers.StringLookup] = {}

for key, vocab in vocabs.items():
    if os.path.exists("./{}.pkl".format(key)):
        print("trying to load {} StringLookup layer...".format(key))
        from_disk = pickle.load(open("./{}.pkl".format(key), "rb"))
        new_layer = keras.layers.StringLookup().from_config(from_disk["config"])
        new_layer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
        new_layer.set_weights(from_disk['weights'])
    else:
        print("trying to create {} StringLookup layer...".format(key))
        new_layer = keras.layers.StringLookup(num_oov_indices=1)
        new_layer.adapt(data=vocabs[key])
        pickle.dump({
            "config": new_layer.get_config(),
            "weights": new_layer.get_weights(),
        }, open("./{}.pkl".format(key), "wb"))
    converter_layers[key] = new_layer

trying to create userid StringLookup layer...
trying to create device StringLookup layer...
trying to create operating_system StringLookup layer...
trying to create province StringLookup layer...
trying to create city StringLookup layer...
trying to create age StringLookup layer...
trying to create gender StringLookup layer...
trying to create docid StringLookup layer...
trying to create network StringLookup layer...
trying to create rt StringLookup layer...
trying to create rit StringLookup layer...


## click sequence

In [186]:
# show_log_cols = ["userid", "docid", "exp_time", "network", "rt", "rit", "click", "reading_time"]

ShowLog = namedtuple("Userinfo", ["userid", "docid", "exp_time", "network", "rt", "rit", "click", "reading_time"])

SEQ_LENGTH = 20

### make seq file

In [187]:
# from collections import deque

# SEQ_LENGTH = 20

# user_seq_buffer: Dict[str, deque] = {}

# with open("/Users/hanshen/work/AI-RecommenderSystem/Dataset/news_data_bigger/sorted_train_data.txt", "r") as show_log_f, open("/Users/hanshen/work/AI-RecommenderSystem/Dataset/news_data_bigger/clk_seq_from_sorted_train_data.txt", 'w') as seq_f:
#     for line in show_log_f:
#         parts = line[:-1].split("\t")
#         show_log = ShowLog(*parts)
#         if show_log.userid not in user_seq_buffer:
#             user_seq_buffer[show_log.userid] = deque(maxlen=SEQ_LENGTH)
#         dq_of_this_user = user_seq_buffer[show_log.userid]
#         seq_str = " ".join(dq_of_this_user)
#         seq_f.write(",".join(show_log[:3] + (seq_str,)) + "\n")
#         if show_log.click == "1":
#             dq_of_this_user.append(show_log.docid)

### make dataset

In [188]:
converter_layers["docid"](tf.constant([""]))

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([0])>

In [444]:
def get_seq_feat(ele):
    ele = tf.strings.split(ele, ",").to_tensor()
    # tf.print("1", ele, tf.shape(ele))
    ele = ele[:, 3]
    # tf.print("2", ele, tf.shape(ele))
    ele = tf.strings.split(ele, " ").to_tensor()
    # tf.print("3", ele, tf.shape(ele))
    res = converter_layers["docid"](ele)
    # tf.print(res, type(res), tf.shape(res))
    return res

seq_dataset = tf.data.TextLineDataset([DATA_DIR + "/clk_seq_from_sorted_train_data.txt"])\
    .batch(1024, drop_remainder=True)\
    .map(get_seq_feat)
    # .unbatch()

In [293]:
for e in seq_dataset.skip(10000).take(2):
    # print(e)
    pass

## other feature

### make feat by pythonic method

In [80]:
# 用户id、文章id、展现时间、网路环境、刷新次数、展现位置、是否点击、消费时长（秒）；


def py_get_user_feat(uid):
    # print(uid.numpy(), type(uid.numpy()))
    user_info = user_info_dict[uid.numpy().decode()]  # Because of this dict access, we cannot do the tensorflow style pipeline
    res = []
    for k, v in zip(user_info._fields, user_info):
        res.append(converter_layers[k](v))
    return res


def tf_get_feat(ele: tf.Tensor):
    ele: tf.Tensor = tf.strings.split(ele, "\t")
    user_info = tf.py_function(py_get_user_feat, [ele[0]], [tf.int64] * len(user_info_cols))
    network = converter_layers["network"](ele[3])
    rt = converter_layers["rt"](ele[4])
    rit = converter_layers["rit"](ele[5])
    feat = tf.concat([user_info] + [[network], [rt], [rit]], axis=0)
    return feat

train_show_log = tf.data.TextLineDataset(["/Users/hanshen/work/AI-RecommenderSystem/Dataset/news_data_bigger/sorted_train_data.txt"])\
    .map(tf_get_feat)

In [88]:
%%timeit
for e in train_show_log.take(1000):
    # print(e)
    pass

4.57 s ± 47.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### build MutableHashTable

In [196]:
# 用户id、设备名称、操作系统、所在省、所在市、年龄、性别；

user_info_table = tf.lookup.experimental.MutableHashTable(
    key_dtype=tf.int64,
    value_dtype=tf.string,
    default_value=["1", "1", "1", "1", "1", "1", "1"],
 )


def build_hash_table(ele):
    # tf.print(ele[:, 0], type(ele), ele.shape)
    key = tf.strings.to_number([ele[:, 0]], out_type=tf.int64)
    # tf.print(key, type(key), key.shape)
    # tf.print(key)
    user_info_table.insert(key, tf.expand_dims(ele, axis=0))
    return 1


user_info_ = tf.data.Dataset.from_tensor_slices(user_info)\
    .batch(1000)\
    .map(build_hash_table)



for e in user_info_:
    # print(e)
    pass

In [201]:
user_info_table.lookup(tf.constant([1001384888], dtype=tf.int64))

<tf.Tensor: shape=(1, 7), dtype=string, numpy=
array([[b'1001384888', b'M2007J22C', b'Android',
        b'\xe6\xb2\xb3\xe5\x8c\x97',
        b'\xe7\x9f\xb3\xe5\xae\xb6\xe5\xba\x84', b'A_40+', b'male']],
      dtype=object)>

### make feat by tf method

In [445]:
# 用户id、文章id、展现时间、网路环境、刷新次数、展现位置、是否点击、消费时长（秒）；

def tf_get_feat_from_table(ele: tf.Tensor):
    ele: tf.Tensor = tf.strings.split(ele, "\t").to_tensor()
    uids = tf.strings.to_number([ele[:, 0]], out_type=tf.int64)
    label = tf.strings.to_number([ele[:, 6]], out_type=tf.int64)
    label = tf.reshape(label, shape=[-1, 1])  # [1, batch] to [batch, 1]
    # tf.print(label, type(label), tf.shape(label))
    values = user_info_table.lookup(uids)
    values = tf.squeeze(values, axis=0)
    # tf.print(values, tf.shape(values))
    feat_dict = {}
    for idx, key in enumerate(user_info_cols):
        tmp = converter_layers[key](values[:, idx])
        feat_dict[key] = tmp
    # user_feat = tf.stack(user_feat, axis=1)
    # tf.print(user_feat)
    docid = converter_layers["docid"](ele[:, 1])
    feat_dict["docid"] = docid
    network = converter_layers["network"](ele[:, 3])
    feat_dict["network"] = network
    rt = converter_layers["rt"](ele[:, 4])
    feat_dict["rt"] = rt
    rit = converter_layers["rit"](ele[:, 5])
    feat_dict["rit"] = rit
    # tf.print(network, tf.shape(network))
    # context_feat = tf.stack([network, rt, rit], axis=1)
    # tf.print(context_feat, tf.shape(context_feat))
    # feat = tf.stack(user_feat + [network, rt, rit], axis=1)
    # tf.print(feat)
    return (feat_dict, label)



train_show_log = tf.data.TextLineDataset(["/Users/hanshen/work/AI-RecommenderSystem/Dataset/news_data_bigger/sorted_train_data.txt"])\
    .batch(1024, drop_remainder=True)\
    .map(tf_get_feat_from_table)
    # .unbatch()

In [370]:
# %%timeit

for e in train_show_log.take(1):
    print(e[0])
    pass

{'userid': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([ 235691,  528795, 1248234, ...,  947192, 1472675, 1472675])>, 'device': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([1842, 2236, 1367, ..., 1137,   96,   96])>, 'operating_system': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([2, 2, 2, ..., 2, 2, 2])>, 'province': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([ 14,  69, 105, ...,  52,  84,  84])>, 'city': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([295, 179, 228, ..., 256, 113, 113])>, 'age': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([2, 3, 2, ..., 4, 2, 2])>, 'gender': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([1, 1, 1, ..., 1, 1, 1])>, 'docid': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([581525, 562325, 583990, ..., 518838, 530587, 549283])>, 'network': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([1, 4, 4, ..., 4, 4, 4])>, 'rt': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([214, 214, 102, ..., 214,

### concat show log and req feat

In [446]:
def merge_sparse_with_seq_into_one_dict(a, seq):
    sparse, label = a
    for _, value in sparse.items():
        value.set_shape([1024])
    sparse["docid_seq"] = seq
    return (sparse, label)

dataset = tf.data.Dataset.zip((train_show_log, seq_dataset)).map(merge_sparse_with_seq_into_one_dict)

In [438]:
for e in dataset.take(1):
    print(e)
    pass

({'userid': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([ 235691,  528795, 1248234, ...,  947192, 1472675, 1472675])>, 'device': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([1842, 2236, 1367, ..., 1137,   96,   96])>, 'operating_system': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([2, 2, 2, ..., 2, 2, 2])>, 'province': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([ 14,  69, 105, ...,  52,  84,  84])>, 'city': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([295, 179, 228, ..., 256, 113, 113])>, 'age': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([2, 3, 2, ..., 4, 2, 2])>, 'gender': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([1, 1, 1, ..., 1, 1, 1])>, 'docid': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([581525, 562325, 583990, ..., 518838, 530587, 549283])>, 'network': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([1, 4, 4, ..., 4, 4, 4])>, 'rt': <tf.Tensor: shape=(1024,), dtype=int64, numpy=array([214, 214, 102, ..., 214

In [448]:
# train_dataset, test_dataset = tf.keras.utils.split_dataset(dataset.take(1_000_000), left_size=0.9)
# dataset has approximately 185319 batches.

train_dataset = dataset.take(100_000)
test_dataset = dataset.skip(100_000).take(30_000)

### make negative sample

In [216]:
tf.constant(["1"]) == tf.constant(["1"])

<tf.Tensor: shape=(1,), dtype=bool, numpy=array([ True])>

In [225]:
# 用户id、文章id、展现时间、网路环境、刷新次数、展现位置、是否点击、消费时长（秒）；

# doc_click_freq = tf.lookup.experimental.MutableHashTable(
#     key_dtype=tf.int64,
#     value_dtype=tf.int64,
#     default_value=0,
# )


# def split_ele(ele):
#     return tf.strings.split(ele, "\t")


# def is_click(ele):
#     res = (ele[6] == tf.constant(["1"]))
#     # tf.print(res[0], type(res[0]))
#     return res[0]


# click_log_dataset = tf.data.TextLineDataset(DATA_DIR + "/sorted_train_data.txt")\
#     .map(split_ele)
#     .filter(is_click)

# for e in click_log_dataset.take(10):
#     print(e)
#     pass

tf.Tensor(b'2382994490\t462336897\t1624546614290\t2\t0\t16\t1\t132', shape=(), dtype=string)
tf.Tensor(b'1499193026\t461957659\t1624546617762\t2\t2\t34\t1\t192', shape=(), dtype=string)
tf.Tensor(b'2226310380\t462616014\t1624546621702\t5\t1\t14\t1\t718', shape=(), dtype=string)
tf.Tensor(b'2230458112\t462830838\t1624546621712\t5\t0\t13\t1\t163', shape=(), dtype=string)
tf.Tensor(b'2216481992\t462753067\t1624546621919\t5\t0\t12\t1\t386', shape=(), dtype=string)
tf.Tensor(b'2439786870\t462579914\t1624546626211\t2\t0\t11\t1\t141', shape=(), dtype=string)
tf.Tensor(b'2391075066\t462456982\t1624546638325\t5\t2\t14\t1\t43', shape=(), dtype=string)
tf.Tensor(b'2391075066\t462573736\t1624546638325\t5\t2\t19\t1\t217', shape=(), dtype=string)
tf.Tensor(b'1499904554\t461849959\t1624546648071\t2\t12\t7\t1\t99', shape=(), dtype=string)
tf.Tensor(b'1251029748\t462861190\t1624546652702\t2\t0\t13\t1\t151', shape=(), dtype=string)


In [213]:
samples = tf.random.categorical(tf.math.log([[0.7, 0.3]]), 100)

In [215]:
tf.reduce_sum(samples)

<tf.Tensor: shape=(), dtype=int64, numpy=27>

# DEFINE MODEL

In [449]:
import importlib
import model
importlib.reload(model)

sparse_configs = []
doc_embedding = None

for key, layer in converter_layers.items():
    if key == "docid":
        doc_embedding = model.EmbeddingConfig(key, 16, layer.vocabulary_size())
        continue
    sparse_configs.append(
        model.EmbeddingConfig(key, 16, layer.vocabulary_size())
    )

In [450]:
checkpoint_filepath = "./ckpt/{epoch:02d}-{val_auc:.2f}.hdf5"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor="val_auc",
    mode="max",
    save_best_only=True,
    verbose=1,
)

In [451]:
import datetime

log_dir = "./logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [452]:
youtubednn = model.YouTubeDNN(sparse_configs, doc_embedding)
youtubednn.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=keras.metrics.AUC(),
)

In [453]:
youtubednn.fit(
    x = train_dataset,
    validation_data=test_dataset,
    callbacks=[
        model_checkpoint_callback,
        tensorboard_callback,
    ]
)

 100000/Unknown - 21020s 210ms/step - loss: 0.3607 - auc_21: 0.5197

KeyError: 'Failed to format this callback filepath: "./ckpt/{epoch:02d}-{val_auc:.2f}.hdf5". Reason: \'val_auc\''

In [395]:
tf.constant([[1,2,3]]) * tf.constant([[1,2,3]])

<tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[1, 4, 9]], dtype=int32)>

In [396]:
tf.tensordot(tf.constant([[1,2,3]]), tf.constant([[1,2,3]]), axes=[[1], [1]])

<tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[14]], dtype=int32)>