In [1]:
import math
import gc
import numpy as np
import pandas as pd
from time import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable

from importlib import reload
import layers
reload(layers)

from data import Dataset, MiniBatcher
from loss import bpr_loss, hinge_loss, top1_loss, bpr2_loss
from metric import precision_recall
from layers import SeqCNN
from model import evaluate
from sequence_utils import pad_sequences

# Constants

In [2]:
# --- Data / sequence shape ---------------------------------------------------
# Size of the item-id space handed to the batcher, the model and the evaluator.
# NOTE(review): the "+ 1" presumably reserves id 0 for padding (the training
# mask is `sequences > 0`) -- confirm against the Dataset implementation.
MAX_ITEM = 4698 + 1
MIN_SEQ_LENGTH = 20       # Min sequence length
MAX_SEQ_LENGTH = 200      # Max sequence length

# --- Model -------------------------------------------------------------------
EMBEDDING_DIM = 32
NUM_LAYERS = 1
ACTIVATE = 'tanh'

# --- Optimisation ------------------------------------------------------------
LOSS = bpr_loss
LEARNING_RATE = 1e-2
L2_NORM = 1e-5            # passed to Adam as `weight_decay`
EPOCHS = 15
BATCH_SIZE = 128

# Data

In [3]:
# Build the interaction dataset. `Dataset` is already imported in the first
# cell together with `MiniBatcher`, so it is not imported a second time here.
data = Dataset(csvfile='interactions.csv',
               num_test_users=500,
               sample=0.15,
               cut_item=100)

# Sequence tables used for training and for evaluation respectively.
train, test = data.get_train_test_sequences()

#User: 7499	#Item: 4698
796479 49833


In [4]:
# Discard test rows that contain missing values before building the inputs.
test = test.dropna()

# Held-out targets the predictions are scored against.
eval_sequences = test.eval_sequence
# Model inputs, brought to a common length of MAX_SEQ_LENGTH.
# NOTE(review): padding side/value are decided inside `pad_sequences` -- confirm.
test_sequences = pad_sequences(test.item_sequence, maxlen=MAX_SEQ_LENGTH)

In [5]:
# Negative-sampling weights, one entry per item id in [0, MAX_ITEM).
# These arrays are unnormalised weights (they do not sum to 1); any
# normalisation is left to the consumer (`MiniBatcher(sampling_prob=...)`).

# Every item equally likely.
uniform_prob = np.ones(MAX_ITEM)

# Popularity weights: start from 1 everywhere, then overwrite the entries of
# items seen in the training data with their interaction counts.
# Assumes the value_counts index holds integer item ids < MAX_ITEM -- TODO confirm.
pop_list = data.train.item.value_counts()
pop_prob = np.ones(MAX_ITEM)
pop_prob[pop_list.index] = pop_list.values

# log(1 + count) compresses the gap between popular and rare items.
log_pop_prob = np.log1p(pop_prob)

# Default weights used by `training`.
NEGATIVE_PROB = log_pop_prob

# Train

In [6]:
def training(seed=123, verbose=False):
    """Train a fresh SeqCNN with the current global settings and evaluate it.

    The configuration is read from module-level names at call time
    (MAX_ITEM, BATCH_SIZE, MAX_SEQ_LENGTH, MIN_SEQ_LENGTH, NEGATIVE_PROB,
    EMBEDDING_DIM, NUM_LAYERS, ACTIVATE, L2_NORM, LEARNING_RATE, LOSS, EPOCHS),
    and so is the data (`train`, `test_sequences`, `eval_sequences`). The
    experiment cells change behaviour by rebinding those globals before
    calling this function.

    Parameters
    ----------
    seed : int
        Seed for the NumPy ``RandomState`` given to the mini-batcher. A second
        seed drawn from that generator is used for ``torch.manual_seed``.
    verbose : bool, default False
        When True, print the summed batch loss and the wall-clock time of
        every epoch. The default keeps the function silent, as before.

    Returns
    -------
    tuple
        ``(p, r, DCG)`` exactly as returned by ``evaluate`` on the test data.
    """
    # Set random state
    random_state = np.random.RandomState(seed)
    torch_seed = random_state.randint(-10**8, 10**8)
    torch.manual_seed(torch_seed)

    # Data
    batcher = MiniBatcher(train, MAX_ITEM,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          maxlen=MAX_SEQ_LENGTH,
                          minlen=MIN_SEQ_LENGTH,
                          sampling_prob=NEGATIVE_PROB,
                          random_state=random_state)
    # Model
    _net = SeqCNN(num_items=MAX_ITEM,
                  embedding_dim=EMBEDDING_DIM,
                  num_layers=NUM_LAYERS,
                  activate=ACTIVATE)
    # Optim
    optimizer = optim.Adam(_net.parameters(),
                           weight_decay=L2_NORM,
                           lr=LEARNING_RATE)
    # Loss function
    loss_function = LOSS

    # Iteration
    for epoch in range(1, EPOCHS+1):
        _net.train(True)
        epoch_loss = 0.0
        start = time()

        # Batch training: the batcher yields (positive sequences, sampled negatives).
        for batch_seq, batch_neg in batcher:
            # Input
            sequences_var     = Variable(torch.from_numpy(batch_seq.astype('int64')))
            neg_sequences_var = Variable(torch.from_numpy(batch_neg.astype('int64')))
            # Positions holding id 0 are excluded from the loss.
            mask = sequences_var > 0

            # Sequence representations
            user_repr, _ = _net.user_representation(sequences_var)

            # Score
            positive_pred = _net(user_repr, sequences_var)
            negative_pred = _net(user_repr, neg_sequences_var)

            optimizer.zero_grad()

            # Loss
            loss = loss_function(positive_pred, negative_pred, mask)
            # `loss.data[0]` fails on 0-dim tensors in current PyTorch; `.item()`
            # is the supported accessor. Fall back for builds that predate it.
            epoch_loss += loss.item() if hasattr(loss, 'item') else loss.data[0]

            # Backward & update
            loss.backward()
            optimizer.step()

        if verbose:
            print("Epoch {0}\tloss: {1:.4f}\ttime: {2:.1f}s".format(
                epoch, epoch_loss, time() - start))

    # NOTE(review): the network is still in train mode here; whether `evaluate`
    # switches it to eval mode is not visible from this notebook -- confirm.
    p,r,DCG = evaluate(_net, test_sequences, eval_sequences, MAX_ITEM)
    return p, r, DCG

In [7]:
def several_training(tag):
    """Run `training` once per seed and gather the metrics in a DataFrame.

    Besides `tag`, this function depends on two module-level names that the
    calling cell must have bound beforehand:

    * ``seeds`` -- iterable of seeds, one training run each;
    * ``f``     -- an open, writable text file that receives one
                   tab-separated line per run.

    Parameters
    ----------
    tag : str
        Experiment label; the metrics are saved to ``log/<tag>.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per run with columns 'precision%', 'recall%' and 'NDCG'
        (the third column stores the third value returned by `training`).
    """
    metric_columns = ['precision%', 'recall%', 'NDCG']
    line_template = "{0}\t{1:.2f}\t{2:.2f}\t{3:.4f}\n"

    df = pd.DataFrame(columns=metric_columns)

    for run_idx, run_seed in enumerate(seeds):
        precision, recall, dcg = training(run_seed)
        # Precision and recall are logged and stored as percentages.
        precision_pct = precision*100
        recall_pct = recall*100
        f.write(line_template.format(run_idx, precision_pct, recall_pct, dcg))
        df.loc[run_idx] = [precision_pct, recall_pct, dcg]

    csv_path = "log/{}.csv".format(tag)
    df.to_csv(csv_path, index=False, float_format="%g")
    return df

## Negative sampling (uniform vs log_pop)

In [None]:
# Compare negative-sampling weightings. `training` reads the global
# NEGATIVE_PROB, so each variant is selected by rebinding it.
probs = {'uniform': uniform_prob, 'log_pop': log_pop_prob}
seeds = [1,10,100,200,500]
explore_prob = {}

try:
    # buffering=1 -> line-buffered, so every result line reaches disk at once.
    with open("log/negative_sampling_prob.log", 'w', 1) as f:
        for name, prob in probs.items():
            NEGATIVE_PROB = prob
            f.write("Negative sampling prob: {}\n".format(name))

            explore_prob[name] = several_training(name)
finally:
    # Restore the default even when a run fails or is interrupted, so later
    # cells do not silently inherit the last experimental setting.
    NEGATIVE_PROB = log_pop_prob  # reset

## Number of layers

In [None]:
# Grid over the number of layers and the activation. `training` reads the
# globals NUM_LAYERS and ACTIVATE, so each combination is selected by
# rebinding them.
# The list is deliberately NOT called `layers`: that name is the module
# imported (and reloaded) in the first cell and must not be shadowed.
layer_counts = [1,2,3]
activates = ['tanh', 'relu']#, 'hybrid']
seeds = [1,10,100,200,500]
explore_layer = {}

try:
    with open("log/layer_activation.log", 'w', 1) as f:  # buffering=1, flush a line each time.
        for layer in layer_counts:
            for activate in activates:
                # New params
                NUM_LAYERS = layer
                ACTIVATE = activate
                f.write("Layer: {}\tActivate: {}\n".format(NUM_LAYERS, ACTIVATE))

                # Run every seed and keep the resulting DataFrame
                combine = 'conv{}_{}'.format(layer, activate)
                explore_layer[combine] = several_training(combine)
finally:
    # Restore the defaults even when a run fails or is interrupted, so later
    # cells do not silently inherit the last experimental setting.
    NUM_LAYERS = 1
    ACTIVATE = 'tanh'

In [None]:
# Per-configuration summary: mean and standard deviation of each metric
# across the seeds, printed as "<metric>: <mean>+<std>".
row_fmt = "{0}: \t{1:.4f}+{2:.4f}"
for k, df in explore_layer.items():
    print(k, "Conv layers")
    col_means = df.mean()
    col_stds = df.std()
    for col, mean, std in zip(df.columns, col_means, col_stds):
        print(row_fmt.format(col, mean, std))

# Embedding dimension

In [None]:
# Sweep over the embedding size. `training` reads the global EMBEDDING_DIM,
# so each value is selected by rebinding it.
NUM_LAYERS = 1
embeddings = [8, 16, 32, 48, 64, 100]
seeds = [1,10,100,200,500]
explore_emb = {}

try:
    # buffering=1 -> line-buffered, so every result line reaches disk at once.
    with open("log/embedding_dim.log", 'w', 1) as f:
        for e in embeddings:
            EMBEDDING_DIM = e
            # Header text (including its spelling) is kept as-is so that new
            # logs stay comparable with existing ones.
            f.write("Eembbeding dim: {}\n".format(EMBEDDING_DIM))

            # Run every seed and keep the resulting DataFrame
            tag = 'embedding{}'.format(e)
            explore_emb[tag] = several_training(tag)
finally:
    # Restore the default even when a run fails or is interrupted, so later
    # cells do not silently inherit the last experimental setting.
    EMBEDDING_DIM = 32  # reset

In [None]:
# Collect the per-configuration mean / std of each metric into two tables
# (one row per embedding size) and print them as "<metric>: <mean>+<std>".
metric_columns = ['precision%', 'recall%', 'NDCG']
row_fmt = "{0}: \t{1:.4f}+{2:.4f}"

df_mean = pd.DataFrame(columns=metric_columns)
df_std = pd.DataFrame(columns=metric_columns)
for k, df in explore_emb.items():
    print(k, "Embedding dim")
    col_means = df.mean()
    col_stds = df.std()
    df_mean.loc[k] = col_means
    df_std.loc[k] = col_stds
    for col, mean, std in zip(df.columns, col_means, col_stds):
        print(row_fmt.format(col, mean, std))

# Loss

In [10]:
# Compare loss functions. `training` reads the global LOSS, so each candidate
# is selected by rebinding it.
NUM_LAYERS = 1
losses = {'top1_loss': top1_loss, 'bpr2_loss': bpr2_loss}
seeds = [1,10,100,200,500]
explore_loss = {}

try:
    # buffering=1 -> line-buffered, so every result line reaches disk at once.
    with open("log/loss_function.log", 'w', 1) as f:
        for name, loss in losses.items():
            LOSS = loss
            f.write("Loss function: {}\n".format(name))

            # Run every seed and keep the resulting DataFrame
            explore_loss[name] = several_training(name)
finally:
    # Restore the default even when a run fails or is interrupted, so later
    # cells do not silently inherit the last experimental setting.
    LOSS = bpr_loss  # reset

# 万元利息
投入万元，计算不同利率下 N 年后的利息。
$本息 =本金 \times (1+利率\%)^{N}$

In [41]:
def f(x, r, n):
    """Return principal plus compound interest after ``n`` periods.

    ``x`` is the principal, ``r`` the per-period rate as a fraction
    (e.g. 0.05 for 5%), and ``n`` the number of compounding periods.
    """
    growth = (1 + r) ** n
    return x * growth

x = 10000  # principal: ten thousand
rates = [0.04, 0.05, 0.06, 0.1, 0.15]  # annual rates: 4% 5% 6% 10% 15%
years = [1, 3, 5, 10]  # holding periods, in years

# Column labels are generated from `years` so the labels and the computed
# values cannot drift apart when the list of periods is edited.
interest = pd.DataFrame(index=rates,
                        columns=['{}年'.format(year) for year in years])
for rate in rates:
    # Interest earned = final amount minus the principal.
    interest.loc[rate] = [f(x, rate, year) - x for year in years]
# Displayed truncated to whole currency units.
interest.astype('int')

Unnamed: 0,1年,3年,5年,10年
0.04,400,1248,2166,4802
0.05,500,1576,2762,6288
0.06,600,1910,3382,7908
0.1,1000,3310,6105,15937
0.15,1500,5208,10113,30455
