In [2]:
import pandas as pd
import sklearn.model_selection

## Create Dataset
Can choose between 'medium' dataset size for code testing, 'large' for larger dataset, or 'huge' for full dataset (takes a long time to run)

Returns non long tail CSV and long tail CSV from selected datasets

In [3]:
def get_long_tail(tup):
    csv, rows = tup
    df = pd.read_csv(csv, nrows=rows)
    print(df.shape)
    value_counts = df['item_id'].value_counts()
    smallest_100 = value_counts[value_counts < 100]
    smallest_100_tolist = smallest_100.index.tolist()
    test = df[df['item_id'].isin(smallest_100_tolist)]

    train = pd.merge(df, test, on=["user_id", "item_id", "click", "video_category", "gender", "age", "hist_1", "hist_2",
                       "hist_3", "hist_4", "hist_5", "hist_6", "hist_7", "hist_8", "hist_9", "hist_10"], how='outer', indicator=True)
    #train = pd.merge(df, test, how='outer', indicator=True)
    train = train.loc[train['_merge'] == 'left_only']
    train.drop('_merge', axis=1, inplace=True)
    return train.dropna(axis=1, how='all'), test


ds = 120000000
original = 'ctr_data_1M.csv'

small = ('file_name.csv', 50000)
med = (original, 500000)
large = (original, 1200000)
huge = (original, ds)

# Select dataset here
df = get_long_tail(med)
print(df)
nlt, lt = df
#train, test = sklearn.model_selection.train_test_split(df, test_size=0.1) 

#todo: to csv
nlt.to_csv('data_nlt.csv')
lt.to_csv('data_longtail.csv')

(500000, 20)
(        user_id  item_id  click  follow_x  like_x  share_x video_category  \
5             1      311      1         0       0        0              0   
98            1     1555      0         0       0        0              1   
99            1      976      1         0       0        0              0   
102           1     1540      0         0       0        0              1   
107           1     1601      0         0       0        0              0   
...         ...      ...    ...       ...     ...      ...            ...   
512413     3766      242      1         0       0        0              1   
512415     3766     2072      0         0       0        0              1   
512424     3766      113      1         0       0        0              1   
512433     3766     1413      1         0       0        0              1   
512437     3766     4467      1         0       0        0              1   

        watching_times_x  gender  age  hist_1  hist_2  hist_3

## Create model
Ensure the gradients in model/wdl.py are frozen (set to False). This trains the model on the non long tail dataset, freezes the lower layers, and saves the model to a checkpoint file.

In [23]:
%run -i main.py --task_name=ctr --seed=100 --model_name=wdl --dataset_path=data_nlt.csv --train_batch_size=4096 --test_batch_size=4096 --epochs=20 --lr=0.00005

Namespace(add_num_times=2, alpha=0.4, anneal_cap=0.2, bert_mask_prob=0.3, best_metric='NDCG@10', block_num=2, cand_num=100, ch=True, context_window=2, dataset_path='data_nlt.csv', decay_step=5, device='cuda', dilations=[1, 4], dropout=0.3, early_stop=True, embedding_size=128, epochs=20, eval=True, factor_num=128, gamma=0.5, hidden_size=128, hidden_size_list=[128, 128], init_method='default', is_mp=False, is_parallel=False, is_pretrain=1, item_min=10, k=20, kd=False, kernel_size=3, l2_emb=0.0, latent_dim=128, lifelong_eval=True, ll_max_itemnum=0, local_rank=None, loss_type='BPR', lr=5e-05, max_len=20, mess_dropout=0.1, metric_ks=[5, 20], model_name='wdl', mtl_task_num=1, negsample_savefolder='./data/neg_data/', negsample_size=99, node_dropout=0.1, num_embedding=1, num_gpu=1, num_groups=4, num_heads=4, num_items=1, num_labels=1, num_ng=4, num_users=1, optimizer='default', pad_token=0, pretrain_path='', prun_rate=0, re_epochs=20, reg_1=0.0, reg_2=0.0, rho=0.5, sample='random', sample_meth

100%|██████████| 15/15 [00:00<00:00, 313.77it/s]

linears.0.weight Parameter containing:
tensor([[ 3.0481e-05, -9.8775e-05, -2.4937e-04,  ...,  7.6578e-05,
         -1.1914e-06, -5.0379e-05],
        [ 2.1310e-05, -2.5272e-05, -4.3776e-05,  ..., -5.9525e-05,
          1.3884e-04, -1.4521e-04],
        [ 6.4192e-05, -3.7306e-05,  7.7259e-05,  ...,  3.3037e-05,
          1.2796e-04, -2.2204e-06],
        ...,
        [ 1.8808e-04,  1.8227e-04,  4.8448e-05,  ..., -1.7503e-04,
          5.8444e-05,  9.9155e-05],
        [-8.5096e-05, -9.6080e-05,  3.4864e-05,  ..., -3.0924e-05,
          5.9960e-06, -2.3068e-05],
        [-7.2800e-05, -4.4410e-05,  7.7644e-05,  ...,  1.4750e-04,
          4.3578e-05, -1.9624e-04]], device='cuda:0')
linears.0.bias Parameter containing:
tensor([ 3.4099e-02,  1.6013e-02,  4.2084e-02, -3.1188e-02, -5.4217e-03,
         7.0522e-03, -2.8518e-02,  1.2258e-02,  3.4804e-02, -1.8269e-02,
        -1.2179e-02,  3.1905e-02,  1.0603e-02, -3.0831e-02,  2.9666e-02,
         3.0103e-04, -2.5326e-02,  1.7944e-03,  3.3426e-




Epoch 1/20
0s - loss:  0.6920 - auc:  0.5979 - acc:  0.6663 - val_auc:  0.6616 - val_acc:  0.6614
Epoch 2/20
0s - loss:  0.6908 - auc:  0.7400 - acc:  0.6663 - val_auc:  0.6714 - val_acc:  0.6614
Epoch 3/20
0s - loss:  0.6898 - auc:  0.7505 - acc:  0.6663 - val_auc:  0.6736 - val_acc:  0.6614
Epoch 4/20
0s - loss:  0.6887 - auc:  0.7537 - acc:  0.6663 - val_auc:  0.6744 - val_acc:  0.6614
Epoch 5/20
0s - loss:  0.6876 - auc:  0.7551 - acc:  0.6663 - val_auc:  0.6747 - val_acc:  0.6614
Epoch 6/20
0s - loss:  0.6866 - auc:  0.7560 - acc:  0.6663 - val_auc:  0.6750 - val_acc:  0.6614
Epoch 7/20
0s - loss:  0.6855 - auc:  0.7566 - acc:  0.6663 - val_auc:  0.6751 - val_acc:  0.6614
Epoch 8/20
0s - loss:  0.6845 - auc:  0.7571 - acc:  0.6664 - val_auc:  0.6752 - val_acc:  0.6614
Epoch 9/20
0s - loss:  0.6834 - auc:  0.7574 - acc:  0.6664 - val_auc:  0.6753 - val_acc:  0.6614
Epoch 10/20
0s - loss:  0.6824 - auc:  0.7578 - acc:  0.6664 - val_auc:  0.6754 - val_acc:  0.6614
Epoch 11/20
0s - lo

AttributeError: Can't pickle local object 'BaseModel._get_metrics.<locals>.<lambda>'

## Finish training model
Ensure the gradients in model/wdl.py are set to True. This finishes training the model on the long tail items only.

In [19]:
%run -i main.py --task_name=ctr2 --seed=100 --model_name=wdl --dataset_path=data_longtail.csv --train_batch_size=4096 --test_batch_size=4096 --epochs=20 --lr=0.00005

Namespace(add_num_times=2, alpha=0.4, anneal_cap=0.2, bert_mask_prob=0.3, best_metric='NDCG@10', block_num=2, cand_num=100, ch=True, context_window=2, dataset_path='data_longtail.csv', decay_step=5, device='cuda', dilations=[1, 4], dropout=0.3, early_stop=True, embedding_size=128, epochs=20, eval=True, factor_num=128, gamma=0.5, hidden_size=128, hidden_size_list=[128, 128], init_method='default', is_mp=False, is_parallel=False, is_pretrain=1, item_min=10, k=20, kd=False, kernel_size=3, l2_emb=0.0, latent_dim=128, lifelong_eval=True, ll_max_itemnum=0, local_rank=None, loss_type='BPR', lr=5e-05, max_len=20, mess_dropout=0.1, metric_ks=[5, 20], model_name='wdl', mtl_task_num=1, negsample_savefolder='./data/neg_data/', negsample_size=99, node_dropout=0.1, num_embedding=1, num_gpu=1, num_groups=4, num_heads=4, num_items=1, num_labels=1, num_ng=4, num_users=1, optimizer='default', pad_token=0, pretrain_path='', prun_rate=0, re_epochs=20, reg_1=0.0, reg_2=0.0, rho=0.5, sample='random', sample

100%|██████████| 15/15 [00:00<00:00, 148.38it/s]

linears.0.weight Parameter containing:
tensor([[-9.5275e-05, -1.4188e-04, -5.6926e-05,  ...,  5.6338e-05,
         -1.9277e-04,  1.3893e-04],
        [ 7.1253e-05,  4.2341e-05, -7.8404e-05,  ..., -8.5493e-05,
          1.3951e-05,  1.1881e-04],
        [ 8.0746e-05,  9.8401e-05, -9.6685e-05,  ...,  1.0969e-04,
         -9.6909e-05, -8.1658e-05],
        ...,
        [ 9.0726e-05, -1.3722e-04,  1.1560e-06,  ...,  8.7109e-05,
          8.2887e-05,  4.2041e-05],
        [ 9.1943e-05, -6.3253e-05, -8.4764e-05,  ...,  1.0781e-04,
          7.3561e-05, -5.0638e-05],
        [-1.5677e-04,  1.8946e-04, -1.1029e-04,  ..., -3.7671e-05,
         -1.0408e-04,  5.5160e-05]], device='cuda:0')
linears.0.bias Parameter containing:
tensor([-3.3047e-02, -1.7115e-02,  9.7987e-03, -1.2895e-02, -3.9120e-03,
        -2.4593e-02, -7.5216e-04,  4.1958e-02,  3.6821e-02,  2.1513e-03,
        -4.2643e-02,  6.4565e-05,  4.4919e-02,  9.1858e-03, -4.4060e-02,
        -1.9113e-02, -1.4512e-02,  2.6869e-02,  2.5116e-




AttributeError: 'collections.OrderedDict' object has no attribute 'load_state_dict'

In [None]:
!tar chvfz notebook.tar.gz *

ctr_data_1M.csv


In [None]:
'''
%run -i main.py --task_name=ctr --seed=100 --model_name=wdl --dataset_path=ctr_data_1M.csv --train_batch_size=4096 --test_batch_size=4096 --epochs=20 --lr=0.00005

            for name, para in self.dnn.named_parameters():
                print(name, para)
                para.requires_grad = False
'''