In [60]:
%load_ext autoreload
%autoreload
import os
from tqdm import tqdm
import numpy as np
import torch
import pandas as pd
from sklearn.preprocessing import normalize
from RS.utils.dataset import Dataset as Mydataset
from RS.utils.dataset import combine_multi_domain
from RS.utils.dataset import user_item_clustering
from RS.utils.mf import ALS_MF
from RS.utils.dictutils import *
# Root directory (relative to the notebook) that holds the raw CSV files.
dataroot = os.path.join("data")

# Device handed to the clustering routines below via `d=d`.
# The notebook was developed on GPU index 4; fall back gracefully so that a
# fresh "Restart & Run All" also works on machines with fewer GPUs or none.
# NOTE(review): assumes the RS.utils routines accept a CPU device — confirm.
PREFERRED_GPU_INDEX = 4
if torch.cuda.is_available() and torch.cuda.device_count() > PREFERRED_GPU_INDEX:
    d = torch.device(f"cuda:{PREFERRED_GPU_INDEX}")
elif torch.cuda.is_available():
    d = torch.device("cuda:0")
else:
    d = torch.device("cpu")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
# Per-domain directories under the data root.
course_dir = os.path.join(dataroot, "course")
book_dir = os.path.join(dataroot, "book")

# Logical split name -> CSV path. Insertion order is kept as-is because the
# Dataset constructor reports the files in this order when reading them.
datafolder = {
    "training_user_course": os.path.join(course_dir, "train.csv"),
    "training_user_book": os.path.join(book_dir, "user_cate3_train.csv"),
    "testing_user_course": os.path.join(course_dir, "test.csv"),
    "testing_user_book": os.path.join(book_dir, "user_cate3_test.csv"),
}

# Build the project Dataset wrapper over all four splits.
dataset = Mydataset(datafolder=datafolder)

build dataset
read training_user_course:data/course/train.csv
..OK
read training_user_book:data/book/user_cate3_train.csv
..OK
read testing_user_course:data/course/test.csv
..OK
read testing_user_book:data/book/user_cate3_test.csv
..OK


## Clustering

In [None]:
# Fetch the book-domain training split with value normalization switched on
# (the exact normalization is defined by Dataset.getdata).
user_book_df = dataset.getdata("training_user_book", normalize_value=True)
# Show the first rows as the cell's rich output.
user_book_df.head()

In [21]:
# Cluster users and items of the book domain into 100 clusters each and
# write the artefacts under result/CBMF/clustering/bookdataset.
book_clustering_dir = os.path.join("result", "CBMF", "clustering", "bookdataset")
user_item_clustering(
    User_Item_df=user_book_df,
    num_clusters=100,
    savingpath=book_clustering_dir,
    d=d,
)

random loss: 102.3496293036183


 60%|██████    | 3/5 [00:04<00:02,  1.35s/it, currentbest=0.000, mse=0.000, improve=0.0000, early=3]  


user
(20575, 40)
clutsering ..
OK ..
visualization ..




item
(1000, 40)
clutsering ..
OK ..
visualization ..




In [None]:
# Course-domain splits to merge: training and testing interactions.
# NOTE(review): the meaning of the two boolean flags in each tuple is defined
# by combine_multi_domain and is not visible here — confirm before changing.
course_splits = [
    ("training_user_course", False, False),
    ("testing_user_course", False, False),
]

# `domains` is a list of domains; here there is a single (course) domain.
user_course_df = combine_multi_domain(Dataset=dataset, domains=[course_splits])
user_course_df.head()

In [16]:
# Cluster users and items of the course domain into 100 clusters each and
# write the artefacts under result/CBMF/clustering/coursedataset.
course_clustering_dir = os.path.join("result", "CBMF", "clustering", "coursedataset")
user_item_clustering(
    User_Item_df=user_course_df,
    num_clusters=100,
    savingpath=course_clustering_dir,
    d=d,
)

random loss: 102.23353621936171


 80%|████████  | 4/5 [00:40<00:10, 10.03s/it, currentbest=0.003, mse=0.003, improve=0.0000, early=3]  


user
(21829, 40)
clutsering ..
OK ..
visualization ..




item
(7054, 40)
clutsering ..
OK ..
visualization ..




## Cross-domain cluster-level matrix

In [62]:
# All clustering artefacts live under result/CBMF/clustering/<dataset>/<kind>/cluster.json
clustering_root = os.path.join("result", "CBMF", "clustering")

# Item clusters for each domain.
book_cluster = loadjson(
    os.path.join(clustering_root, "bookdataset", "itemclustering", "cluster.json")
)
course_cluster = loadjson(
    os.path.join(clustering_root, "coursedataset", "itemclustering", "cluster.json")
)

# User clusters for each domain.
user_course_cluster = loadjson(
    os.path.join(clustering_root, "coursedataset", "userclustering", "cluster.json")
)
user_book_cluster = loadjson(
    os.path.join(clustering_root, "bookdataset", "userclustering", "cluster.json")
)

# Append one extra cluster holding every uid from the book test split; its key
# is the current number of clusters (i.e. the next free index).
# NOTE(review): if loadjson returns a plain JSON dict, the existing keys are
# strings while this new key is an int — confirm downstream code does not
# depend on the key type.
test_book_uids = dataset.getdata("testing_user_book").uid.tolist()
extra_cluster_key = len(user_book_cluster.keys())
user_book_cluster[extra_cluster_key] = test_book_uids

In [63]:
%autoreload
from RS.utils.dataset import cluster_level_matrix

### Book 

In [64]:
# Cluster-level matrix for: course-domain user clusters x book item clusters,
# built from the book training interactions.
courseUser_book = cluster_level_matrix(
    R=dataset.getdata("training_user_book"),
    item_cluster=book_cluster,
    user_cluster=user_course_cluster,
)
# Sanity check: one row per user cluster, one column per item cluster.
print(courseUser_book.shape)

(100, 100)


100%|██████████| 100/100 [00:05<00:00, 19.38it/s]

4771/10000
(100, 100)





In [65]:
# Cluster-level matrix for: book-domain user clusters (including the extra
# test-user cluster appended earlier) x book item clusters, built from the
# book training interactions.
bookUser_book = cluster_level_matrix(
    R=dataset.getdata("training_user_book"),
    item_cluster=book_cluster,
    user_cluster=user_book_cluster,
)

(101, 100)


100%|██████████| 101/101 [00:05<00:00, 19.27it/s]

6482/10100





In [66]:
# Stack the course-user and book-user blocks row-wise, then L1-normalize each
# row (sklearn `normalize` with norm="l1", axis=1).
# NOTE(review): the course counterpart (`user_course_cluster_level`) is stacked
# without this normalization — confirm the asymmetry is intended.
user_book_cluster_level = normalize(
    np.concatenate((courseUser_book, bookUser_book), axis=0),
    norm="l1",
    axis=1,
)
print(user_book_cluster_level.shape)

(201, 100)


In [72]:
# Persist the book cluster-level matrix; np.save appends the ".npy" suffix,
# so the file ends up at result/CBMF/clustering/bookC.npy.
np.save(os.path.join("result", "CBMF", "clustering", "bookC"), user_book_cluster_level)

### Course

In [67]:
# Cluster-level matrix for: course-domain user clusters x course item
# clusters, built from the combined course interactions.
courseUser_course = cluster_level_matrix(
    R=user_course_df,
    item_cluster=course_cluster,
    user_cluster=user_course_cluster,
)

(100, 100)


100%|██████████| 100/100 [00:06<00:00, 14.68it/s]

5229/10000





In [68]:
# Cluster-level matrix for: book-domain user clusters (including the extra
# test-user cluster) x course item clusters, built from the combined course
# interactions.
bookUser_course = cluster_level_matrix(
    R=user_course_df,
    item_cluster=course_cluster,
    user_cluster=user_book_cluster,
)

(101, 100)


100%|██████████| 101/101 [00:07<00:00, 13.64it/s]

3335/10100





In [69]:
# Stack the course-user and book-user blocks row-wise for the course items.
# NOTE(review): unlike `user_book_cluster_level`, no L1 row normalization is
# applied here — confirm the asymmetry is intended.
user_course_cluster_level = np.concatenate(
    (courseUser_course, bookUser_course), axis=0
)
print(user_course_cluster_level.shape)

(201, 100)


In [None]:
# Persist the course cluster-level matrix; np.save appends the ".npy" suffix,
# so the file ends up at result/CBMF/clustering/courseC.npy.
np.save(os.path.join("result", "CBMF", "clustering", "courseC"), user_course_cluster_level)

### Combine

In [70]:
# Place the book and course cluster-level matrices side by side: rows are the
# shared user clusters, columns are book item clusters followed by course
# item clusters.
cb = np.concatenate((user_book_cluster_level, user_course_cluster_level), axis=1)
print(cb.shape)

(201, 200)


In [71]:
# Persist the combined cross-domain matrix; np.save appends the ".npy" suffix,
# so the file ends up at result/CBMF/clustering/cb.npy.
np.save(os.path.join("result", "CBMF", "clustering", "cb"), cb)