In [2]:
import pandas as pd
from tqdm import tqdm
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from RS.utils.dictutils import *
from RS.utils.dataset import transpose_df


# Root folder of the CSV data; paths in the cells below are built relative to it.
dataroot = "data"


## Split training and testing data

Testing school year: `499*` — users whose ID starts with `499` form the test set; all other users form the training set.

### User Course

In [None]:
# Load the dense user-course table. `uid` is cast to str so that the
# prefix-based train/test split can use the `.str` accessor.
user_course_path = os.path.join(dataroot, "course", "usercoursedense.csv")
user_course_dense = pd.read_csv(user_course_path, encoding="utf-8").assign(
    uid=lambda frame: frame["uid"].astype(str)
)
user_course_dense.head()

In [None]:
# (rows, columns) of the full user-course table, before splitting.
user_course_dense.shape

In [None]:
# Test split: rows whose uid begins with "499".
is_test_user = user_course_dense["uid"].str.startswith("499")
user_course_test = user_course_dense.loc[is_test_user]
user_course_test.head()

In [None]:
# (rows, columns) of the test split (uids starting with "499").
user_course_test.shape

In [None]:
# Training split: every row whose uid does NOT begin with "499".
is_train_user = ~user_course_dense["uid"].str.startswith("499")
user_course_train = user_course_dense.loc[is_train_user]
user_course_train.head()

In [None]:
# (rows, columns) of the training split (uids not starting with "499").
user_course_train.shape

In [None]:
# Persist both user-course splits next to the source table, without the index.
course_dir = os.path.join(dataroot, "course")
for split_frame, csv_name in (
    (user_course_train, "train.csv"),
    (user_course_test, "test.csv"),
):
    split_frame.to_csv(os.path.join(course_dir, csv_name), index=False)

### Book User

In [None]:
# Load the book-user table; the split cells below treat its columns as user ids.
book_user_path = os.path.join(dataroot, "book", "cate3_userdense.csv")
book_user = pd.read_csv(book_user_path)
book_user.head()

In [None]:
# Columns (user ids) whose first three characters are "499" form the test set.
test_students = [col for col in book_user.columns if col[:3] == "499"]
print(len(test_students))

In [None]:
# Keep only the test-user columns.
book_user_test = book_user.loc[:, test_students]
book_user_test.head()

In [None]:
# Every column whose first three characters are not "499" goes to training.
# NOTE(review): any non-user column in the file would also land here — confirm
# that cate3_userdense.csv contains user columns only.
train_students = [col for col in book_user.columns if col[:3] != "499"]
book_user_train = book_user.loc[:, train_students]
book_user_train.head()

In [None]:
# Persist both book-user splits in the book folder, without the index.
book_dir = os.path.join(dataroot, "book")
for split_frame, csv_name in (
    (book_user_train, "cate3_train.csv"),
    (book_user_test, "cate3_test.csv"),
):
    split_frame.to_csv(os.path.join(book_dir, csv_name), index=False)

In [None]:
# Column sums: one total per user column in each split.
# (presumably a per-user activity count — confirm against the data source)
train_des = book_user_train.values.sum(axis=0)
test_des = book_user_test.values.sum(axis=0)

# Two side-by-side panels (train | test); x is the user's column position and
# the legend reports the mean of the per-user totals.
fig, axes = plt.subplots(1, 2, dpi=800)
for ax, split_name, per_user_total in zip(
    axes, ("train", "test"), (train_des, test_des)
):
    ax.plot(
        np.arange(len(per_user_total)),
        per_user_total,
        label=f"avg:{np.mean(per_user_total):.0f}",
    )
    ax.legend()
    ax.set_title(split_name)

fig.tight_layout()
fig.savefig(
    os.path.join(dataroot, "book", "train_test_count.jpg")
)

#### User book (transpose of the book-user training table above)

In [None]:
# Reload the training split written earlier so this section can run on its own.
# The path is built from `dataroot`, like every other cell, so that changing
# the data root in one place keeps reads and writes pointing at the same folder.
book_user_train = pd.read_csv(
    os.path.join(dataroot, "book", "cate3_train.csv")
)
book_user_train.head()

In [3]:
# Transpose the book-user training table with the project helper.
# NOTE(review): presumably this turns user columns into rows keyed by a `uid`
# column — confirm against RS.utils.dataset.transpose_df.
user_book_train = transpose_df(df=book_user_train, col_to="uid")

user_book_train_path = os.path.join(dataroot, "book", "user_cate3_train.csv")
user_book_train.to_csv(user_book_train_path, index=False)

#### Groundtruth of testing users

In [3]:
# Reload the test split written earlier (one column per test user).
book_user_test_path = os.path.join(dataroot, "book", "cate3_test.csv")
book_user_test = pd.read_csv(book_user_test_path)

In [4]:
# Build the ground truth for every test user: the positional row indices
# (as strings) at which that user's column holds a value greater than zero.
# NOTE(review): row positions are used as item ids — confirm that the row order
# of cate3_test.csv matches the id scheme used by the evaluation code.
groundtruth = {}
for ui in tqdm(book_user_test.columns.tolist()):
    gthi = book_user_test[ui].values
    groundtruth[ui] = [str(pos) for pos in np.where(gthi > 0)[0].tolist()]

# Make sure the output folder exists before writing; no-op when it already does.
result_dir = "result"
os.makedirs(result_dir, exist_ok=True)

writejson(
    dictionary=groundtruth,
    jsfilepath=os.path.join(result_dir, "testing_user_groundtruth.json"),
)

100%|██████████| 1254/1254 [00:00<00:00, 6453.64it/s]
