In [None]:
import os
import pandas as pd
import numpy as np
import pickle
import time
from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore")

from sklearn import preprocessing
from tensorflow.keras.preprocessing.sequence import pad_sequences
from scipy.sparse import csr_matrix 

In [1]:
def labeling_books(book_df, record_df, save_label_dict = False):
    """Give every book an integer label and attach that label to the records.

    Parameters
    ----------
    book_df : pandas.DataFrame
        Must contain the columns ``'unique_isbn'`` and ``'book_id'``.
        A ``'label'`` column is added to this frame in place.
    record_df : pandas.DataFrame
        Must contain a ``'book_id'`` column. It is not modified.
    save_label_dict : bool, default False
        When truthy, the ``book_id -> label`` mapping is pickled to
        ``'book_label_dict.pickle'`` in the current working directory.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``book_df`` (the same object, now carrying ``'label'``) and a copy of
        the rows of ``record_df`` whose ``book_id`` occurs in ``book_df``,
        with a ``'label'`` column added.
    """

    ## book labeling by isbn in book_df
    le = preprocessing.LabelEncoder()
    book_df['label'] = le.fit_transform(book_df['unique_isbn'])

    # book_id -> label lookup; if a book_id is repeated the last row wins.
    book_label_dict = book_df.set_index('book_id')['label'].to_dict()

    if save_label_dict:
        with open('book_label_dict.pickle', 'wb') as f:
            pickle.dump(book_label_dict, f, pickle.HIGHEST_PROTOCOL)

    ## filtering records by final book_data
    # .copy() makes the result an independent frame, so adding a column below
    # is not a chained assignment on a slice of record_df.
    known_book = record_df['book_id'].isin(book_df['book_id'])
    filtered_record_df = record_df[known_book].copy()

    ## add label column to filtered_record_data
    # Vectorised dictionary lookup; every remaining book_id is a key of
    # book_label_dict because of the filter above. This also works when the
    # filtered frame is empty, where a row-wise apply would not yield a column.
    filtered_record_df['label'] = filtered_record_df['book_id'].map(book_label_dict)

    return book_df, filtered_record_df

def make_input_table(student_ids, data, list_len, mode = 'multiclass'):
    """Build one (input book list, prediction target) row per student.

    For each id in ``student_ids`` the ``'label'`` values of that student's
    rows in ``data`` are taken in row order and split as follows:

    * at most ``list_len + 1`` books: every book but the last is the input,
      the last book is the target;
    * more books than that: the first ``list_len`` books are the input; the
      target is the last book (``mode == 'multiclass'``) or every book from
      position ``list_len`` onward (any other mode, i.e. multilabel).

    Note that in multiclass mode the books between position ``list_len`` and
    the last one are not used at all.

    Parameters
    ----------
    student_ids : iterable
        Student identifiers to process, in output order.
    data : pandas.DataFrame
        Must contain the columns ``'student_id'`` and ``'label'``.
    list_len : int
        Maximum number of books kept in the input list.
    mode : str, default 'multiclass'
        ``'multiclass'`` yields a scalar target; anything else yields a
        sequence of targets.

    Returns
    -------
    pandas.DataFrame
        Columns ``'student_id'`` (as ``str``), ``'booklist'`` and ``'pred'``.

    Raises
    ------
    IndexError
        If a student id has no rows in ``data`` (there is no last book).
    """
    is_multiclass = mode == 'multiclass'

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    records = []

    for sid in student_ids:
        books = data.loc[data['student_id'] == sid, 'label'].values

        if len(books) <= (list_len + 1):
            book_list = books[:-1]
            pred = books[-1] if is_multiclass else [books[-1]]
        else:
            book_list = books[:list_len]
            pred = books[-1] if is_multiclass else books[list_len:]

        records.append({'student_id' : str(sid), 'booklist' : book_list, 'pred' : pred})

    return pd.DataFrame(records, columns = ['student_id', 'booklist', 'pred'])

def get_x_n_y(data, student_len, book_len, padding_len = 10, save_x = False, save_y = False, mode = 'multiclass'):
    """Turn the input table into model inputs ``x`` and targets ``y``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'booklist'`` (a sequence per row) and
        ``'pred'`` (a scalar or a sequence of label indices per row).
    student_len : int
        Number of rows of the multilabel target matrix; the first
        ``student_len`` rows of ``data`` are encoded.
    book_len : int
        Number of columns of the multilabel target matrix. Every label index
        must be smaller than this.
    padding_len : int, default 10
        Passed to ``pad_sequences`` as ``maxlen``.
    save_x, save_y : bool, default False
        When truthy, write ``x`` / ``y`` to ``'input_x_<mode>.npy'`` /
        ``'input_y_<mode>.npy'`` in the current working directory.
    mode : str, default 'multiclass'
        ``'multiclass'`` returns the ``'pred'`` column unchanged; any other
        value builds a multi-hot matrix.

    Returns
    -------
    tuple
        ``(x, y_mat)``. ``x`` is the result of ``pad_sequences``. ``y_mat`` is
        the one-column frame ``data[['pred']]`` in multiclass mode, otherwise
        an ``int8`` array of shape ``(student_len, book_len)``.

    Raises
    ------
    IndexError
        In multilabel mode, if a label index falls outside ``book_len``.
    """

    ## get x with padding
    # NOTE(review): pad_sequences pads with 0 by default; if 0 is also a valid
    # book label upstream, padding and that book are indistinguishable - confirm.
    x = pad_sequences(data['booklist'], maxlen = padding_len)

    ## get different y according to its mode

    # 1. multiclass, Sparse Categorical Crossentropy
    if mode == 'multiclass':
        y_mat = data[['pred']]

    # 2. multilabel, Categorical Crossentropy
    else:
        y_mat = np.zeros((student_len, book_len), dtype = np.int8)
        targets = data['pred']

        for i in range(student_len):
            # atleast_1d lets a scalar target and a sequence of targets share
            # one code path, so no exception is needed as control flow and
            # genuine errors (e.g. out-of-range labels) surface directly.
            for j in np.atleast_1d(targets.iloc[i]):
                y_mat[i, int(j)] = 1

    if save_x:
        np.save('input_x_' + mode + '.npy', x)

    if save_y:
        np.save('input_y_' + mode + '.npy', y_mat)

    return x, y_mat
    

def input_pipeline(mode, labeled_df, book_len, pad_len):
    """Run the table-building and encoding steps for one target mode.

    Calls ``make_input_table`` and then ``get_x_n_y`` for every unique value
    of ``labeled_df['student_id']``. ``get_x_n_y`` is always invoked with
    ``save_x=True`` and ``save_y=True``, so the arrays are also written to
    ``.npy`` files as a side effect.

    Parameters
    ----------
    mode : str
        Either ``'multiclass'`` or ``'multilabel'``.
    labeled_df : pandas.DataFrame
        Must contain a ``'student_id'`` column plus whatever
        ``make_input_table`` reads from it.
    book_len : int
        Forwarded to ``get_x_n_y`` as the number of target columns.
    pad_len : int
        Used both as the input list length and as the padding length.

    Returns
    -------
    tuple
        ``(input_df, input_x, input_y)``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the two supported values.
    """
    # Fail early with a clear message; previously an unknown mode skipped both
    # branches and crashed at the return with an UnboundLocalError.
    if mode not in ('multiclass', 'multilabel'):
        raise ValueError("mode must be 'multiclass' or 'multilabel', got %r" % (mode,))

    student_ids = labeled_df['student_id'].unique()

    input_df = make_input_table(student_ids, labeled_df, pad_len, mode = mode)
    input_x, input_y = get_x_n_y(input_df, len(student_ids), book_len, pad_len, save_x = True, save_y = True, mode = mode)

    return input_df, input_x, input_y

In [None]:
if __name__ == '__main__':

    # Read the three CSV inputs; the record and student files use their first
    # column as the index.
    book_df = pd.read_csv('../final_df.csv')
    record_df = pd.read_csv('../data.csv', index_col=0)
    student_df = pd.read_csv('../student.csv', index_col=0)

    # Label the books and keep only the records that refer to a known book.
    new_book_df, new_record_df = labeling_books(book_df, record_df)

    # 1. multiclass
    input_df1, input_x1, input_y1 = input_pipeline(
        mode='multiclass',
        labeled_df=new_record_df,
        book_len=len(book_df),
        pad_len=10,
    )

    # 2. multilabel
    input_df2, input_x2, input_y2 = input_pipeline(
        mode='multilabel',
        labeled_df=new_record_df,
        book_len=len(book_df),
        pad_len=10,
    )