In [None]:
from sklearn import preprocessing

import os
import pandas as pd
import numpy as np
import pickle
import time
from tqdm import tqdm
import tqdm.notebook as tq
# Register tqdm with pandas; this is what makes `progress_apply` available
# on DataFrames later in the notebook.
tqdm.pandas()

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the ML libraries.
warnings.filterwarnings("ignore")

In [None]:
# Load the book table used by every feature cell below (columns read later:
# title, genre, page, pub_year, book_id).
# NOTE(review): the absolute path looks like a mounted Google Drive in Colab —
# confirm before running elsewhere.
book = pd.read_csv('/content/drive/MyDrive/gradpaper/data/book_final.csv')

#### 1. Using BERT for Word Embedding

In [None]:
!pip install sentence_transformers
bert_model = SentenceTransformer('xlm-r-bert-base-nli-stsb-mean-tokens')

In [None]:
def word_embedding_using_BERT(model, data_column):
    """Encode ``data_column`` with ``model`` and return the embeddings.

    Parameters
    ----------
    model
        Any object exposing ``encode``; the result must have a ``.shape``.
    data_column
        The texts to embed; handed to ``model.encode`` unchanged.

    Returns
    -------
    Whatever ``model.encode`` returned. Its shape is printed beforehand as a
    quick sanity check.
    """
    encoded = model.encode(data_column)
    encoded_shape = encoded.shape
    print(">>> CHECK SHAPE: ", encoded_shape)

    return encoded

In [None]:
# Embed the `title` and `genre` columns of the book table with the shared model.
title_embedd = word_embedding_using_BERT(model=bert_model, data_column=book['title'])
genre_embedd = word_embedding_using_BERT(model=bert_model, data_column=book['genre'])

In [None]:
## page, year feature: derived values built from the `page` and `pub_year` columns

import math 

# Select the two columns that `sqrt_pow` is applied to row by row below.
page_year = book[['page', 'pub_year']]

def sqrt_pow(row):
    """Add square-root and squared features for ``page`` and ``pub_year``.

    Parameters
    ----------
    row : mapping-like (e.g. one ``pandas.Series`` row of ``page_year``)
        Must provide ``'page'`` and ``'pub_year'``. Both values must be
        convertible to ``float``.

    Returns
    -------
    The same ``row`` object, updated in place with four new entries:
    ``page_sqrt``, ``page_pow``, ``year_sqrt`` and ``year_pow``.

    Raises
    ------
    KeyError
        If ``'page'`` or ``'pub_year'`` is missing.
    ValueError
        If a value is negative (``math.sqrt``) or not convertible to float.
    """
    page = float(row['page'])
    # The column is called 'pub_year'; the earlier lookup of 'year' raised
    # KeyError because no such column is selected into `page_year`.
    year = float(row['pub_year'])

    row['page_sqrt'] = math.sqrt(page)
    row['page_pow'] = math.pow(page, 2)

    row['year_sqrt'] = math.sqrt(year)
    # Square the year itself; this used to square the page count by mistake.
    row['year_pow'] = math.pow(year, 2)

    return row

# Apply sqrt_pow to every row (axis=1), with a tqdm progress bar.
page_year = page_year.progress_apply(sqrt_pow, axis=1)

# np.save('page_year_embed.npy', page_year)

In [None]:
# major embed: one embedding per distinct value of the student `college` column

student = pd.read_csv('../student_final.csv')

# Distinct college values, sorted ascending so the embedding order is stable.
major_list = list(student['college'].unique())
major_list.sort()

major_embedd = word_embedding_using_BERT(model=bert_model, data_column=major_list)

In [None]:
## image embed
from keras import models
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array

def preprocessed(book_ids, img_path):
    """Load and preprocess the cover image for every id in ``book_ids``.

    For each id, the file ``img_path + str(id) + '.jpg'`` is loaded at
    224x224, converted to an array, given a leading batch axis of size 1,
    and passed through ``preprocess_input``.

    ``preprocess_input`` is looked up when the function is called; in this
    notebook it is imported in a later cell together with EfficientNetB0.

    Parameters
    ----------
    book_ids : iterable of ids (list, array or pandas Series)
        Only the values are used, so a Series with any index works.
    img_path : str
        Prefix prepended as-is to each file name; it must already end with a
        path separator.

    Returns
    -------
    (new_images, no_image_ids)
        ``new_images`` is the list of preprocessed arrays for images that
        loaded, in input order. ``no_image_ids`` is the list of file paths
        that could not be loaded.

    Notes
    -----
    Failed images are skipped, so ``new_images`` can be shorter than
    ``book_ids`` and no longer lines up with it position by position.
    """
    new_images = []
    no_image_ids = []

    # Iterate over the values directly: `book_ids[i]` is a label lookup on a
    # pandas Series and breaks once the index is not 0..n-1.
    for book_id in tq.tqdm(book_ids):
        # str() keeps string ids unchanged and lets numeric ids work too.
        book_file = img_path + str(book_id) + '.jpg'

        try:
            image = load_img(book_file, target_size = (224, 224))
            image = img_to_array(image)
            image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
            image = preprocess_input(image)
        except Exception:
            # Best effort: record and report the unreadable file, then go on.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            no_image_ids.append(book_file)
            print(book_file, end = ' ')
        else:
            new_images.append(image)

    return new_images, no_image_ids

def get_features(extractor, image_list):
    """Run ``extractor`` on every image and stack the results along axis 0.

    Parameters
    ----------
    extractor
        Object exposing ``predict(batch, verbose=0)`` that returns an array.
    image_list : sequence of arrays
        One preprocessed batch per image, as produced by ``preprocessed``.

    Returns
    -------
    numpy.ndarray
        The per-image predictions concatenated along the first axis, in the
        order of ``image_list``.

    Raises
    ------
    IndexError
        If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise IndexError("image_list is empty: there are no images to extract features from")

    # Collect the predictions and concatenate once. Calling np.append inside
    # the loop recopies the whole accumulated array on every iteration.
    per_image_features = [
        extractor.predict(image, verbose=0)
        for image in tq.tqdm(image_list)
    ]

    return np.concatenate(per_image_features, axis=0)


def feature_extract_pipeline(extractor, img_path, book_ids):
    """Preprocess the images for ``book_ids`` and extract their features.

    Calls ``preprocessed(book_ids, img_path)`` and passes the loaded images
    to ``get_features(extractor, ...)``. The shape of the result is printed
    before it is returned.

    NOTE(review): the list of files that failed to load is discarded here,
    so the returned rows may not line up with ``book_ids`` — confirm that
    callers do not rely on positional alignment.
    """
    loaded_images, _unreadable = preprocessed(book_ids, img_path)

    features = get_features(extractor, loaded_images)
    print("CHECK SHAPE: ", features.shape)

    return features

In [None]:
from tensorflow.keras.applications.efficientnet import EfficientNetB0, preprocess_input

# EfficientNetB0 with ImageNet weights, cut at its 'avg_pool' layer so that
# predict() returns that layer's output instead of the classifier output.
# NOTE(review): `models` comes from standalone `keras` while the network comes
# from `tensorflow.keras` — confirm both resolve to the same Keras build here.
base_model = EfficientNetB0(weights='imagenet')
model_eff = models.Model(
    inputs=base_model.input,
    outputs=base_model.get_layer('avg_pool').output,
)

# Inputs for the pipeline: the image directory prefix and the ids to look up.
image_path = '/content/drive/MyDrive/gradpaper/data/image/'
book_ids = book['book_id']

image_features = feature_extract_pipeline(
    extractor=model_eff,
    img_path=image_path,
    book_ids=book_ids,
)