In [None]:
!pip install transformers -U

In [None]:
!pip install langdetect

In [None]:
!pip install googletrans==3.1.0a0

In [None]:
import os
import gc

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from googletrans import Translator
translator = Translator()
from langdetect import detect

from tqdm import tqdm
tqdm.pandas()

## Helper Functions

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize a batch of texts into a fixed-width array of token ids.

    Args:
        texts: Iterable of strings to encode (list, tuple, array of str, ...).
        tokenizer: Object exposing a Hugging Face style ``batch_encode_plus``.
        maxlen: Length every sequence is padded or truncated to.

    Returns:
        ``np.ndarray`` built from the tokenizer's ``input_ids``; one row per
        text, each row ``maxlen`` ids long.
    """
    # Materialize as a list so tuples / numpy arrays are accepted as well.
    texts = list(texts)
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_attention_mask=False,
        return_token_type_ids=False,
        # `pad_to_max_length=True` is deprecated; this is its replacement.
        padding="max_length",
        max_length=maxlen,
        truncation=True,
    )

    return np.array(enc_di['input_ids'])

In [None]:
def dict_encode(texts, tokenizer, maxlen=512):
    """Tokenize a batch of texts into the dict of arrays the model consumes.

    Args:
        texts: Iterable of strings to encode (list, tuple, array of str, ...).
        tokenizer: Object exposing a Hugging Face style ``batch_encode_plus``.
        maxlen: Length every sequence is padded or truncated to.

    Returns:
        Dict with two ``np.ndarray`` values, ``"input_ids"`` and
        ``"attention_mask"``; the keys match the names of the Keras inputs
        created in ``build_model``.
    """
    # Materialize as a list so tuples / numpy arrays are accepted as well.
    texts = list(texts)
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_attention_mask=True,
        return_token_type_ids=False,
        # `pad_to_max_length=True` is deprecated; this is its replacement.
        padding="max_length",
        max_length=maxlen,
        truncation=True,
    )

    return {
        "input_ids": np.array(enc_di['input_ids']),
        "attention_mask": np.array(enc_di['attention_mask'])
    }

In [None]:
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
    """Root-mean-squared error over all elements of the batch.

    Implemented with TensorFlow ops so the loss comes from the same framework
    as the ``tensorflow.keras`` model it is compiled into, rather than from
    the standalone ``keras.backend`` module.

    Args:
        y_true: Target values.
        y_pred: Model predictions (tensor).

    Returns:
        Scalar tensor: ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    # Targets may arrive in a different float width than the predictions.
    y_true = tf.cast(y_true, y_pred.dtype)
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

In [None]:
def build_model(transformer, max_len=512, learning_rate=1e-5):
    """Wrap a pretrained transformer in a single-output regression model.

    Args:
        transformer: Pretrained Keras transformer model; its first layer
            (``transformer.layers[0]``) is called on the token inputs.
        max_len: Fixed sequence length of the ``input_ids`` and
            ``attention_mask`` inputs.
        learning_rate: Adam learning rate (defaults to the previously
            hard-coded ``1e-5``).

    Returns:
        A compiled ``Model`` taking a dict with ``"input_ids"`` and
        ``"attention_mask"`` and producing one value per example, trained
        with ``root_mean_squared_error``.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # Element 0 of the transformer output is used as the per-token sequence
    # output (presumably the last hidden state -- confirm for the chosen model).
    sequence_output = transformer.layers[0](
        {"input_ids": input_word_ids, "attention_mask": attention_mask}
    )[0]
    # Regress from the representation of the first token only.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1)(cls_token)

    model = Model(inputs={
        "input_ids": input_word_ids,
        "attention_mask": attention_mask
    }, outputs=out)
    # `lr=` is a deprecated alias; newer Keras releases only accept `learning_rate=`.
    model.compile(Adam(learning_rate=learning_rate), loss=root_mean_squared_error)

    return model

## TPU Configs

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # No TPU could be resolved; fall through to the default strategy below.
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Prefer the stable TPUStrategy; fall back to the deprecated experimental
    # alias on TensorFlow versions that do not have it yet.
    tpu_strategy_cls = (
        getattr(tf.distribute, "TPUStrategy", None)
        or tf.distribute.experimental.TPUStrategy
    )
    strategy = tpu_strategy_cls(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Load the original training rows from the Kaggle input directory.
train = pd.read_csv("../input/predictingbookratingsint20h/train.csv")

In [None]:
# Load the pre-translated training rows (presumably the non-English
# descriptions translated offline -- TODO confirm how this file was produced).
train_translated = pd.read_csv('../input/translated/train_Translated.csv')

In [None]:
# Keep only the original rows whose `id` does NOT appear in the translated
# set; the translated versions of those rows are added back in the next cell.
train = train[~train['id'].isin(train_translated['id'])]

In [None]:
# Add the translated rows after the remaining original rows.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` produces the same
# frame (same row order, original index labels preserved).
train = pd.concat([train, train_translated])

In [None]:
# Random ~92% / ~8% train/validation split. A dedicated seeded generator is
# used so that Restart & Run All reproduces the same split (and therefore
# comparable validation scores) instead of depending on global RNG state.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(train)) < 0.92
valid = train[~msk]
train = train[msk]

# Test features and the submission template to be filled with predictions.
test = pd.read_csv("../input/predictingbookratingsint20h/test.csv")
sub = pd.read_csv("../input/predictingbookratingsint20h/submission.csv")

In [None]:
# Tokenization / batching configuration.
# Earlier experiment note: MAX_LEN = 452 - 02618 (3 epochs)
# (presumably a score of 0.2618 after 3 epochs -- TODO confirm).
MAX_LEN = 288  # length every description is padded / truncated to
MODEL = 'albert-xxlarge-v2'  # checkpoint name used for both tokenizer and model
AUTO = tf.data.experimental.AUTOTUNE
# Global batch size: 4 examples per replica of the distribution strategy.
BATCH_SIZE = 4 * strategy.num_replicas_in_sync
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Number of whole batches in each split (integer division drops the remainder).
# n_train_steps is passed to model.fit as steps_per_epoch; n_valid_steps is
# not used anywhere else in the visible notebook.
n_valid_steps = valid.shape[0] // BATCH_SIZE
n_train_steps = train.shape[0] // BATCH_SIZE

In [None]:
%%time 
# Tokenize the `book_desc` column of each split into input_ids /
# attention_mask arrays and take `book_rating` as the regression target.
x_train = dict_encode(list(train.book_desc.values), tokenizer, maxlen=MAX_LEN)
y_train = train.book_rating.values

# The training frame is not needed past this point; drop it and collect
# before encoding the next split to keep peak memory down.
del train
gc.collect()

x_valid = dict_encode(list(valid.book_desc.values), tokenizer, maxlen=MAX_LEN)
y_valid = valid.book_rating.values
# Same as above for the validation frame.
del valid
gc.collect()
# Only features are encoded for the test split; `test` itself is kept.
x_test = dict_encode(list(test.book_desc.values), tokenizer, maxlen=MAX_LEN)

In [None]:
!free -h

In [None]:
# Training stream: endless repetition of the encoded training set, shuffled
# through a 4096-element buffer, batched and prefetched. Because it never
# ends, model.fit must be given steps_per_epoch.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(4096)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

# Validation stream: one ordered pass, cached after batching.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test stream: features only, kept in file order so predictions line up
# with the submission rows.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

# The datasets are built; drop the notebook-level names of the source arrays.
del x_train, x_valid, y_train, y_valid
gc.collect()

## Load model into the TPU

In [None]:
%%time
# Instantiate the pretrained transformer and build/compile the regression
# model inside the strategy scope, so their variables are created under the
# distribution strategy selected above (the TPU strategy when one was found).
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
# Print the layer / parameter overview of the assembled model.
model.summary()

## Train Model

In [None]:
# Configuration
EPOCHS = 4

# train_dataset repeats forever, so steps_per_epoch defines the epoch length
# (n_train_steps = number of whole batches in the training split).
# valid_dataset is finite and is evaluated in full after every epoch.
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_train_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

## Submission

In [None]:
# Name the artifacts after the run: sequence length, number of epochs trained
# and the score entered by hand below.
ep = EPOCHS  # keep the file name in sync with the training configuration
score = 2800  # manually entered score digits; update after each run
run_name = f"ch_{MAX_LEN}len_translated_epoch_{ep}_0{score}"
filepath = f"{run_name}.h5"

# Save the model without optimizer state.
tf.keras.models.save_model(model, filepath, save_format='h5', include_optimizer=False)
# The Dense(1) head yields an (n, 1) array; flatten it so the column receives
# an explicit 1-D vector of predictions.
sub['book_rating'] = model.predict(test_dataset, verbose=1).ravel()
# The submission shares the stem of the weights file so the two stay paired.
sub.to_csv(f"{run_name}.csv", index=False)

In [None]:
from IPython.display import FileLink
# Download link for the weights file written by the submission cell. Reusing
# `filepath` keeps the link valid when MAX_LEN / ep / score change, instead of
# pointing at a hard-coded name.
FileLink(filepath)