In [1]:
import time 
import wandb
import math
import os
import gc
import pandas as pd 
import numpy as np 
from datetime import datetime
import seaborn as sns 
import matplotlib.pyplot as plt 
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import mixed_precision
import tensorflow.keras.backend as K
import transformers 
from transformers import AutoTokenizer,TFAutoModel

# Set the process-wide Keras dtype policy to mixed_float16. This has to run
# before any layer is constructed: the MeanPooling layer below casts to float16
# to match, and the final classifier opts back into float32 explicitly.
mixed_precision.set_global_policy('mixed_float16')  


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Run configuration shared by the cells below.
# Of these keys, only "batch_size" is read by the inference cells visible in
# this notebook (a "tokenizer" entry is added later, once it has been loaded).
# The remaining keys look like training-time settings carried over from the
# training notebook -- TODO confirm before removing any of them.
config = {
    "fold_strategy": "stratified",
    "n_folds": 5,
    "seed": 100,
    "model": "roberta-base",
    # This key used to be listed twice (None, then './model_weights'); a dict
    # literal keeps only the last value, so the effective value is stated once.
    "model_path": './model_weights',
    # NOTE(review): the key is misspelled ("max_lenghth"). It is left as-is
    # because code outside this notebook may look it up under this name.
    "max_lenghth": 512,
    "earlys_patience": 5,
    "reduce_lr_plateau": 10,
    "epochs": 12,
    "batch_size": 32,
    "training": False,
}

# Directory for generated artifacts; exist_ok avoids the check-then-create race.
OUTPUT_DIR = './'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Load the two test-set tables from the Kaggle competition input directory:
# one row per prompt, and one row per student summary.
df_test_prompts = pd.read_csv(
    "/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv"
)
df_test_summaries = pd.read_csv(
    "/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv"
)
#sample_sub=pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv")

In [4]:
# Attach the prompt columns to every summary by joining on prompt_id
# (default inner join), then preview the first rows.
df_test = df_test_summaries.merge(df_test_prompts, on="prompt_id")
df_test.head()

Unnamed: 0,student_id,prompt_id,text,prompt_question,prompt_title,prompt_text
0,000000ffffff,abc123,Example text 1,Summarize...,Example Title 1,Heading\nText...
1,222222cccccc,abc123,Example text 3,Summarize...,Example Title 1,Heading\nText...
2,111111eeeeee,def789,Example text 2,Summarize...,Example Title 2,Heading\nText...
3,333333dddddd,def789,Example text 4,Summarize...,Example Title 2,Heading\nText...


In [5]:
# Load the tokenizer saved with the fine-tuned model and stash it in the shared
# config so the batch generator can reach it.
tokenizer=AutoTokenizer.from_pretrained('/kaggle/input/commonlit-evaluate-summary/tokenizer/')
config['tokenizer']=tokenizer
# NOTE(review): SEP is assigned here but never used in this notebook.
SEP=tokenizer.sep_token
# Model input text: the prompt question immediately followed by the student's
# summary, with no separator in between.
# NOTE(review): presumably this mirrors the training-time preprocessing; confirm
# against the training notebook before inserting SEP, or predictions will shift.
df_test['full_text']    =df_test['prompt_question'] + df_test['text']


In [6]:
class Dataset_generator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves tokenized text batches for inference.

    Item ``idx`` is a pair ``(x_encoded, batch_id)``: the tokenizer output for
    the texts in that batch and the matching slice of identifiers.

    Args:
        x_id: Sliceable collection of identifiers, aligned with ``x_set``.
        x_set: Array of input texts. It must support slicing and ``.tolist()``
            (the notebook passes a NumPy array).
        config: Settings dict; only ``config['tokenizer']`` is read here.
        batch_size: Maximum number of texts per batch.
    """

    def __init__(self, x_id, x_set, config, batch_size):
        # Let the base class initialise itself; newer Keras versions keep
        # internal state on the instance and warn when this call is missing.
        super().__init__()
        self.id = x_id
        self.x = x_set
        self.batch_size = batch_size
        self.config = config

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """Tokenize and return batch ``idx`` as ``(x_encoded, batch_id)``."""
        low = idx * self.batch_size
        # Cap the upper bound at the array length; the last batch may be
        # smaller when the item count is not a multiple of the batch size.
        high = min(low + self.batch_size, len(self.x))
        batch_x = self.x[low:high]
        batch_id = self.id[low:high]

        # Use the tokenizer handed to this instance rather than the notebook's
        # global `config`, so the generator does not depend on hidden state.
        tokenizer = self.config['tokenizer']
        # Every example is padded/truncated to the tokenizer's maximum length,
        # so all batches share the same sequence dimension.
        x_encoded = tokenizer.batch_encode_plus(
            batch_x.tolist(),
            return_tensors='tf',
            max_length=tokenizer.model_max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
        )

        return x_encoded, batch_id

      
# Batch generator over the test set: student ids paired with the concatenated
# prompt-question + summary text.
test_ds = Dataset_generator(
    x_id=df_test['student_id'],
    x_set=df_test['full_text'].values,
    config=config,
    batch_size=config['batch_size'],
)

In [7]:
class MeanPooling(tf.keras.layers.Layer):
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    ``call`` takes ``last_hidden_state`` and ``attention_mask`` and reduces over
    axis 1. The code implies shapes ``(batch, seq, hidden)`` and
    ``(batch, seq)`` respectively, giving a ``(batch, hidden)`` result.
    """

    def __init__(self, **kwargs):
        # Forward the standard layer arguments (name, dtype, ...) to Keras.
        super().__init__(**kwargs)

    def call(self, last_hidden_state, attention_mask):
        # Follow the dtype of the incoming activations instead of hard-coding
        # float16, so the layer works under any dtype policy.
        compute_dtype = last_hidden_state.dtype

        # (batch, seq) -> (batch, seq, 1). Broadcasting in the multiply and the
        # divide below replaces the explicit broadcast_to, which needed a fully
        # known static shape.
        mask = tf.cast(tf.expand_dims(attention_mask, -1), compute_dtype)

        sum_embeddings = tf.reduce_sum(last_hidden_state * mask, axis=1)
        token_counts = tf.reduce_sum(mask, axis=1)

        # Guard against division by zero for a fully masked row. The previous
        # bounds (1e-9 and float32.max) are not representable in float16 -- they
        # become 0 and inf -- so that clip had no effect under mixed precision.
        # Assumes a binary 0/1 mask as produced by the tokenizer: any row with
        # at least one real token has a count >= 1 and is left unchanged.
        token_counts = tf.maximum(token_counts, tf.constant(1, dtype=compute_dtype))

        return sum_embeddings / token_counts

    
class CustomModel(tf.keras.Model):
    """Transformer encoder with a small dense head producing two outputs.

    Pipeline: encoder -> masked mean pooling -> batch norm -> Dense(96, relu)
    -> batch norm -> dropout -> Dense(12, relu) -> dropout -> Dense(2).
    The final layer is pinned to float32, so predictions come out in float32
    even under the mixed_float16 policy.

    Attribute names and layer creation order are left untouched on purpose, so
    the saved checkpoint keeps matching this object structure.

    Args:
        config: Settings dict. Currently unused; the encoder path is hard-coded.
    """

    def __init__(self, config):
        super().__init__()

        self.embeddings = TFAutoModel.from_pretrained('/kaggle/input/commonlit-evaluate-summary/embedding_model')
        self.global_pool = MeanPooling()
        self.dense1 = tf.keras.layers.Dense(96, activation='relu')
        self.dense2 = tf.keras.layers.Dense(12, activation='relu')
        self.batch_norm1 = tf.keras.layers.BatchNormalization()
        self.batch_norm2 = tf.keras.layers.BatchNormalization()
        self.dropout1 = tf.keras.layers.Dropout(0.5)
        self.dropout2 = tf.keras.layers.Dropout(0.5)
        self.classifier = tf.keras.layers.Dense(2, dtype='float32')

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Mapping of encoder keyword arguments; must contain
                ``'attention_mask'``.
            training: Whether to run in training mode. It is passed explicitly
                to the encoder, batch-norm and dropout layers rather than left
                to the implicit Keras call context. ``None`` defers to Keras.

        Returns:
            Float32 tensor with two values per example.
        """
        encoder_outputs = self.embeddings(**inputs, training=training)
        # Element 0 of the encoder output is the per-token hidden state.
        x = self.global_pool(encoder_outputs[0], inputs['attention_mask'])
        x = self.batch_norm1(x, training=training)
        x = self.dense1(x)
        x = self.batch_norm2(x, training=training)
        x = self.dropout1(x, training=training)
        x = self.dense2(x)
        x = self.dropout2(x, training=training)
        return self.classifier(x)
    
# Instantiate the network; this also loads the pretrained encoder from disk.
model = CustomModel(config=config)


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at /kaggle/input/commonlit-evaluate-summary/embedding_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [8]:
# Push one batch through the model so every layer creates its variables
# before the checkpoint is restored. Skipped when the dataset is empty.
if len(test_ds) > 0:
    first_inputs, _ = test_ds[0]
    model(first_inputs, training=False)

model.load_weights('/kaggle/input/commonlit-evaluate-summary/model_weights/model_weights')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7bdd84ffb7c0>

In [9]:
# Score every test batch and gather the results into the submission frame.
id_list = []
content_list = []
wording_list = []

# Index explicitly instead of iterating the dataset object: its __getitem__
# never raises IndexError, so the batch count is the only reliable stop signal.
for batch_idx in range(len(test_ds)):
    x_val, x_id = test_ds[batch_idx]
    # Convert to NumPy once per batch rather than once per output column.
    scores = model(x_val, training=False).numpy()

    # Output column 0 is the content score, column 1 the wording score.
    content_list.extend(scores[:, 0])
    wording_list.extend(scores[:, 1])
    id_list.extend(x_id)

# Build the frame in one step instead of filling an empty frame column by column.
df = pd.DataFrame({
    'student_id': id_list,
    'content': content_list,
    'wording': wording_list,
})

In [10]:
# Write the predictions to the working directory without the index column.
df.to_csv(path_or_buf='submission.csv', index=False)