In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf 

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import TimeDistributed,Flatten,GlobalAveragePooling1D
from tensorflow.keras.layers import Input,Dense
from tensorflow.keras.utils import to_categorical

import matplotlib.pyplot as plt

import transformers
from transformers import TFAutoModel,AutoTokenizer

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() 
    
    
    
print('Number of replicas:', strategy.num_replicas_in_sync)

Number of replicas: 8


In [2]:
df=pd.read_csv('../input/contradictory-my-dear-watson/train.csv')
df_test=pd.read_csv('../input/contradictory-my-dear-watson/test.csv')
df.head()


Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [3]:
print(f'languages present in dataset are : {list(df.language.unique())}')
print(f'Total no. of unique languages are : {df.language.nunique()}')

languages present in dataset are : ['English', 'French', 'Thai', 'Turkish', 'Urdu', 'Russian', 'Bulgarian', 'German', 'Arabic', 'Chinese', 'Hindi', 'Swahili', 'Vietnamese', 'Spanish', 'Greek']
Total no. of unique languages are : 15


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12120 entries, 0 to 12119
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          12120 non-null  object
 1   premise     12120 non-null  object
 2   hypothesis  12120 non-null  object
 3   lang_abv    12120 non-null  object
 4   language    12120 non-null  object
 5   label       12120 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 568.2+ KB


#### Implementing transformer models and finding the best result
1. xlm-roberta-large

In [5]:
tokenizer=AutoTokenizer.from_pretrained("joeddav/xlm-roberta-large-xnli")

Downloading:   0%|          | 0.00/734 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

In [6]:
print(tokenizer(df[['premise','hypothesis']].values.tolist()[:3],padding=True))

{'input_ids': [[0, 136, 6097, 24626, 3542, 90698, 23, 26168, 1916, 70, 1940, 464, 91736, 5, 2, 2, 581, 91736, 126809, 23, 70, 1940, 464, 3542, 3884, 25842, 678, 6097, 24626, 23, 7086, 5, 2, 1, 1, 1, 1, 1], [0, 32255, 621, 37348, 450, 642, 148, 56644, 133, 678, 23, 41361, 94407, 111, 27165, 11037, 7, 4, 2412, 2804, 5, 6, 2, 2, 109613, 13, 94407, 621, 959, 28897, 3674, 47, 4488, 98, 6097, 37348, 5, 2], [0, 5581, 69332, 37899, 3739, 91362, 9, 16161, 12401, 773, 123683, 211033, 807, 405, 41, 1647, 25, 6011, 1033, 8, 2973, 5, 2, 2, 821, 25, 6011, 395, 164, 104, 25, 2263, 58875, 23062, 38944, 19667, 5, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [7]:
tokens=tokenizer(df[['premise','hypothesis']].values.tolist()[:3])['input_ids']
for sen in tokens:
    print(tokenizer.decode(sen))

<s> and these comments were considered in formulating the interim rules.</s></s> The rules developed in the interim were put together with these comments in mind.</s>
<s> These are issues that we wrestle with in practice groups of law firms, she said. </s></s> Practice groups are not permitted to work on these issues.</s>
<s> Des petites choses comme celles-là font une différence énorme dans ce que j'essaye de faire.</s></s> J'essayais d'accomplir quelque chose.</s>


 Seems Similar to NSP (Next sentence prediction) approach for training

In [8]:
y_label_data=df['label'].values
df.label.nunique()

3

In [9]:
tokenizer.batch_encode_plus(df[['premise','hypothesis']][:2].values.tolist(),padding='max_length',max_length=100,truncation=True,return_attention_mask=True)

{'input_ids': [[0, 136, 6097, 24626, 3542, 90698, 23, 26168, 1916, 70, 1940, 464, 91736, 5, 2, 2, 581, 91736, 126809, 23, 70, 1940, 464, 3542, 3884, 25842, 678, 6097, 24626, 23, 7086, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 32255, 621, 37348, 450, 642, 148, 56644, 133, 678, 23, 41361, 94407, 111, 27165, 11037, 7, 4, 2412, 2804, 5, 6, 2, 2, 109613, 13, 94407, 621, 959, 28897, 3674, 47, 4488, 98, 6097, 37348, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [10]:
# lets move on to preprocessing data before feeding into ML model pipeline
# data engine

class XLM_roberta_engine:
  def __init__(self,max_len,tokenizer):
#     self.max_len=max_len
    self.tokenizer=tokenizer

  def tokenization_and_attention_masking(self,data):

    inputs =self.tokenizer.batch_encode_plus(data[['premise','hypothesis']].values.tolist(),padding='max_length',max_length=150,truncation=True,return_attention_mask=True)
    input_tokens=inputs['input_ids']
    attention_masks=inputs['attention_mask']
    return input_tokens,attention_masks

        

In [11]:
data_engine = XLM_roberta_engine(_ ,tokenizer)
input_tokens,attention_masks=XLM_roberta_engine.tokenization_and_attention_masking(data_engine,df)
test_input_tokens,test_attention_masks=XLM_roberta_engine.tokenization_and_attention_masking(data_engine,df_test)


In [12]:
# # model development
# class Custom_Model_1(tf.keras.Model):
#   def __init__(self,units,training):
#     super(Custom_Model_1,self).__init__()
#     self.units=units
#     self.training=training
# #     self.dense_1=tf.keras.layers.Dense(self.units,activation='relu',trainable=self.training,name='dense_layer_1')
#     self.bert1=bert_layer1
#   def call(self,input1):
#     embedding1=self.bert1(input1)[0]
# #     cls1=embedding1[:,0,:]
    
#     return embedding1


In [13]:
# class Custom_Model_2(tf.keras.Model):
#   def __init__(self,units,training):
#     super(Custom_Model_2,self).__init__()
#     self.units=units
#     self.training=training
# #     self.dense_2=tf.keras.layers.Dense(self.units,activation='relu',trainable=self.training,name='dense_layer_11')
#     self.bert2=bert_layer2

#   def call(self,input2):
#     embedding2=self.bert2(input2)[0]
# #     cls2=embedding2[:,0,:]
    
#     return embedding2

In [14]:
# class Siamese_Model(tf.keras.Model):
#     def __init__(self,units,training):
#       super(Siamese_Model,self).__init__()
#       self.units=units
#       self.training=training
#       self.out1=Custom_Model_1(self.units,self.training)
#       self.out2=Custom_Model_2(self.units,self.training)
#       self.dropout=tf.keras.layers.Dropout(0.2)
#       self.dense_s=tf.keras.layers.Dense(256,activation='relu',trainable=self.training,name='dense_layer_s')
#       self.dense=tf.keras.layers.Dense(3,activation='softmax',trainable=self.training,name='output_layer')

#     def call(self,inputs):
#       inp1=self.out1(inputs[0])
#       inp2=self.out2(inputs[1])
#       concat=tf.keras.layers.concatenate([inp1,inp2])
#       x=self.dense_s(concat)
#       x=self.dropout(x)
#       x=self.dense(x)
      
#       return x



In [15]:
with strategy.scope():
#xlm-roberta model

    xlm_roberta_layer=TFAutoModel.from_pretrained('joeddav/xlm-roberta-large-xnli')

        
#     model=Siamese_Model(256,True)
#     initializer=model([hy_inputs_ids[:3],prem_inputs_ids[:3]])
# f_log=model(hy_inputs_ids[:10],hy_attention_masks[:10],prem_inputs_ids[:10],prem_attention_masks[:10])  #simple un-trained logits

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some layers from the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing TFXLMRobertaModel: ['classifier']
- This IS expected if you are initializing TFXLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLMRobertaModel were initialized from the model checkpoint at joeddav/xlm-roberta-large-xnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [16]:
# AUTO = tf.data.experimental.AUTOTUNE
# batch_size=16*strategy.num_replicas_in_sync
# train_size=int(0.8*len(hy_inputs_ids))
# hy_inputs_ids, hy_attention_masks, prem_inputs_ids, prem_attention_masks, label_data = tf.convert_to_tensor(hy_inputs_ids), tf.convert_to_tensor(hy_attention_masks), tf.convert_to_tensor(prem_inputs_ids), tf.convert_to_tensor(prem_attention_masks), tf.convert_to_tensor(label_data)
# hy_data=tf.data.Dataset.from_tensor_slices(((hy_inputs_ids[:train_size],hy_attention_masks[:train_size]),label_data[:train_size]))
# prem_data=tf.data.Dataset.from_tensor_slices(((prem_inputs_ids[:train_size],prem_attention_masks[:train_size]),label_data[:train_size]))

# validation_hy_data=tf.data.Dataset.from_tensor_slices(((hy_inputs_ids[train_size:],hy_attention_masks[train_size:]),label_data[train_size:]))
# validation_prem_data=tf.data.Dataset.from_tensor_slices(((prem_inputs_ids[train_size:],prem_attention_masks[train_size:]),label_data[train_size:]))

# hy_data = hy_data.shuffle(buffer_size=1024).batch(batch_size,drop_remainder=True)
# prem_data = prem_data.shuffle(buffer_size=1024).batch(batch_size,drop_remainder=True)

# validation_hy_data = validation_hy_data.shuffle(buffer_size=1024).batch(batch_size,drop_remainder=True)
# validation_prem_data = validation_prem_data.shuffle(buffer_size=1024).batch(batch_size,drop_remainder=True)

In [17]:
T=np.array(input_tokens).shape[-1]

In [18]:
def model():

    input_ids=Input(shape=(T,),dtype = tf.int32)
    atten_masks=Input(shape=(T,),dtype = tf.int32)
    embedding=xlm_roberta_layer([input_ids,atten_masks])[0]
    com = GlobalAveragePooling1D()(embedding)
    out=Dense(3,activation='softmax')(com)     
    
    loss_fn=tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5)
    
    model=Model(inputs=[input_ids,atten_masks],outputs=out)
    model.compile(optimizer=optimizer,loss=loss_fn, metrics=['accuracy'])
    
    return model

In [19]:
with strategy.scope():
    model=model()

In [20]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
tfxlm_roberta_model (TFXLMRober TFBaseModelOutputWit 559890432   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 1024)         0           tfxlm_roberta_model[0][0]    

In [21]:
y_train_input=y_label_data
y_train_input=to_categorical(y_train_input,num_classes=3)
y_train_input.shape

(12120, 3)

In [22]:
input_tokens,attention_masks=tf.convert_to_tensor(input_tokens,dtype=tf.int32),tf.convert_to_tensor(attention_masks,dtype=tf.int32)
test_input_tokens,test_attention_masks=tf.convert_to_tensor(test_input_tokens,dtype=tf.int32),tf.convert_to_tensor(test_attention_masks,dtype=tf.int32)
y_train_input=tf.convert_to_tensor(y_train_input)

In [23]:
early_stop = tf.keras.callbacks.EarlyStopping(patience=2,restore_best_weights=True)

r=model.fit([input_tokens,attention_masks],y_train_input,batch_size=16*strategy.num_replicas_in_sync,callbacks=[early_stop],validation_split=0.2,epochs=5)

Epoch 1/5


  num_elements)


Epoch 2/5
Epoch 3/5


KeyboardInterrupt: 

In [24]:
model.evaluate([input_tokens,attention_masks],y_train_input)



[0.08805657923221588, 0.9778053164482117]

In [25]:
plt.plot(r.history['loss'],label='loss')
plt.plot(r.history['accuracy'],label='accuracy')
plt.plot(r.history['val_accuracy'],label='val_accuracy')
plt.legend()

NameError: name 'r' is not defined

In [46]:
pred=model.predict([test_input_tokens,test_attention_masks]).argmax(-1)

In [47]:
pd.DataFrame(np.array(pred)).value_counts()

1    1793
0    1774
2    1628
dtype: int64

In [27]:
sub_data=pd.read_csv('../input/contradictory-my-dear-watson/sample_submission.csv')
y_true=sub_data['prediction'].values
sub_data.head()

Unnamed: 0,id,prediction
0,c6d58c3f69,1
1,cefcc82292,1
2,e98005252c,1
3,58518c10ba,1
4,c32b0d16df,1


In [48]:
pd.DataFrame({'id':list(sub_data['id']) , 'prediction': list(pred)}).to_csv('submission.csv',index=False)

In [30]:

# def custom_train(epochs, hy_data, prem_data):
# #   train_loss,train_accuracy=[],[]

#     for epoch in range(1,epochs+1):
#         # batch_data=len(hy_inputs_ids)//batches
#         print('\n')
#         print("for epoch : ", epoch)
#         print('\n')

#         for i,((xh,yh),(xp,yp)) in enumerate(zip(hy_data,prem_data)):
#             with tf.GradientTape() as tape:
#                 logits=model(xh,xp)
#                 loss_val=loss_fn(yh,logits)
#             grad =tape.gradient(loss_val,model.trainable_weights)
#             optimizer.apply_gradients(zip(grad,model.trainable_weights))
#             if i % 50 == 0:
#                   print("Training loss (for one batch) at step",i, float(loss_val))
#                   epoch_loss_avg.update_state(loss_val)
#                   i+=1

#         # train_loss.append(epoch_loss_avg.result())
# #         # train_accuracy.append(epoch_accuracy.result())

# #         print("Avg training loss for epoch {}: {}".format(epoch,epoch_accuracy.result().numpy()))
# #         print('\n')
# #         print("training accuracy for epoch {}: {}".format(epoch,epoch_accuracy.result().numpy()))
# #         print('\n')


# custom_train(10,hy_data,prem_data)


# 