In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)


In [None]:
# Read the competition's training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/contradictory-my-dear-watson/train.csv",
        "../input/contradictory-my-dear-watson/test.csv",
    )
)

In [None]:
import tensorflow as tf
# Choose the tf.distribute strategy used for model building and training.
# The whole TPU bring-up (resolve cluster -> connect -> initialise -> build
# strategy) lives inside the try block, so a ValueError raised by any of these
# steps triggers the fallback below.
# NOTE(review): presumably ValueError is what TPUClusterResolver raises when no
# TPU is attached -- confirm against the installed TensorFlow version.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU
# The replica count is reused later to scale the training batch size.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)


In [None]:
# Print pandas' summary of the training frame (DataFrame.info).
train.info()


In [None]:
# Number of distinct values in the `id` column of the training frame.
train['id'].nunique()


In [None]:
# How many distinct languages appear in the training frame.
train['language'].nunique()


In [None]:
# The distinct language names present in the training frame.
train['language'].unique()


In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Bar chart of how many training rows carry each `label` value.
sns.countplot(data=train, x='label')


In [None]:
# Row count per language, drawn on a wide figure so the language names fit.
lang_fig, lang_ax = plt.subplots(figsize=(15, 8))
sns.countplot(data=train, x='language', ax=lang_ax)

In [None]:
# Same per-language count, keyed on the `lang_abv` column instead.
sns.countplot(data=train, x='lang_abv')


In [None]:
def _language_subset(language):
    """Return an independent copy of the `train` rows for one language.

    The subset's shape is printed before it is returned, so the output shows
    one (rows, columns) tuple per language in call order.
    """
    subset = train.loc[train.language == language].copy()
    print(subset.shape)
    return subset


# One DataFrame per language found in the training data.
df_train_en = _language_subset('English')
df_train_fr = _language_subset('French')
df_train_th = _language_subset('Thai')
df_train_tr = _language_subset('Turkish')
df_train_ur = _language_subset('Urdu')
df_train_ru = _language_subset('Russian')
df_train_bg = _language_subset('Bulgarian')
df_train_de = _language_subset('German')
df_train_ar = _language_subset('Arabic')
df_train_zh = _language_subset('Chinese')
df_train_hi = _language_subset('Hindi')
df_train_sw = _language_subset('Swahili')
df_train_vi = _language_subset('Vietnamese')
df_train_es = _language_subset('Spanish')
# NOTE(review): the Greek subset uses the suffix `_ei`; the name is kept as-is
# so any later reference to it keeps working.
df_train_ei = _language_subset('Greek')

In [None]:
# Word cloud over every training hypothesis: join them into one string,
# exclude the wordcloud package's STOPWORDS, then render the image.
text = " ".join(train.hypothesis)
stopwords = set(STOPWORDS)

cloud_options = dict(
    stopwords=stopwords,
    max_font_size=50,
    max_words=500,
    width=600,
    height=400,
    background_color="white",
)
wordcloud = WordCloud(**cloud_options).generate(text)

plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the training premises.
# This cell used to repeat the hypothesis word cloud from the preceding cell
# verbatim; it now reads the `premise` column so the two text fields can be
# compared side by side.
text = " ".join(txt for txt in train.premise)
stopwords = set(STOPWORDS)

wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=500,
                      width = 600, height = 400,
                      background_color="white").generate(text)
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Word cloud of premises")
plt.axis("off")
plt.show()

In [None]:
# Add the character length of each premise / hypothesis as new columns on `train`.
# The vectorised .str.len() accessor replaces pd.to_numeric(list(map(len, ...)))
# and returns NaN for a missing value instead of raising a TypeError.
train['premise_len'] = train['premise'].str.len()
train['hypothesis_len'] = train['hypothesis'].str.len()

In [None]:
# Histogram of the `premise_len` column.
premise_ax = train['premise_len'].plot.hist()
premise_ax.set_title('Length of premise')
premise_ax.grid()
plt.show()

In [None]:
# Histogram of the `hypothesis_len` column.
# `hypothesis_len` is computed over the whole `train` frame (every language),
# so the title no longer claims the data is English-only and now mirrors the
# premise histogram's title.
train.hypothesis_len.plot(kind='hist')
plt.title('Length of hypothesis')
plt.grid()
plt.show()

In [None]:
#!pip install transformers
from transformers import TFAutoModel,AutoTokenizer
import tensorflow as tf
#!pip install sentencepiece


In [None]:
# Tokenizer that matches the pretrained checkpoint loaded for the model.
tokenizer = AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')


def _encode_pairs(frame):
    """Encode each (premise, hypothesis) row of `frame` as one padded sequence.

    Every sequence is padded or truncated to exactly 100 tokens and the
    attention mask is returned alongside the token ids.
    """
    sentence_pairs = frame[['premise', 'hypothesis']].values.tolist()
    return tokenizer.batch_encode_plus(
        sentence_pairs,
        padding='max_length',
        max_length=100,
        truncation=True,
        return_attention_mask=True,
    )


train_enc = _encode_pairs(train)
test_enc = _encode_pairs(test)

# Convert ids and masks to int32 tensors and bundle them in dicts whose keys
# are the names expected by the model's inputs.
train_tf1 = tf.convert_to_tensor(train_enc['input_ids'], dtype=tf.int32)
train_tf2 = tf.convert_to_tensor(train_enc['attention_mask'], dtype=tf.int32)
train_input = {'input_word_ids': train_tf1, 'input_mask': train_tf2}

test_tf1 = tf.convert_to_tensor(test_enc['input_ids'], dtype=tf.int32)
test_tf2 = tf.convert_to_tensor(test_enc['attention_mask'], dtype=tf.int32)
test_input = {'input_word_ids': test_tf1, 'input_mask': test_tf2}

In [None]:
# Spot-check the encoding at integer index 100 of the tokenized training set.
# NOTE(review): integer indexing on the tokenizer output presumably relies on a
# "fast" tokenizer backend -- confirm for this checkpoint.
train_enc[100]

In [None]:
# Build and compile the classifier inside the distribution strategy's scope so
# its variables are created under that strategy.
with strategy.scope():
    # Two fixed-length (100 token) int32 inputs; their names match the keys of
    # the train_input / test_input dicts.
    input_ids = tf.keras.Input(shape = (100,), dtype = tf.int32,name='input_word_ids')
    input_mask=tf.keras.Input(shape=(100,),dtype=tf.int32,name='input_mask')
    roberta = TFAutoModel.from_pretrained('joeddav/xlm-roberta-large-xnli')
    # Keep element 0 of the transformer output.
    # NOTE(review): presumably the per-token hidden states of shape
    # (batch, 100, hidden) -- confirm against the transformers version in use.
    roberta = roberta([input_ids,input_mask])[0]
    # Average over the sequence axis, then classify into 3 classes.
    out = tf.keras.layers.GlobalAveragePooling1D()(roberta)
    out = tf.keras.layers.Dense(3, activation = 'softmax')(out)
    model = tf.keras.Model(inputs = [input_ids,input_mask], outputs = out)
    model.compile(
                        # `learning_rate` replaces the deprecated `lr` keyword,
                        # which newer Keras releases no longer accept.
                        optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-5),
                        loss = 'sparse_categorical_crossentropy',
                        metrics = ['accuracy'])
    model.summary()

In [None]:
# Show the replica count again; the training batch size is 10 x this value.
strategy.num_replicas_in_sync


In [None]:
# Early stopping with a patience of 2 epochs; the best weights seen are
# restored when training halts. (The monitored quantity is the callback's
# default.)
es = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)

# Fit for at most 20 epochs, holding out 20% of the data for validation and
# scaling the batch size with the number of replicas.
history = model.fit(
    train_input,
    train.label,
    validation_split=0.2,
    epochs=20,
    batch_size=10 * strategy.num_replicas_in_sync,
    callbacks=[es],
    verbose=1,
)

In [None]:
# NOTE(review): no figure is created in this cell, so this call presumably
# renders nothing -- likely a leftover that can be removed.
plt.show()

In [None]:
# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.figure(figsize=(100, 100))
plt.savefig("Model Loss")
plt.show()

In [None]:
# Plot training & validation accuracy values.
# The figure is created BEFORE plotting. Previously plt.figure(figsize=(100, 100))
# ran after the curves were drawn, which opened a new, empty 100x100-inch figure
# and made savefig write a blank image.
plt.figure(figsize=(10, 6))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# 'val_accuracy' is measured on the validation split, not on the test set,
# so the legend entry is labelled accordingly.
plt.legend(['Train', 'Validation'], loc='upper left')
# Save before show so the rendered figure is what is written to disk.
plt.savefig("Model Accuracy v4")
plt.show()

## Finding R2 Score


In [None]:
from sklearn.metrics import r2_score

In [None]:
## Finding R2 Score
# Predicted class = index of the highest probability in each row of the
# model's output on the training inputs.
# NOTE(review): R^2 is a regression metric and the labels here are class ids;
# accuracy or a confusion matrix would likely be more meaningful -- confirm intent.
train_probs = model.predict(train_input)
y_pred = list(np.argmax(train_probs, axis=1))
y_true = train.label.to_numpy()
r2_score(y_true, y_pred)


In [None]:
# Predicted class for every test row, followed by the count of each class.
test_probs = model.predict(test_input)
pred = list(np.argmax(test_probs, axis=1))
pd.DataFrame(pred).value_counts()

## Saving Output

In [None]:
# Write the (id, prediction) pairs to output.csv without the index column.
submission = pd.DataFrame({'id': test['id'], 'prediction': pred})
submission.to_csv('output.csv', index=False)

In [None]:
# Keep the same (id, prediction) table in memory for inspection.
final_pred = pd.DataFrame(dict(id=test['id'], prediction=pred))

In [None]:
# Preview the first five rows of the prediction table.
final_pred.head(n=5)