In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

from sklearn.model_selection import train_test_split

In [2]:
# Load the sarcasm corpus. A forward-slash path is used because it resolves on
# Windows, Linux and macOS alike, whereas a backslash separator only works on
# Windows.
main_data = pd.read_csv("sarcasm_v2/GEN-sarc-notsarc.csv")

# Build the working frame without the 'id' column. `drop` returns a new frame,
# so `main_data` keeps the raw CSV contents and this cell can be re-run safely.
data = main_data.drop(columns=['id'])

# Encode the string labels as integers.
classes = {"notsarc": 0, "sarc": 1}
data["class"] = data["class"].map(classes)

# Series.map yields NaN for any label missing from `classes`; fail loudly here
# instead of letting NaN reach the later integer cast of the labels.
if data["class"].isna().any():
    unexpected = sorted(set(main_data["class"].dropna().unique()) - set(classes))
    raise ValueError(
        f"'class' column contains missing or unmapped labels: {unexpected}"
    )

data

Unnamed: 0,class,text
0,0,"If that's true, then Freedom of Speech is doom..."
1,0,Neener neener - is it time to go in from the p...
2,0,"Just like the plastic gun fear, the armour pie..."
3,0,So geology is a religion because we weren't he...
4,0,Well done Monty. Mark that up as your first ev...
...,...,...
6515,1,depends on when the baby bird died. run alon...
6516,1,"ok, sheesh, to clarify, women who arent aborti..."
6517,1,so.. eh?? hows this sound? will it fly w...
6518,1,"I think we should put to a vote, the right of ..."


In [3]:
# Extract the raw texts and their labels from the frame as NumPy arrays;
# labels are cast to int32.
X = np.array(data['text'])
y = np.asarray(data['class']).astype('int32')

In [4]:
# Hold out 20% of the corpus as the test set, then carve 20% of what remains
# off as the validation set. Both splits use the same fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Report the size and label balance of every split, in train/valid/test order.
for split_name, split_texts, split_labels in (
    ('Train', X_train, y_train),
    ('Valid', X_valid, y_valid),
    ('Test', X_test, y_test),
):
    print(f'{split_name} data len:{len(split_texts)!s}')
    print(f'Class distribution\n{pd.Series(split_labels).value_counts()!s}')

Train data len:4172
Class distribution
0    2087
1    2085
dtype: int64
Valid data len:1044
Class distribution
1    531
0    513
dtype: int64
Test data len:1304
Class distribution
0    660
1    644
dtype: int64


In [5]:
# Instantiate the RoBERTa tokenizer from the 'roberta-base' pretrained
# checkpoint (the same checkpoint name the classifier is loaded from). It is
# used below to encode the train/validation/test texts.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [6]:
# Tokenize each split separately with identical settings (truncation and
# padding both enabled). The tokenizer is given plain Python lists of strings.
train_encodings, val_encodings, test_encodings = [
    tokenizer(split_texts.tolist(), truncation=True, padding=True)
    for split_texts in (X_train, X_valid, X_test)
]

In [7]:
def _as_tf_dataset(encodings, labels):
    """Pair tokenizer output with its labels as a tf.data.Dataset.

    Each element is a ``(features, label)`` tuple, where ``features`` is the
    dict form of ``encodings``. No batching is applied here.
    """
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_dataset = _as_tf_dataset(train_encodings, y_train)
val_dataset = _as_tf_dataset(val_encodings, y_valid)
test_dataset = _as_tf_dataset(test_encodings, y_test)

In [8]:
# Load pretrained RoBERTa with a sequence-classification head; the head is
# newly initialised and must be trained on this task.
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Compile with an accuracy metric. Without any compiled metric,
# `model.evaluate` returns only the scalar loss, so unpacking its result into
# `loss, accuracy` fails. SparseCategoricalAccuracy compares the arg-max of the
# model output with the integer class labels.
model.compile(
    optimizer=optimizer,
    loss=model.compute_loss,  # can also use any keras loss fn
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')],
)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune for 3 epochs on shuffled mini-batches of 16.
# - The validation set is batched as well: an unbatched dataset would hand the
#   model single examples without a batch dimension.
# - `batch_size` is not passed to `fit`, because the datasets already produce
#   batches; Keras documents that it must not be specified for dataset inputs.
model.fit(
    train_dataset.shuffle(1000).batch(16),
    validation_data=val_dataset.batch(16),
    epochs=3,
)

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
 25/522 [>.............................] - ETA: 4:11:56 - loss: 0.6932

In [None]:
# Batch the test set before evaluating; the unbatched dataset yields single
# examples without a batch dimension.
test_batches = test_dataset.batch(16)

# `evaluate` returns a bare scalar loss when the model was compiled without
# metrics and a list [loss, metric, ...] otherwise. Normalise both shapes so
# the unpacking below never fails.
loss, *metric_values = np.atleast_1d(model.evaluate(test_batches)).tolist()

if metric_values:
    # The first compiled metric is taken to be accuracy, as the original
    # two-value unpacking assumed.
    accuracy = metric_values[0]
else:
    # No metric was compiled: derive accuracy from the predicted logits.
    # Depending on the library version, `predict` may return an output object
    # with a `.logits` attribute, a dict, a tuple, or a plain array.
    outputs = model.predict(test_batches)
    logits = getattr(outputs, 'logits', outputs)
    if isinstance(logits, dict):
        logits = logits['logits']
    elif isinstance(logits, (list, tuple)):
        logits = logits[0]
    # Neither batching nor predict reorders examples, so predictions align
    # with y_test.
    accuracy = float(np.mean(np.argmax(logits, axis=-1) == y_test))

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

In [None]:
# Name the export after the accuracy measured in the evaluation cell, then
# save the model together with its optimizer state.
model_name = f'RoBERTa_Sarcasm_GEN_{accuracy!s}'
model.save(model_name, include_optimizer=True)