# Data Import

In [1]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import matplotlib.pyplot as plt
from transformers import TFBertModel, BertTokenizer, DistilBertTokenizer, TFDistilBertModel
from sklearn.model_selection import train_test_split
from sklearn import metrics


2023-05-03 12:55:22.594701: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-03 12:55:22.657680: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-03 12:55:22.658628: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_json('sarcasm.json',lines=True)
labels = data.is_sarcastic.values
sentences = data.headline.values
data.head()


Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


# DistillBert

In [3]:
train_sents,test_sents, train_labels, test_labels  = train_test_split(sentences,labels,test_size=0.1)

In [4]:
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME,do_lower_case = True)

def encoder_dis(sentences):
  ids = []
  for sentence in sentences:
    encoding = tokenizer.encode_plus(
    sentence,
    max_length=16,
    truncation = True,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=False)
    ids.append(encoding['input_ids'])
  return ids

train_ids_dis = encoder_dis(train_sents)
test_ids_dis = encoder_dis(test_sents) 



In [5]:
train_ids_dis = tf.convert_to_tensor(train_ids_dis)
test_ids_dis = tf.convert_to_tensor(test_ids_dis)
test_labels_dis = tf.convert_to_tensor(test_labels)
train_labels_dis = tf.convert_to_tensor(train_labels)

In [6]:
distillbert_encoder = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
input_word_ids = tf.keras.Input(shape=(16,), dtype=tf.int32, name="input_word_ids")  
embedding = distillbert_encoder([input_word_ids])
dense = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(embedding[0])
dense = tf.keras.layers.Dense(128, activation='relu')(dense)
dense = tf.keras.layers.Dropout(0.2)(dense)   
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense)    

model_dis = tf.keras.Model(inputs=[input_word_ids], outputs=output)  

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [7]:
model_dis.compile(tf.keras.optimizers.Adam(1e-5), loss='binary_crossentropy', metrics=['accuracy', 'f1score'])

In [9]:
trained_dis = model_dis.fit(x = train_ids_dis, y = train_labels_dis, epochs = 3, verbose = 1, batch_size = 32, validation_split=0.1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In-distribution Test

In [33]:
predictions_indis = [1 * (x[0]>=0.5) for x in model_dis.predict(test_ids_dis)]
accuracy = metrics.accuracy_score(test_labels_dis, predictions_indis)
precision = metrics.precision_score(test_labels_dis, predictions_indis)
recall = metrics.recall_score(test_labels_dis, predictions_indis)
f1 = metrics.f1_score(test_labels_dis, predictions_indis)

print("accuracy",accuracy)
print("precision",precision)
print("recall",recall)
print("f1", f1)

accuracy 0.8784067085953878
precision 0.879154078549849
recall 0.8609467455621301
f1 0.8699551569506726


In [9]:
#generate ids for reddit test data
test_data1 = pd.read_csv('clean_reddit.csv')
test_labels1 = test_data1.label.values
test_sentences1 = test_data1.sent.values
test_data1.head()


test_ids1 = encoder_dis(test_sentences1)
test_ids1 = tf.convert_to_tensor(test_ids1)
test_labels_dis1 = tf.convert_to_tensor(test_labels1)
# print("first 5 test_ids1: ", test_ids1[:5])
# print("first 5 test_labels_dis1: ", test_labels_dis1[:5])


#generate ids for tweet test data
test_data2 = pd.read_csv('clean_tweet.csv')
test_labels2 = test_data2.label.values
test_sentences2 = test_data2.sent.values
test_data2.head()


test_ids2 = encoder_dis(test_sentences2)
test_ids2 = tf.convert_to_tensor(test_ids2)
test_labels_dis2 = tf.convert_to_tensor(test_labels2)
# print("first 5 test_ids2: ", test_ids2[:5])
# print("first 5 test_labels_dis2: ", test_labels_dis2[:5])






# Out-of-distribution Tests on Reddits and Tweets

In [32]:
predictions_tweets = [1 * (x[0]>=0.5) for x in model_dis.predict(test_ids2)]
accuracy = metrics.accuracy_score(test_labels_dis2, predictions_tweets)
precision = metrics.precision_score(test_labels_dis2, predictions_tweets)
recall = metrics.recall_score(test_labels_dis2, predictions_tweets)
f1 = metrics.f1_score(test_labels_dis2, predictions_tweets)

print("accuracy",accuracy)
print("precision",precision)
print("recall",recall)
print("f1", f1)


accuracy 0.5152143845089903
precision 0.44155844155844154
recall 0.049132947976878616
f1 0.08842652795838751


# For Baseline Testing (Untrained model)

In [14]:
distillbert_encoder = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
input_word_ids = tf.keras.Input(shape=(16,), dtype=tf.int32, name="input_word_ids")  
embedding = distillbert_encoder([input_word_ids])
dense = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(embedding[0])
dense = tf.keras.layers.Dense(128, activation='relu')(dense)
dense = tf.keras.layers.Dropout(0.2)(dense)   
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense)    

model_base = tf.keras.Model(inputs=[input_word_ids], outputs=output)  
model_base.compile(tf.keras.optimizers.Adam(1e-5), loss='binary_crossentropy', metrics=['accuracy'])

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [25]:
predictions_reddit_base = [1 * (x[0]>=0.5) for x in model_base.predict(test_ids1)]
accuracy = metrics.accuracy_score(test_labels_dis1, predictions_reddit_base)
precision = metrics.precision_score(test_labels_dis1, predictions_reddit_base)
recall = metrics.recall_score(test_labels_dis1, predictions_reddit_base)
f1 = metrics.f1_score(test_labels_dis1, predictions_reddit_base)
print("accuracy",accuracy)
print("precision",precision)
print("recall",recall)
print("f1", f1)


accuracy 0.5093914350112697
precision 0.510158013544018
recall 0.9970588235294118
f1 0.6749626679940268


In [29]:
predictions_tweet_base = [1 * (x[0]>=0.5) for x in model_base.predict(test_ids2)]
accuracy = metrics.accuracy_score(test_labels_dis2, predictions_tweet_base)
precision = metrics.precision_score(test_labels_dis2, predictions_tweet_base)
recall = metrics.recall_score(test_labels_dis2, predictions_tweet_base)
f1 = metrics.f1_score(test_labels_dis2, predictions_tweet_base)
print("accuracy",accuracy)
print("precision",precision)
print("recall",recall)
print("f1", f1)


accuracy 0.4778699861687414
precision 0.4782006920415225
recall 0.9985549132947977
f1 0.6467009826860084


In [30]:
predictions_news = [1 * (x[0]>=0.5) for x in model_base.predict(test_ids_dis)]
accuracy = metrics.accuracy_score(test_labels_dis, predictions_news)
precision = metrics.precision_score(test_labels_dis, predictions_news)
recall = metrics.recall_score(test_labels_dis, predictions_news)
f1 = metrics.f1_score(test_labels_dis, predictions_news)
print("accuracy",accuracy)
print("precision",precision)
print("recall",recall)
print("f1", f1)

accuracy 0.4744933612858141
precision 0.47335203366058903
recall 0.9985207100591716
f1 0.6422454804947668
