In [None]:
#Code for installing important libraries
!pip install transformers
!pip install simpletransformers
! pip install -U git+https://github.com/huggingface/transformers.git
! pip install -U git+https://github.com/huggingface/accelerate.git

In [None]:
#Code for importing libraries important to code
import json
import pandas as pd
from sklearn.metrics import accuracy_score
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [3]:
#Code for initializing data from the train and validation datasets.
def load_dataset(file_name):
    """Read a labelled JSON-lines file into a DataFrame with 'text' and 'labels'.

    Each non-blank line is parsed as one JSON object. The text is built from
    the first 'postText' entry, the 'targetTitle' and the space-joined
    'targetParagraphs'. Only examples whose first tag is 'phrase', 'multi' or
    'passage' are kept; examples with missing or empty 'tags' are skipped.

    Parameters
    ----------
    file_name : str or path-like
        Location of the UTF-8 encoded .jsonl file.

    Returns
    -------
    pandas.DataFrame
        One row per kept example with columns 'text' and 'labels' (the tag
        string). Both columns exist even when no example is kept.
    """
    allowed_labels = ('phrase', 'multi', 'passage')
    data = []
    with open(file_name, encoding='utf8') as f:
        for line in f:
            # Blank lines (e.g. a stray empty line at the end) are not valid JSON.
            if not line.strip():
                continue
            example = json.loads(line)
            # A missing or empty 'tags' list means the example carries no label.
            tags = example.get('tags') or [None]
            label = tags[0]
            if label not in allowed_labels:
                continue
            post_text = example['postText'][0]
            title = example['targetTitle']
            paragraphs = ' '.join(example['targetParagraphs'])
            # NOTE(review): title and paragraphs are joined with no separator;
            # kept as-is so the text format is unchanged.
            data.append({'text': post_text + ' - ' + title + paragraphs, 'labels': label})
    # Explicit columns keep the schema stable when `data` is empty.
    return pd.DataFrame(data, columns=['text', 'labels'])
# Placeholder paths: point these at the local train / validation .jsonl files.
train_data = load_dataset(r'Your location where you stored train dataset in .jsonl format')
validation_data = load_dataset(r'Your location where you stored validation dataset in .jsonal format')
# Encode the spoiler-type strings as integer class ids. An explicit mapping
# yields an integer column directly, instead of relying on the implicit
# object-to-int downcasting of a list-to-list Series.replace (deprecated in
# recent pandas releases).
label_to_id = {'phrase': 0, 'multi': 1, 'passage': 2}
train_data['labels'] = train_data['labels'].map(label_to_id)
validation_data['labels'] = validation_data['labels'].map(label_to_id)

In [5]:
#Code for checking the number of examples per label (0 = phrase, 1 = multi, 2 = passage)
train_data.groupby('labels').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1367,1364,Science Says This Is The Best Movie To Watch O...,2
1,559,559,Passion is overrated — 7 work habits you need ...,1
2,1274,1274,"Wes Welker Wanted Dinner With Tom Brady, But P...",1


In [8]:
#Code for removing stopwords from the train and validation dataset
import numpy as np
from gensim.parsing.preprocessing import remove_stopwords
def train_process(df):
    """Strip stopwords from every entry of df['text'].

    Returns a NumPy array built from one single-element list per row, each
    holding the row's text after remove_stopwords has been applied.
    """
    cleaned_rows = [[remove_stopwords(document)] for document in df['text']]
    return np.asarray(cleaned_rows)
# Build the stopword-free text arrays for the training and validation splits.
X_train = train_process(train_data)
X_val = train_process(validation_data)
        
# Sanity check: number of rows in each split.
print(len(X_train))
print(len(X_val))

3200
400


In [9]:
#Code for stripping special characters from the train and validation text arrays
characters = ['!','"','#','$','%','&','(',')','*','+','/',':',';','<','=','>','@','^','`','|','~','\t','[',']','{','}','\\','.','-']
# Each row holds a single string at index 0; the cleaned string is written back in place.
for split in (X_train, X_val):
    for row in split:
        cleaned = row[0]
        for symbol in characters:
            cleaned = cleaned.replace(symbol, "")
        row[0] = cleaned

# Row counts are unchanged by the cleaning; printed as a sanity check.
print(len(X_train))
print(len(X_val))

3200
400


In [10]:
#Code for preprocessing Y_train and Y_val (one-hot encoding the labels)
import numpy as np
from keras.utils import to_categorical
def label_process(df):
    """Return the values of df['labels'] as a plain Python list, in row order."""
    return list(df['labels'])
# One-hot encode the integer class ids into 3 columns (num_classes=3) for each split.
Y_train = to_categorical(np.asarray(label_process(train_data)), num_classes=3)
Y_val = to_categorical(np.asarray(label_process(validation_data)), num_classes=3)
# Sanity check: shape of the training targets and the encoded validation targets.
print(Y_train.shape)
print(Y_val)

(3200, 3)
[[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [11]:
#Code for initializing the BERT model (1024 embedding size) used as the encoder of the classification model.
# Both layers are loaded from TF Hub; the encoder handle name indicates L-24, H-1024, A-16.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/4")

In [12]:
#Code for creating embeddings through BERT and initializing the decoder using bidirectional GRUs (the main structure of the model)
# The model takes raw strings; shape=() means each example is one scalar string.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Project the encoder's per-token 'sequence_output' to 512 units before the recurrent stack.
l1 = tf.keras.layers.Dense(512, activation='leaky_relu', name='intermediate_layer')(outputs['sequence_output'])
# Four stacked bidirectional GRUs (128 units per direction), each returning the full sequence
# so the next recurrent layer receives one vector per token.
l3 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True, kernel_regularizer=None))(l1)
l6 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True, kernel_regularizer=None))(l3)
l7 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True, kernel_regularizer=None))(l6)
l8 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True, kernel_regularizer=None))(l7)
# The last bidirectional GRU does not set return_sequences, so it emits a single vector per example.
l10 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, kernel_regularizer=None))(l8)
l12 = tf.keras.layers.Dense(128, activation='leaky_relu')(l10)
# 3-way softmax head: one probability per class.
l = tf.keras.layers.Dense(3, activation='softmax', name="output")(l12)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [14]:
#Metrics used to evaluate the model, plus the loss and optimizer used for training.
# The model emits a 3-way softmax trained on one-hot labels, so accuracy must be
# CategoricalAccuracy (does the argmax match the true class?). BinaryAccuracy
# would threshold each of the three outputs independently and report an
# inflated figure. The 'precision' and 'recall' names are kept because the
# checkpoint callback monitors 'val_recall'.
METRICS = [
        tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer=tf.keras.optimizers.experimental.AdamW(learning_rate=0.0005),
              loss='categorical_crossentropy',
              metrics=METRICS)

In [15]:
#Checkpoint callback: saves the full model (not only the weights) to the given path
# whenever the monitored validation recall reaches a new maximum.
# The path below is a placeholder and must be replaced before running.
checkpoint_filepath = 'Your path where you want to store your model'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_recall',
    mode='max',
    verbose=1,
    save_best_only=True)

In [None]:
# Train for 10 epochs with batch size 5, evaluating on the validation split each epoch;
# the checkpoint callback is attached so the best model is saved during training.
model.fit(X_train, Y_train, epochs=10, batch_size=5, validation_data=(X_val, Y_val), callbacks=[model_checkpoint_callback])

In [17]:
#Code for initializing data from the test dataset.
def load_dataset(file_name):
    """Read an unlabelled JSON-lines test file into a DataFrame with 'id' and 'text'.

    This redefines the earlier labelled loader of the same name for the rest
    of the notebook. Each non-blank line is parsed as one JSON object; the
    text is built from the first 'postText' entry, the 'targetTitle' and the
    space-joined 'targetParagraphs', in the same format as the training text.

    Parameters
    ----------
    file_name : str or path-like
        Location of the UTF-8 encoded .jsonl file.

    Returns
    -------
    pandas.DataFrame
        One row per example with columns 'id' and 'text'. Both columns exist
        even when the file holds no examples.
    """
    data = []
    with open(file_name, encoding='utf8') as f:
        for line in f:
            # Blank lines (e.g. a stray empty line at the end) are not valid JSON.
            if not line.strip():
                continue
            example = json.loads(line)
            post_text = example['postText'][0]
            title = example['targetTitle']
            # Named example_id so the builtin id() is not shadowed.
            example_id = example['id']
            paragraphs = ' '.join(example['targetParagraphs'])
            data.append({'id': example_id, 'text': post_text + ' - ' + title + paragraphs})
    # Explicit columns keep the schema stable when `data` is empty.
    return pd.DataFrame(data, columns=['id', 'text'])
# NOTE(review): absolute Kaggle input path — adjust when running outside that environment.
test_data = load_dataset('/kaggle/input/clickbait-detection-msci641-s23/test.jsonl')

In [19]:
#Code for removing stopwords from the test dataset
def preprocess(df):
    """Strip stopwords from every entry of df['text'].

    Returns a NumPy array built from one single-element list per row, each
    holding the row's text after remove_stopwords has been applied.
    """
    cleaned_rows = [[remove_stopwords(document)] for document in df['text']]
    return np.asarray(cleaned_rows)
# Build the stopword-free text array for the test split and print its row count.
X_test = preprocess(test_data)
print(len(X_test))

400


In [20]:
#Code for stripping special characters from the test text array
characters = ['!','"','#','$','%','&','(',')','*','+','/',':',';','<','=','>','@','^','`','|','~','\t','[',']','{','}','\\','.','-']
# Each row holds a single string at index 0; the cleaned string is written back in place.
for row in X_test:
    cleaned = row[0]
    for symbol in characters:
        cleaned = cleaned.replace(symbol, "")
    row[0] = cleaned
print(len(X_test))

400


In [21]:
#Code for reloading the saved model (the checkpoint with the best validation recall)
# NOTE(review): the path is a placeholder; presumably it should match checkpoint_filepath — confirm.
# This rebinds `model`, replacing the in-memory model left over from training.
model = tf.keras.models.load_model('Your path where you stored your model')
# Predict on the cleaned test texts.
y_pred = model.predict(X_test)



In [22]:
# Display the raw per-class values returned by model.predict.
y_pred

array([[0.83839005, 0.09148254, 0.07012741],
       [0.05884722, 0.05188123, 0.88927156],
       [0.3443577 , 0.11773381, 0.5379085 ],
       ...,
       [0.7218329 , 0.18183222, 0.09633487],
       [0.1109169 , 0.04393053, 0.8451526 ],
       [0.83313924, 0.10784314, 0.05901757]], dtype=float32)

In [23]:
#Code for converting predicted probabilities to spoiler-type labels
# Class ids follow the training encoding: 0 = phrase, 1 = multi, 2 = passage.
class_names = ['phrase', 'multi', 'passage']
# argmax yields exactly one label per row (the first maximum wins on ties).
# The previous chain of strict ">" comparisons appended nothing when the top
# probabilities tied, which left fewer labels than test rows and shifted every
# later prediction against its id when the frames were concatenated.
lst1 = [class_names[class_id] for class_id in np.argmax(y_pred, axis=1)]
print(len(lst1))
df_1 = pd.DataFrame(lst1, columns=['spoilerType'])
print(df_1)

400
    spoilerType
0        phrase
1       passage
2       passage
3        phrase
4        phrase
..          ...
395     passage
396      phrase
397      phrase
398     passage
399      phrase

[400 rows x 1 columns]


In [25]:
# Place the predictions next to the test rows, then keep only the id and label columns.
final_df = pd.concat((test_data, df_1), axis='columns')
final_df_1 = final_df.drop(columns=['text'])
print(final_df_1)

      id spoilerType
0      0      phrase
1      1     passage
2      2     passage
3      3      phrase
4      4      phrase
..   ...         ...
395  395     passage
396  396      phrase
397  397      phrase
398  398     passage
399  399      phrase

[400 rows x 2 columns]


In [27]:
# Write the id / spoilerType predictions to CSV without the DataFrame index column.
final_df_1.to_csv('prediction_task1_GRU.csv', index=False)