In [None]:
!pip install tensorflow



In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json

# Load the data: the file holds one JSON object per line (JSON Lines).
# Blank lines are skipped so a stray empty line does not make json.loads raise.
with open('train.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Load the pretrained T5 checkpoint; the model and its tokenizer are both
# looked up under the same checkpoint name.
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Function to generate spoilers using T5 model
def generate_spoiler(prompt):
    """Generate spoiler text for ``prompt`` using the module-level T5 objects.

    The prompt is encoded with ``tokenizer`` (truncated to at most 512 tokens),
    passed to ``model.generate`` with two beams and a 150-token output limit,
    and the first returned sequence is decoded back to text.

    Args:
        prompt: Text to condition the generation on.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    encoded_prompt = tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generation_settings = dict(
        max_length=150,
        num_beams=2,
        length_penalty=0.8,
        early_stopping=True,
    )
    candidates = model.generate(encoded_prompt, **generation_settings)
    first_candidate = candidates[0]
    return tokenizer.decode(first_candidate, skip_special_tokens=True)

# For every record: build the prompt from its first post text and its paragraphs
# joined by newlines, generate a spoiler, and print one JSON object per record.
for record in data:
    record_id = record['uuid']
    post_text = record['postText'][0]
    document_text = '\n'.join(record['targetParagraphs'])
    spoiler_text = generate_spoiler(
        f"Clickbait post: {post_text}\nLinked document: {document_text}"
    )
    print(json.dumps({"uuid": record_id, "spoiler": spoiler_text}))

In [None]:
from google.colab import drive
# Mount Google Drive so the dataset files under MyDrive can be read from Colab.
drive.mount('/content/drive')
train_file_path = '/content/drive/MyDrive/train.jsonl'
# The validation split must live in its own file and must not alias the training file.
# NOTE(review): assumes it is stored as validation.jsonl next to train.jsonl -- confirm.
val_file_path = '/content/drive/MyDrive/validation.jsonl'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import pandas as pd

# Read the training split (one JSON object per line) into a list of dicts.
# The encoding is explicit because the posts contain non-ASCII characters and the
# platform default encoding is not guaranteed to be UTF-8; blank lines are skipped
# so they do not make json.loads raise.
with open(train_file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)

# Combine 'postText' and 'targetTitle': every postText entry joined by spaces,
# followed by a space and the target title.
df['text'] = df['postText'].apply(lambda x: ' '.join(x)) + ' ' + df['targetTitle']

# Collapse each record's list of tags into a single comma-separated string label.
df['tags'] = df['tags'].apply(lambda x: ','.join(map(str, x)))


# Extracted data
texts = df['text']
labels = df['tags']

print(texts)
print(labels)

0       Wes Welker Wanted Dinner With Tom Brady, But P...
1       NASA sets date for full recovery of ozone hole...
2       This is what makes employees happy -- and it's...
3       Passion is overrated — 7 work habits you need ...
4       The perfect way to cook rice so that it's perf...
                              ...                        
3195    Has Facebook's video explosion completely shak...
3196    Cop Is Eating At A Chili's When Teen Hands Him...
3197    5 popular myths about visible signs of aging t...
3198    You need to see this Twitter account that pred...
3199    GOP congressman comes out for gay marriage Pen...
Name: text, Length: 3200, dtype: object
0       passage
1        phrase
2        phrase
3         multi
4        phrase
         ...   
3195    passage
3196    passage
3197      multi
3198     phrase
3199     phrase
Name: tags, Length: 3200, dtype: object


In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of tag strings, then map every tag to its integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
encoded_labels = label_encoder.transform(labels)
print(encoded_labels)

[1 2 2 ... 0 2 2]


In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer class ids; 'encoded_labels' is expected to contain
# only 0, 1 and 2, giving one row per sample and three columns.
one_hot_labels = to_categorical(encoded_labels, 3)
print(one_hot_labels)

[[0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# start from the same state on every run.
tf.keras.utils.set_random_seed(42)

# Tokenize and pad sequences.
# NOTE: `tokenizer` and `model` are rebound in this cell, so the T5 tokenizer and
# model created under the same names earlier in the notebook are replaced once
# this cell has run.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
total_words = len(tokenizer.word_index) + 1  # +1 because word indices start at 1; 0 is the padding id

input_sequences = tokenizer.texts_to_sequences(texts)
padded_input = pad_sequences(input_sequences)  # pad every sequence to the length of the longest one

# Training targets: the one-hot encoded tag labels.
tags = one_hot_labels

# Build LSTM model
model = Sequential()
model.add(Embedding(total_words, output_dim=100, input_length=padded_input.shape[1]))
model.add(LSTM(100))  # 100 units
model.add(Dense(3, activation='softmax'))  # one output per class
# The 'adam' string selects the optimizer with its default learning rate (0.001).
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model; validation_split holds out the last 20% of the samples.
model.fit(padded_input, tags, epochs=8, validation_split=0.2)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.src.callbacks.History at 0x79e84c11f460>

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# start from the same state on every run.
tf.keras.utils.set_random_seed(42)

# Tokenize and pad sequences.
# NOTE: `tokenizer` and `model` are rebound in this cell and replace whatever
# objects were previously stored under those names.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
total_words = len(tokenizer.word_index) + 1  # +1 because word indices start at 1; 0 is the padding id

input_sequences = tokenizer.texts_to_sequences(texts)
padded_input = pad_sequences(input_sequences)  # pad every sequence to the length of the longest one

# Training targets: the one-hot encoded tag labels.
tags = one_hot_labels

# Build BiLSTM model
model = Sequential()
model.add(Embedding(total_words, output_dim=100, input_length=padded_input.shape[1]))
model.add(Bidirectional(LSTM(100)))  # BiLSTM layer: a 100-unit LSTM run in both directions
model.add(Dense(3, activation='softmax'))  # one output per class
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model; validation_split holds out the last 20% of the samples.
model.fit(padded_input, tags, epochs=8, validation_split=0.2)


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.src.callbacks.History at 0x79e863953b20>