In [2]:
import os
import json
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Load the training data
with open('/content/drive/MyDrive/train.jsonl', 'r', encoding='utf-8') as file:
    train_data = [json.loads(line) for line in file]

# Load the validation data
with open('/content/drive/MyDrive/validation.jsonl', 'r', encoding='utf-8') as file:
    val_data = [json.loads(line) for line in file]

# Separate features, labels, and spoiler types for training data
X_train = [{'postText': entry['postText'][0], 'targetTitle': ' '.join(entry['targetTitle'])} for entry in train_data]
y_train = [entry['tags'][0] for entry in train_data]  # Assuming each entry has a single tag

# Separate features, labels, and spoiler types for validation data
X_val = [{'postText': entry['postText'][0], 'targetTitle': ' '.join(entry['targetTitle'])} for entry in val_data]
y_val = [entry['tags'][0] for entry in val_data]  # Assuming each entry has a single tag

# Continue with the rest of the code...
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode the training data
X_train_encoded = [
    {
        'input_ids': tokenizer(entry['postText'] + ' ' + entry['targetTitle'], padding='max_length', truncation=False, return_tensors='pt')['input_ids'],
        'attention_mask': tokenizer(entry['postText'] + ' ' + entry['targetTitle'], padding='max_length', truncation=False, return_tensors='pt')['attention_mask']
    }
    for entry in X_train
]

# Tokenize and encode the validation data
X_val_encoded = [
    {
        'input_ids': tokenizer(entry['postText'] + ' ' + entry['targetTitle'], padding='max_length', truncation=False, return_tensors='pt')['input_ids'],
        'attention_mask': tokenizer(entry['postText'] + ' ' + entry['targetTitle'], padding='max_length', truncation=False, return_tensors='pt')['attention_mask']
    }
    for entry in X_val
]



tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
# Extract BERT features
bert_model.eval()
with torch.no_grad():
    train_features = [
        bert_model(input_ids=entry['input_ids'], attention_mask=entry['attention_mask']).last_hidden_state.mean(dim=1)
        for entry in X_train_encoded
    ]

    val_features = [
        bert_model(input_ids=entry['input_ids'], attention_mask=entry['attention_mask']).last_hidden_state.mean(dim=1)
        for entry in X_val_encoded
    ]

# Convert features to tensor
train_features_tensor = torch.stack(train_features)
val_features_tensor = torch.stack(val_features)

In [4]:
# Define LSTM model
class LSTMClassifier(nn.Module):
    def _init_(self, input_size, hidden_size, output_size, dropout_rate=0.5):
        super(LSTMClassifier, self)._init_()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]
        lstm_out = self.dropout(lstm_out)
        output = self.fc(lstm_out)
        return output
clip_value = 0.0001
# Define labels
y_train_tensor = torch.tensor([{'phrase': 0, 'passage': 1, 'multi': 2}[tag] for tag in y_train])
y_val_tensor = torch.tensor([{'phrase': 0, 'passage': 1, 'multi': 2}[tag] for tag in y_val])

# Initialize LSTM model
input_size = bert_model.config.hidden_size
hidden_size = 200
output_size = 3  # 3 classes: phrase, passage, multi
lstm_model = LSTMClassifier(input_size, hidden_size, output_size)

# Train the LSTM model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Convert features to tensor
train_features_tensor = torch.stack(train_features)
val_features_tensor = torch.stack(val_features)

# Train the LSTM model
num_epochs = 650
for epoch in range(num_epochs):
    lstm_model.train()
    optimizer.zero_grad()
    outputs = lstm_model(train_features_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(lstm_model.parameters(), max_norm=clip_value)
    optimizer.step()

    # Evaluate on the validation set
    lstm_model.eval()
    with torch.no_grad():
        val_outputs = lstm_model(val_features_tensor)
        _, predicted_labels = torch.max(val_outputs, 1)

        # Calculate and print accuracy and precision after each epoch
        accuracy = accuracy_score(y_val_tensor, predicted_labels.numpy())
        precision = precision_score(y_val_tensor, predicted_labels.numpy(), average='weighted')

        print(f'Epoch {epoch + 1}/{num_epochs}:')
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Precision: {precision:.4f}')

# Save the LSTM model
torch.save(lstm_model.state_dict(), 'lstm_model.pth')
print("Training complete.")

TypeError: ignored

TASK 2

In [None]:
mport torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json

# Load the data
with open('train.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file]

# Set up T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Function to generate spoilers using T5 model
def generate_spoiler(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    output = model.generate(input_ids, max_length=150, num_beams=2, length_penalty=0.8, early_stopping=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Generate spoilers and print the results
for entry in data:
    uuid = entry['uuid']
    clickbait_post = entry['postText'][0]
    linked_document = '\n'.join(entry['targetParagraphs'])
    prompt = f"Clickbait post: {clickbait_post}\nLinked document: {linked_document}"
    spoiler = generate_spoiler(prompt)

    output = {
        "uuid": uuid,
        "spoiler": spoiler
    }

    print(json.dumps(output))

alternate task 2

In [None]:
import openai
import json

# Set your OpenAI API key
openai.api_key = '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'

# Load the data
with open('train.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file]

# Function to generate spoilers using OpenAI GPT-3.5
def generate_spoiler(prompt):
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=150  # You can adjust this based on your requirements
    )
    return response.choices[0].text.strip()

# Generate spoilers and print the results
for entry in data:
    uuid = entry['uuid']
    clickbait_post = entry['postText'][0]
    linked_document = '\n'.join(entry['targetParagraphs'])
    spoiler = generate_spoiler(f"Clickbait post: {clickbait_post}\nLinked document: {linked_document}")

    output = {
        "uuid": uuid,
        "spoiler": spoiler
    }

    print(json.dumps(output))