# 作业五：实战NLP——讽刺检测


| 姓名  |     学号     |
|:---:|:----------:|
| 艾华喜 | 1120222907 |

### 要求：

<font color=Red>完成以下notebook，Sarcasm Detection数据集的下载代码已经给出，请同学们自行完成数据处理和训练过程。作业提交 jupyter notebook 文件。</font>

近年来，以社交媒体为媒介的电子新闻已成为信息消费的主要来源之一。许多媒体公司正在使用创造性的方法来增加帖子的浏览量。其中一种方法是使用讽刺标题作为用户点击的诱饵。

一个能够预测一篇新闻的标题是否具有讽刺意味的模型对于媒体公司来说很有用，可以方便他们通过一些策略分析季度收益。此外，从读者的角度来看，搜索引擎可以利用这些讽刺的信息，并根据读者的偏好，向他们推荐类似的文章。

## 数据集
用于讽刺检测的新闻标题数据集，该数据集来自两个新闻网站，theonion.com和huffingtonpost.com。以往的研究大多使用基于标签监督收集的Twitter数据集，但这些数据集在标签和语言方面存在噪声。此外，许多tweet是对其他tweet的回复，检测其中的讽刺需要上下文tweet的信息。这个新的数据集与现有的Twitter数据集相比有以下优点:
由于新闻标题是由专业人士以正式的方式编写的，所以没有拼写错误和非正式用法。这减少了稀疏性。
此外，由于TheOnion的唯一目的是发布讽刺的新闻，与Twitter数据集相比，标签的质量要更高，噪音小得多。与回复其他推文的推文不同，新闻标题是独立的。这将有助于我们梳理出真正的讽刺元素

## 下载和缓存数据集

In [1]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: wget
  Building wheel for wget (setup.py): started
  Building wheel for wget (setup.py): finished with status 'done'
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9657 sha256=f3333ddb6fc5567524467c4aee9caace920dc65dd3a2647945311682924e40ed
  Stored in directory: c:\users\oliver\appdata\local\pip\cache\wheels\ba\78\fb\e0c24a9e73d7483b073d15b7e05f43f3fc2ac75eff6899c7aa
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
import wget
import os

target_dir = '../data/'
target_filename = 'Sarcasm_News_Headline.json'

if not os.path.exists(target_dir):
    os.makedirs(target_dir)

url = 'https://huggingface.co/datasets/raquiba/Sarcasm_News_Headline/resolve/main/test.json?download=true'
downloaded_file_path = wget.download(url, target_dir)

new_file_path = os.path.join(target_dir, target_filename)
os.rename(downloaded_file_path, new_file_path)

## 读取并查看数据集

In [3]:
import json

data_raw = [json.loads(line) for 
        line in open(new_file_path, 'r')]

In [4]:
print(len(data_raw))
print(data_raw[0])

26709
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}


可以看到数据集一共包含了26709条新闻标题以及对应的标签。**这里先忽略数据集里的'article_link'属性**

数据集的 **'headline'** 给出的是新闻标题，而 **'is_sarcastic'** 给出的是该新闻标题是否是讽刺性的标签。

下面看一下数据集里所有 **'headline'** 的长度统计数据。

In [5]:
max_length, min_length = 0, 0x3f3f3f
sum_length = 0
length_distribute = [0] * 1000
for i in range(len(data_raw)):
    l = len(data_raw[i]['headline'])
    sum_length += l
    max_length = max(max_length, l)
    min_length = min(min_length, l)
    length_distribute[l] += 1
        
avg = sum_length / len(data_raw)
print(f'max length: {max_length} \nmin length: {min_length} \navg length: {avg}')

print(length_distribute[:max_length + 1])

max length: 254 
min length: 7 
avg length: 60.910591935302705
[0, 0, 0, 0, 0, 0, 0, 1, 4, 4, 2, 13, 10, 19, 38, 30, 33, 43, 52, 49, 63, 68, 74, 69, 91, 117, 93, 131, 152, 146, 164, 170, 202, 210, 216, 255, 228, 235, 250, 315, 331, 316, 371, 356, 364, 370, 420, 403, 435, 436, 475, 497, 488, 460, 490, 502, 553, 520, 543, 530, 550, 577, 645, 581, 600, 673, 575, 575, 532, 570, 532, 529, 457, 497, 431, 439, 423, 387, 353, 338, 312, 254, 292, 256, 250, 213, 210, 194, 185, 155, 146, 141, 122, 113, 106, 89, 91, 71, 67, 66, 67, 59, 50, 49, 43, 33, 39, 33, 21, 22, 29, 24, 23, 31, 12, 16, 17, 15, 8, 13, 7, 8, 7, 9, 9, 6, 3, 7, 5, 2, 2, 5, 1, 2, 1, 0, 1, 4, 1, 3, 0, 2, 1, 4, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

可以看到最长的标题到达了254个单词之多，而最短只有7个单词。平均长度为60。

# <font color=Red>数据处理（自行完成）<font>



请同学自己完成数据处理过程。
    
<font color=Red> 要求： <font> 
    
**26709条新闻的前20000个作为训练集，后6709条作为测试集，不设验证集。** 最后在测试集上测试自己模型的最终结果。

In [None]:
import json
import os
import torch
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import numpy as np
import warnings

# 忽略警告
warnings.filterwarnings('ignore')

# 目标目录和文件名
target_dir = '../data/'
target_filename = 'Sarcasm_News_Headline.json'

# 定义用于讽刺检测的Dataset类
class SarcasmDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length):
        self.sentences = sentences  # 保存句子
        self.labels = labels  # 保存标签
        self.tokenizer = tokenizer  # 保存分词器
        self.max_length = max_length  # 保存最大长度

    def __len__(self):
        return len(self.sentences)  # 返回数据集的长度

    def __getitem__(self, idx):
        # 获取句子和标签
        sentence = self.sentences[idx]
        label = self.labels[idx]

        # 编码句子
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),  # 返回输入ID
            'attention_mask': encoding['attention_mask'].flatten(),  # 返回注意力掩码
            'labels': torch.tensor(label, dtype=torch.long)  # 返回标签
        }

# 加载数据集
def load_dataset(file_path):
    # 读取文件并解析为JSON对象
    data_raw = [json.loads(line) for line in open(file_path, 'r')]
    # 提取句子和标签
    sentences = [item['headline'] for item in data_raw]
    labels = [item['is_sarcastic'] for item in data_raw]
    return sentences, labels

# 编码单个句子
def encode_sentence(sentence):
    inputs = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=254,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    return inputs['input_ids'], inputs['attention_mask']

# 处理数据，将句子编码为BERT嵌入向量
def process_data(sentences, labels, model):
    x = []  # 保存句子嵌入
    for i, sentence in enumerate(sentences):
        print(i)
        input_ids, attention_mask = encode_sentence(sentence)
        with torch.no_grad():  # 禁用梯度计算
            outputs = model(input_ids, attention_mask=attention_mask)
        # 获取句子的平均嵌入
        sentence_embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
        x.append(sentence_embedding)
    data = {
        'X': x,  # 保存句子嵌入
        'Y': labels  # 保存标签
    }
    np.save("train_data.npy", data)  # 保存为.npy文件

# 预训练BERT模型名称
model_name = '../bert-base-uncased'

# 加载训练数据集
train_sentences, train_labels = load_dataset(os.path.join(target_dir, target_filename))

# 加载BERT分词器
tokenizer = BertTokenizer.from_pretrained(model_name)

# 创建数据集和数据加载器
train_dataset = SarcasmDataset(train_sentences, train_labels, tokenizer, max_length=254)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# 加载BERT模型
model = BertModel.from_pretrained(model_name)

# 处理数据，生成句子嵌入
process_data(train_sentences, train_labels, model)


# <font color=Red>训练（自行完成）<font>

请同学自己完成从**定义网络、定义损失函数、定义优化器到进行训练等一系列深度学习流水线**。

In [2]:
import random
import torch.nn.functional as F
import torch
from torch import nn
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def date_iter(batch_size,X,Y):
    num_examples = len(X)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(
            indices[i: min(i + batch_size, num_examples)])
        yield X[batch_indices], Y[batch_indices]


class LogisticRegression(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LogisticRegression, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return torch.sigmoid(x)

def train(num_epochs):
    loaded_data = np.load('train_data.npy', allow_pickle=True)
    trainX = loaded_data.item().get('X')
    trainY = loaded_data.item().get('Y')
    train_X = torch.stack(trainX)[:20000]
    train_Y = torch.tensor(trainY)[:20000]

    for epoch in range(num_epochs):
        batch_count = 0  # 初始化batch计数器

        total_loss = 0
        for x, y in date_iter(batch_size, train_X, train_Y):
            x,y=x.float(),y.float()
            x,y = x.to(device), y.to(device)
            y=y.unsqueeze(1)
            batch_count += 1  # 每次迭代时增加计数
            updater.zero_grad()
            y_hat = logistic(x)
            l = loss(y_hat, y)
            total_loss += l.item()
            l.backward()
            updater.step()
        print(f'epoch: {epoch}, loss: {total_loss / batch_count}' )
    torch.save(logistic.state_dict(), 'model.pth')  # 保存模型的参数


input_size = 768  # 输入向量的长度
hidden_size = 256 # 隐藏层的大小
output_size = 1  # 输出大小，二分类问题只需输出一个值
logistic = LogisticRegression(input_size, hidden_size, output_size)
batch_size = 256
num_epochs = 40
lr = 0.01
loss = nn.BCELoss()

updater = torch.optim.Adam(logistic.parameters(), lr=lr)
device='cuda' if torch.cuda.is_available() else 'cpu'
logistic.to(device)


train(num_epochs)



epoch: 0, loss: 0.4133866063401669
epoch: 1, loss: 0.3038702858022497
epoch: 2, loss: 0.27173037317734733
epoch: 3, loss: 0.2500445742773104
epoch: 4, loss: 0.233459790673437
epoch: 5, loss: 0.2215737034625645
epoch: 6, loss: 0.20715269484097446
epoch: 7, loss: 0.19154358890992176
epoch: 8, loss: 0.17954917195477063
epoch: 9, loss: 0.17692906334053113
epoch: 10, loss: 0.15234877291736723
epoch: 11, loss: 0.1409556182879436
epoch: 12, loss: 0.131787016610556
epoch: 13, loss: 0.11256198962278004
epoch: 14, loss: 0.09802745565583434
epoch: 15, loss: 0.08877636001834387
epoch: 16, loss: 0.08167257617357411
epoch: 17, loss: 0.06800768175457098
epoch: 18, loss: 0.06177805189656306
epoch: 19, loss: 0.0658184605403037
epoch: 20, loss: 0.049643897581138186
epoch: 21, loss: 0.03725508263311054
epoch: 22, loss: 0.03403293209362634
epoch: 23, loss: 0.028151061397658873
epoch: 24, loss: 0.023568490803053108
epoch: 25, loss: 0.02060372084143418
epoch: 26, loss: 0.025778521406405335
epoch: 27, loss: 

# <font color=Red>测试<font>


In [3]:
import random
import torch.nn.functional as F
import torch
from torch import nn
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def date_iter(batch_size,X,Y):
    num_examples = len(X)
    indices = list(range(num_examples))
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(
            indices[i: min(i + batch_size, num_examples)])
        yield X[batch_indices], Y[batch_indices]

class Accumulator:
    """用于累加 `n` 个变量的累加器。"""
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
class LogisticRegression(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LogisticRegression, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return torch.sigmoid(x)

def test():
    loaded_data = np.load('train_data.npy', allow_pickle=True)
    trainX = loaded_data.item().get('X')
    trainY = loaded_data.item().get('Y')
    train_X = torch.stack(trainX)[20000:]
    train_Y = torch.tensor(trainY)[20000:]
    metric = Accumulator(2)


    for x, y in date_iter(batch_size, train_X, train_Y):

            x,y=x.float(),y.float()
            x,y = x.to(device), y.to(device)
            y=y.unsqueeze(1)
            y_hat = logistic(x)
            y_hat = (y_hat > 0.5).float()
            same_elements = torch.sum(y == y_hat).item()
            metric.add(same_elements, y.size(0))

    print(f'accuracy{metric[0] / metric[1]:.10f}' )


input_size = 768  # 输入向量的长度
hidden_size = 256 # 隐藏层的大小
output_size = 1  # 输出大小，二分类问题只需输出一个值
logistic = LogisticRegression(input_size, hidden_size, output_size)
logistic.load_state_dict(torch.load('model.pth'))
batch_size = 256
num_epochs = 40
lr = 0.01
loss = nn.BCELoss()

device='cuda' if torch.cuda.is_available() else 'cpu'
logistic.to(device)


test()



accuracy0.8806081383


## 提交方式
<font color=Red>包含训练结果的Jupyter notebook文件请命名为 `work2_<组长姓名>_<组长学号>.ipynb` 发送到邮箱 archie98@qq.com</font>