In [24]:
#读入数据
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from matplotlib import pyplot as plt

# Training split: read the CSV, then replace missing tweet bodies with ''.
trainData = pd.read_csv("./nlp-getting-started-data/train.csv")
trainData = trainData.assign(text=lambda frame: frame['text'].fillna(''))

# Slice columns 1..4 out of the raw value matrix by position
# (named here keyword / location / text / target).
trainKeyword, trainLocation, trainText, trainTarget = (
    trainData.values[:, col] for col in (1, 2, 3, 4)
)

# Test split: same treatment, but only columns 1..3 are extracted.
testData = pd.read_csv("./nlp-getting-started-data/test.csv")
testData = testData.assign(text=lambda frame: frame['text'].fillna(''))

testKeyword, testLocation, testText = (
    testData.values[:, col] for col in (1, 2, 3)
)

In [25]:
# Disabled exploratory plot, kept as a bare string literal so none of it runs;
# the notebook merely echoes the string back as this cell's output.
# NOTE(review): if re-enabled, it would add a temporary `targetMean` column
# (per-keyword mean of `target`), draw a seaborn count plot of `keyword` split
# by `target` in descending `targetMean` order, and then drop the column again.
# Consider deleting the cell or turning it into real code.
'''
import seaborn as sns #一个Matplotlib高级数据可视化库，画统计图表

trainData['targetMean']=trainData.groupby('keyword')['target'].transform('mean')
fig = plt.figure(figsize=(8,27))
sns.countplot(y=trainData.sort_values(by='targetMean',ascending=False)['keyword'], hue=trainData.sort_values(by='targetMean',ascending=False)['target']) #按照targetMean排序，默认升序，ascending=False改为降序排序

plt.legend(loc=1)

plt.show()

trainData.drop(columns=['targetMean'], inplace=True) #inplace=True的时候在原来的数据上发生改变，默认为inplace=False不改变原数据
'''

"\nimport seaborn as sns #一个Matplotlib高级数据可视化库，画统计图表\n\ntrainData['targetMean']=trainData.groupby('keyword')['target'].transform('mean')\nfig = plt.figure(figsize=(8,27))\nsns.countplot(y=trainData.sort_values(by='targetMean',ascending=False)['keyword'], hue=trainData.sort_values(by='targetMean',ascending=False)['target']) #按照targetMean排序，默认升序，ascending=False改为降序排序\n\nplt.legend(loc=1)\n\nplt.show()\n\ntrainData.drop(columns=['targetMean'], inplace=True) #inplace=True的时候在原来的数据上发生改变，默认为inplace=False不改变原数据\n"

In [26]:
# Tokenizer matching the `bert-base-uncased` checkpoint.
from transformers import BertTokenizer

# cache_dir is the folder where the downloaded tokenizer files are stored.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='./myModels')


In [27]:
from torch.utils.data import Dataset,DataLoader
class TweetDataset(Dataset):
    """Dataset that tokenizes one text per item into fixed-length model inputs.

    Args:
        texts: indexable sequence of raw texts; each item is passed through
            ``str`` before tokenization.
        labels: optional indexable sequence of integer class ids aligned with
            ``texts``. ``None`` means the data is unlabeled (test set).
        max_len: exact length of every returned ``input_ids`` and
            ``attention_mask`` tensor.
        tokenizer: optional tokenizer object exposing ``encode_plus`` and
            ``pad_token_id``. When omitted, the module-level ``tokenizer`` is
            looked up at item-access time, which is how the class behaved
            before this parameter existed.
    """

    def __init__(self, texts, labels=None, max_len=256, tokenizer=None):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        # Kept as given; None defers to the module-level tokenizer in __getitem__.
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    @staticmethod
    def _fit_to_length(input_ids, attention_mask, max_len, tok):
        """Truncate or right-pad both 1-D tensors to exactly ``max_len`` entries."""
        if len(input_ids) > max_len:
            return input_ids[:max_len], attention_mask[:max_len]
        if len(input_ids) < max_len:
            pad_length = max_len - len(input_ids)
            # ids are padded with the tokenizer's pad id, the mask with zeros.
            input_ids = torch.cat([
                input_ids,
                torch.full((pad_length,), tok.pad_token_id, dtype=torch.long),
            ])
            attention_mask = torch.cat([
                attention_mask,
                torch.zeros(pad_length, dtype=torch.long),
            ])
        return input_ids, attention_mask

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors of
            length ``max_len``) and, when labels were supplied, ``labels``
            (a scalar ``torch.long`` tensor).

        Raises:
            AssertionError: if either tensor does not end up ``max_len`` long.
        """
        # Prefer the injected tokenizer; otherwise fall back to the notebook global.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = tok.encode_plus(
            str(self.texts[idx]),
            add_special_tokens=True,      # request the special tokens around the text
            max_length=self.max_len,      # maximum sequence length
            padding='max_length',         # pad up to max_length
            truncation=True,              # cut sequences longer than max_length
            return_attention_mask=True,
            return_tensors='pt',          # PyTorch tensors
        )
        # flatten() collapses the returned tensors to one dimension.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # Safety net in case the tokenizer output is not already max_len long.
        input_ids, attention_mask = self._fit_to_length(
            input_ids, attention_mask, self.max_len, tok
        )

        assert len(input_ids) == self.max_len, f"input_ids length {len(input_ids)} != {self.max_len}"
        assert len(attention_mask) == self.max_len, f"attention_mask length {len(attention_mask)} != {self.max_len}"

        item = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }
        if self.labels is not None:  # labelled data: attach the target
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item
        

In [28]:
# Build the train / validation datasets and their DataLoaders.
from sklearn.model_selection import train_test_split

BATCH_SIZE = 16

# test_size=0.98 holds out almost everything as the "test" (validation) part;
# per the original note this is a quick trial because training is slow.
trainDataList, testDataList, trainClassList, testClassList = train_test_split(
    trainData['text'],
    trainData['target'],
    test_size=0.98,
    random_state=233,
)

# The split pieces are pandas Series that keep their original, possibly
# non-contiguous index, so e.g. trainClassList[8] fails when row 8 landed in
# the other part. .tolist() turns each Series into a plain Python list, which
# gives the positional indexing TweetDataset uses.
trainDataset = TweetDataset(trainDataList.tolist(), trainClassList.tolist())
testDataset = TweetDataset(testDataList.tolist(), testClassList.tolist())

trainLoader = DataLoader(dataset=trainDataset, batch_size=BATCH_SIZE, shuffle=True)
testLoader = DataLoader(dataset=testDataset, batch_size=BATCH_SIZE)


In [None]:
# Load the sequence-classification variant of BERT.
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',         # base BERT checkpoint
    num_labels=2,                # binary classification
    output_attentions=False,     # do not return attention weight matrices
    output_hidden_states=False,  # do not return every layer's hidden states
    cache_dir='./myModels',      # folder where downloaded weights are stored
)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'当前device为{device!s}')
model = model.to(device)

In [None]:
# Fine-tuning loop.
#
# The optimizer is torch.optim.AdamW rather than the AdamW class that used to
# be imported from `transformers`: that class was deprecated by the library in
# favor of the PyTorch optimizer, so importing it breaks on newer releases.
# NOTE(review): eps=1e-6 and weight_decay=0.0 are meant to mirror the defaults
# of the deprecated class so the hyperparameters stay as before — confirm
# against the transformers version this notebook was written for.
epochs = 3  # number of passes over trainLoader
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)

for epoch in range(epochs):
    model.train()  # training mode
    for i, batch in enumerate(trainLoader):
        print('epoch is ' + str(epoch) + ' i is ' + str(i))
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients left over from the previous step before this one starts.
        optimizer.zero_grad()

        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs[0]  # the loss is read from the first element of the model output
        loss.backward()    # gradients of the loss w.r.t. the model parameters
        optimizer.step()   # parameter update

epoch is 0i is 0
epoch is 0i is 1
epoch is 0i is 2
epoch is 0i is 3


KeyboardInterrupt: 

In [None]:
model.eval()  # switch the model to evaluation (prediction) mode