In [1]:
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, text_segmentate, ListDataset
from bert4torch.snippets import seed_everything, Callback, get_pool_emb
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# 配置

In [2]:
# 固定seed
seed_everything(42)
maxlen = 30
batch_size = 16
pretrained_dir = './'
config_path = pretrained_dir+'bert_config.json'
checkpoint_path = pretrained_dir+'pytorch_model.bin'
dict_path = pretrained_dir+'vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

[0;32m[INFO][0m Global seed set to 42


# 数据处理

In [3]:
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# 加载数据集
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """加载数据，并尽量划分为不超过maxlen的句子
        """
        D = []
        seps, strips = u'\n。！？!?；;，, ', u'；;，, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D
    
def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])

    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()

# 双阶段训练

In [4]:
# 定义bert上的模型结构
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, 
                                            checkpoint_path=checkpoint_path, 
                                            with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 6)
        
    def forward(self, token_ids, segment_ids):
        _, pooled_output = self.bert([token_ids, segment_ids])
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output
model = Model().to(device)

# 定义使用的loss和optimizer，这里支持自定义
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)

class Evaluator(Callback):
    """评估与保存
    """
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        if val_acc >= self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('./best_model_1.pt')
        print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # 定义评价函数
    def evaluate(self, data):
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        acc = right / total
        return acc

def inference(texts):
    '''单条样本推理
    '''
    ans = []
    for text in texts:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)[None, :]
        segment_ids = torch.tensor(segment_ids, dtype=torch.long, device=device)[None, :]

        logit = model.predict([token_ids, segment_ids])
        y_pred = torch.argmax(torch.softmax(logit, dim=-1)).cpu().numpy()
        ans.append(y_pred)
    return ans

def predict(file_path,output_path):
    f = open(file_path, 'r', encoding='utf-8')
    test_datas = f.readlines()
    test_datas = [data.split('\t')[0] for data in test_datas]
    results = inference(test_datas)
    f.close()

    fw = open(output_path, 'w', encoding='utf-8')
    for i in range(len(test_datas)):
        fw.write(f"{test_datas[i]}\t{results[i]}\n")
    fw.close()

## 第一阶段的训练
- 做7折交叉验证

In [5]:
state = [1, 42, 100, 142, 500, 1200, 2023]
for i in range(7):
    src_path = "../data/train1.txt"
    df1 = pd.read_table(src_path, sep="\t", header=None)
    shuffled_df1 = df1.sample(frac=1, random_state=state[i])
    shuffled_df1.reset_index(drop=True, inplace=True)
    
    train_df = shuffled_df1[:6500]
    dev_df = shuffled_df1[6500:]
    dev_df.reset_index(drop=True, inplace=True)
    
    train_df.to_csv("../data/one_times_data/train/" + str(i) + ".txt", sep="\t",
                    index=False, header = None)
    dev_df.to_csv("../data/one_times_data/dev/" + str(i) + ".txt", sep="\t",
                    index=False, header = None)

In [None]:
for i in range(7):
    train_dataloader = DataLoader(MyDataset(['../data/one_times_data/train/' + str(i) + ".txt"]), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    valid_dataloader = DataLoader(MyDataset(['../data/one_times_data/dev/' + str(i) + ".txt"]), batch_size=batch_size, collate_fn=collate_fn)
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
    predict('../data/pred.txt', '../output/one_times/' + str(i) + ".txt")

2023-10-09 08:41:37 - Start Training

2023-10-09 08:41:37 - Epoch: 1/10
val_acc: 0.85600, best_val_acc: 0.85600


2023-10-09 08:42:44 - Epoch: 2/10
val_acc: 0.86600, best_val_acc: 0.86600


2023-10-09 08:43:49 - Epoch: 3/10
val_acc: 0.86600, best_val_acc: 0.86600


2023-10-09 08:44:52 - Epoch: 4/10
 19/407 [>.............................] - ETA: 52s - loss: 0.1906 

## 第二阶段的训练样本

### 根据第一阶段的预测结果 单独提取预测为0的组成新的预测集

In [4]:
sub_path = "../one_times/"
net_data_index = []
for i in range(7):
    df = pd.read_csv(sub_path + str(i) + ".txt", sep="\t", header= None, )
    data_index = []
    net_data = []
    for j in range(len(df)):
        net_data.append(df[0][j])
        data_index.append(df[1][j])
    net_data_index.append(data_index)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 2, saw 2


In [48]:
# 对每个位置的数字进行投票
votes = []
for i in range(len(data_index)):
    current_votes = [lst[i] for lst in net_data_index]
    counter = Counter(current_votes)
    winning_number = counter.most_common(1)[0][0]  # 默认是如果全部不相同取第一个
    votes.append(winning_number)
# 输出最终的投票结果
print("最终投票结果：", votes)

最终投票结果： [0, 3, 3, 2, 3, 0, 3, 1, 3, 3, 1, 2, 0, 0, 2, 2, 2, 0, 1, 2, 0, 3, 3, 3, 3, 2, 3, 1, 3, 0, 1, 3, 0, 2, 3, 0, 3, 3, 1, 1, 0, 3, 0, 0, 2, 1, 2, 3, 0, 2, 2, 0, 3, 0, 0, 1, 2, 1, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 2, 2, 3, 1, 3, 2, 1, 0, 2, 1, 0, 3, 2, 0, 2, 1, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 3, 1, 0, 2, 2, 2, 2, 0, 3, 3, 0, 0, 1, 1, 3, 1, 1, 1, 0, 1, 1, 0, 2, 0, 0, 1, 1, 3, 2, 0, 1, 3, 2, 0, 0, 1, 3, 3, 3, 2, 3, 1, 1, 2, 3, 0, 0, 1, 1, 2, 0, 0, 1, 2, 0, 2, 0, 2, 1, 0, 0, 0, 3, 2, 2, 3, 0, 0, 2, 3, 3, 1, 0, 0, 0, 2, 2, 3, 2, 3, 1, 1, 0, 1, 3, 3, 1, 0, 3, 1, 2, 3, 0, 1, 0, 0, 0, 2, 0, 3, 3, 3, 0, 3, 3, 2, 0, 2, 1, 1, 0, 3, 1, 0, 0, 3, 3, 3, 0, 1, 3, 2, 1, 1, 2, 3, 1, 2, 0, 2, 0, 2, 0, 0, 0, 1, 0, 1, 3, 0, 1, 1, 1, 1, 2, 3, 2, 1, 3, 0, 2, 3, 3, 3, 0, 1, 1, 0, 3, 1, 3, 3, 0, 0, 1, 3, 3, 0, 1, 2, 0, 3, 0, 1, 0, 2, 2, 1, 0, 3, 0, 3, 2, 3, 1, 1, 2, 0, 1, 1, 3, 0, 0, 2, 3, 2, 2, 0, 2, 3, 3, 0, 2, 0, 1, 0, 3, 1, 1, 0, 1, 0, 2, 1, 3, 2, 2, 1, 2, 0, 3, 1, 1, 1, 2, 3, 2, 0, 0, 3, 0, 0, 1, 2, 2, 3, 3

In [50]:
one_time = [net_data, votes]
pd.DataFrame(one_time).T.to_csv("submit1.txt" , sep="\t", index=False, header = None)

In [51]:
df = pd.read_table("submit1.txt", sep="\t", header= None)
print(pd.DataFrame(df[1]).value_counts())
data = []
for i in range(len(df)):
    if df[1][i] == 0:
        data.append(df.iloc[i])
print(pd.DataFrame(data)[1].value_counts())
pd.DataFrame(data).to_csv("../data/pred2.txt", sep="\t", index=False, header = None)

1
0    277
1    260
3    251
2    212
Name: count, dtype: int64
1
0    277
Name: count, dtype: int64


### 第二阶段训练
- 交叉验证

In [None]:
class Evaluator(Callback):
    """评估与保存
    """
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        if val_acc >= self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('./best_model_2.pt')
        print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # 定义评价函数
    def evaluate(self, data):
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        acc = right / total
        return acc

In [56]:
state = [1, 42, 100, 142, 500, 1200, 2023]
for i in range(7):
    src_path = "../data/train2.txt"
    df1 = pd.read_table(src_path, sep="\t", header=None)
    shuffled_df1 = df1.sample(frac=1, random_state=state[i])
    shuffled_df1.reset_index(drop=True, inplace=True)
    
    train_df = shuffled_df1[:1700]
    dev_df = shuffled_df1[1700:]
    dev_df.reset_index(drop=True, inplace=True)
    
    train_df.to_csv("../data/two_times_data/train/" + str(i) + ".txt", sep="\t",
                    index=False, header = None)
    dev_df.to_csv("../data/two_times_data/dev/" + str(i) + ".txt", sep="\t",
                    index=False, header = None)

In [None]:
batch_size = 8
for i in range(7):
    train_dataloader = DataLoader(MyDataset(['../data/two_times_data/train/' + str(i) + ".txt"]), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    valid_dataloader = DataLoader(MyDataset(['../data/two_times_data/dev/' + str(i) + ".txt"]), batch_size=batch_size, collate_fn=collate_fn)
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
    predict('../data/pred2.txt', '../output/two_times/' + str(i) + ".txt")

2023-10-09 08:38:06 - Start Training

2023-10-09 08:38:06 - Epoch: 1/10
val_acc: 0.85135, best_val_acc: 0.85135


2023-10-09 08:38:43 - Epoch: 2/10
val_acc: 0.94595, best_val_acc: 0.94595


2023-10-09 08:39:21 - Epoch: 3/10
val_acc: 0.93919, best_val_acc: 0.94595


2023-10-09 08:39:49 - Epoch: 4/10
val_acc: 0.92568, best_val_acc: 0.94595


2023-10-09 08:40:16 - Epoch: 5/10
val_acc: 0.92568, best_val_acc: 0.94595


2023-10-09 08:40:44 - Epoch: 6/10
 31/213 [===>..........................] - ETA: 23s - loss: 0.0072 

### 根据预测结果投票

In [None]:
sub_path = "../output/two_times/"
net_data_index = []
for i in range(7):
    df = pd.read_csv(sub_path + str(i) + ".txt", sep="\t", header= None)
    data_index = []
    net_data = []
    for j in range(len(df)):
        net_data.append(df[0][j])
        data_index.append(df[1][j])
    net_data_index.append(data_index)

In [None]:
# 对每个位置的数字进行投票
votes = []
for i in range(len(data_index)):
    current_votes = [lst[i] for lst in net_data_index]
    counter = Counter(current_votes)
    winning_number = counter.most_common(1)[0][0]
    votes.append(winning_number)
# 输出最终的投票结果
print("最终投票结果：", votes)

In [None]:
two_time = [net_data, votes]
pd.DataFrame(two_time).T.to_csv("submit2.txt" , sep="\t", index=False, header = None)

## 拼接结果

In [27]:
df1 = pd.read_table("submit1.txt", sep="\t", header= None)
print(pd.DataFrame(df1[1]).value_counts())

1
1    277
3    276
0    266
2    181
Name: count, dtype: int64


In [28]:
df2 = pd.read_table("submit2.txt", sep='\t', header=None)

In [29]:
num = 0
for i in range(len(df1)):
    if df1[1][i] == 0:
        print(i, num)
        df1[1][i] = df2[1][num]
        num += 1

0 0
11 1
12 2
13 3
14 4
16 5
29 6
32 7
35 8
42 9
43 10
48 11
51 12
53 13
54 14
59 15
60 16
61 17
62 18
64 19
67 20
73 21
75 22
78 23
82 24
90 25
92 26
93 27
96 28
97 29
101 30
104 31
105 32
112 33
115 34
118 35
123 36
128 37
139 38
140 39
143 40
144 41
145 42
148 43
150 44
151 45
153 46
154 47
155 48
161 49
166 50
168 51
170 52
175 53
176 54
186 55
188 56
189 57
190 58
196 59
200 60
204 61
207 62
208 63
212 64
222 65
224 66
226 67
227 68
228 69
230 70
243 71
251 72
256 73
257 74
261 75
264 76
266 77
268 78
272 79
274 80
281 81
285 82
286 83
287 84
291 85
295 86
297 87
299 88
303 89
305 90
306 91
313 92
318 93
321 94
322 95
324 96
334 97
335 98
336 99
340 100
346 101
348 102
349 103
353 104
354 105
356 106
369 107
382 108
383 109
384 110
385 111
386 112
387 113
393 114
400 115
402 116
411 117
412 118
413 119
414 120
418 121
426 122
428 123
434 124
446 125
450 126
451 127
453 128
459 129
460 130
462 131
463 132
465 133
466 134
469 135
472 136
476 137
483 138
486 139
487 140
489 141
492 1

In [30]:
pd.DataFrame(df).to_csv("../output/submit.txt", sep="\t", index=False, header = None)

In [7]:
df1 = pd.read_table("../output/submit.txt", sep="\t", header= None)

In [9]:
pd.DataFrame(df1[1]).value_counts()

1
1    275
3    267
2    210
4     98
0     87
5     63
Name: count, dtype: int64