In [14]:
import numpy as np
import pandas as pd

import torch
from torch import nn,optim
from torch.utils import data
from torch.cuda import amp
from torchvision import transforms

from sklearn.model_selection import StratifiedGroupKFold

from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary

from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer

import os
import re
import gc #垃圾回收

from tqdm import tqdm #进度条 for data in tqdm(range(100))

# 数据

In [5]:
data_dir = "E:/DATA/feedback-prize-effectiveness/"

In [10]:
df1 = pd.read_csv(data_dir+"train.csv")
print(df1.head(10))
print(df1.iloc[0]['discourse_text'])
#essay_id   txt文件名
#discourse_id   段落的id，应该没用

   discourse_id      essay_id  \
0  0013cc385424  007ACE74B050   
1  9704a709b505  007ACE74B050   
2  c22adee811b6  007ACE74B050   
3  a10d361e54e4  007ACE74B050   
4  db3e453ec4e2  007ACE74B050   
5  36a565e45db7  007ACE74B050   
6  fb65fe816ba3  007ACE74B050   
7  4e472e2584fa  007ACE74B050   
8  28a94d3ee425  007ACE74B050   
9  d226f06362f5  00944C693682   

                                      discourse_text        discourse_type  \
0  Hi, i'm Isaac, i'm going to be writing about h...                  Lead   
1  On my perspective, I think that the face is a ...              Position   
2  I think that the face is a natural landform be...                 Claim   
3  If life was on Mars, we would know by now. The...              Evidence   
4  People thought that the face was formed by ali...          Counterclaim   
5  though some say that life on Mars does exist, ...              Rebuttal   
6  It says in paragraph 7, on April 5, 1998, Mars...              Evidence   
7  Everyone 

In [9]:
df2 = pd.read_csv(data_dir+"test.csv")
print(df2.head())
print(df2.iloc[0]['discourse_text'])

   discourse_id      essay_id  \
0  a261b6e14276  D72CB1C11673   
1  5a88900e7dc1  D72CB1C11673   
2  9790d835736b  D72CB1C11673   
3  75ce6d68b67b  D72CB1C11673   
4  93578d946723  D72CB1C11673   

                                      discourse_text discourse_type  
0  Making choices in life can be very difficult. ...           Lead  
1  Seeking multiple opinions can help a person ma...       Position  
2                     it can decrease stress levels           Claim  
3             a great chance to learn something new           Claim  
4               can be very helpful and beneficial.           Claim  
Making choices in life can be very difficult. People often ask for advice when they can not decide on one thing. It's always good to ask others for their advice when making a choice. When you have multiple opinions you have the ability to make the best choice for yourself. 


In [12]:
with open(data_dir+"/train/000E6DE9E817.txt") as f:
    print(f.read())
print("-------")
with open(data_dir+"/train/00B144412785.txt") as f:
    print(f.read())

Dear: Principal

I am arguing against the policy change because even though there are some children out there that really needs help with their academic work, that does not mean that only because they have a c average that would not let them enjoy their sports or other activities unless they've a B average.

Sometimes teachers or even principal needs to consider that we should give the help that any student should have. Also this may consider student self as steam. Meaning student would start to feel sad nervous, and not wanting to go to school because of the reason they have a low averages and they can not participate in other activities or sports. The fact that there are children that would want to enjoy many good things the school is actually giving it to them.

We would want to make changes as, "like to be a better person for a better tomorrow" This supports the idea of having have many good thoughts and incasing your work as much as possible. In some situation like arguing we shou

# RNN

## 数据基本处理
1. 一般语言处理中对全部数据只会取常用的n个词，在此之外的词是不认识的，即先有一本字典
2. 用one-hot编码时，每个词都表示为长n的向量，其中只有一个值是1，其余全是0。比如字典的第i（0开始）个单词其编码中1值的下标是i

## embedding
1. one_hot编码稀疏，所以考虑用稠密向量表示词，比如只用长为m<n的向量表示n个单词，其中单词间关系可以体现在向量间关系中，比如$\vec{男}+\vec{国王}=\vec{皇帝}$
2. 可由embedding层实现这个，embedding记录了一个(n,m)的矩阵，每行都是一个单词的稠密向量，作用是one-hot编码的向量按照其1值的下标i访问这个矩阵第i行，取出这行向量作为新输入
3. 具体使用看下面代码说明
4. embedding可由自行训练出，也可预加载预训练参数。使用预训练参数时，冻结此层

## 初步结果
将batch_size\*len_sentences\*n变为batch_size\*len_sentences\*m

## rnn具体流程
1. 首先初始化hadden_input为全0
2. 对每个词，其都会和当前的hadden_input一起进入网络(cat或add)进行一步（liner、tanh激活）运算，所得的输出作为新的hadden_input与下一个词的向量一起进入网络（cat或add）
3. cat（最后一维）的话，需要截断，或者另外卷积一次获得新hadden_input
4. pytorch的rnn为x[i]通过一liner，hidden通过一liner，两个结果相加经Tanh激活，结果作为x[i]和新hidden，具体见下面代码

In [3]:
# rnn内部细节
batch_size = 4
sentence_len = 20 #一句话20个词
words_num = 100 #字典记录了100个词
words_len = 8 #字典的每个词向量长8
X = torch.randint(0,words_num,[batch_size,sentence_len]) #注意输入不为one-hot，只是每个词的字典序号，比如[3,2,10]表示一句话。int
y = torch.as_tensor([[0.,1],[0,1],[1,0],[1,0]])
#print(X)

hidden_layer_num = 1 #多少个hidden用于循环，即多少个循环部分
class MyRNN(nn.Module):
    def __init__(self):
        super().__init__()

        self.hidden_size = 128
        self.embedding_layer = nn.Embedding(words_num,words_len,padding_idx=0) #100*8,padding_idx为输入长度不够时填充的字典词序号

        #一个循环节
        self.hidden_layer_x = nn.Linear(words_len,self.hidden_size) #rnn关键部分
        self.hidden_layer_h = nn.Linear(self.hidden_size,self.hidden_size) #给h用

        #分类器
        self.out_layer = nn.Linear(self.hidden_size,2)
        self.activation_layer = nn.Softmax(dim=-1) #dim=0表示a[i][j][k]按i方向的几个数一起算

    def __init_hidden(self):
        return torch.zeros([hidden_layer_num,batch_size,self.hidden_size])

    def forward(self,x):
        self.hidden = self.__init_hidden()

        print(f"before embedding:{x.shape}") #[4, 20]
        x = self.embedding_layer(x)
        print(f"after embedding:{x.shape}") #[4, 20, 8]
        print("-------------")

        out = torch.zeros([sentence_len, batch_size, self.hidden_size])

        #rnn部分
        # 为了更好计算，将数据x变形为为len_sencentces*batch_size*words_num
        # 即x[0]为各句子首单词
        x = x.transpose(0,1)
        print(f"rnn input:{x.shape}") #[20, 4, 128]
        for i in range(x.shape[0]):
            a1 = self.hidden_layer_x(x[i])
            a2 = self.hidden_layer_h(self.hidden[0])

            out[i] = self.hidden[0] = nn.Tanh()(a1+a2)
        print(f"rnn out:{out.shape}")
        print(f"rnn outh:{self.hidden[0].shape}")
        print("-----------")
        
        #分类器
        o = self.hidden[0]
        o = self.out_layer(o)
        o = self.activation_layer(o)
        print(f"finally shape:{o.shape}") #[4, 2]
        print(o)

        return o

rnn = MyRNN()
out = rnn(X)

before embedding:torch.Size([4, 20])
after embedding:torch.Size([4, 20, 8])
-------------
rnn input:torch.Size([20, 4, 8])
rnn out:torch.Size([20, 4, 128])
rnn outh:torch.Size([4, 128])
-----------
finally shape:torch.Size([4, 2])
tensor([[0.5058, 0.4942],
        [0.5616, 0.4384],
        [0.4322, 0.5678],
        [0.4265, 0.5735]], grad_fn=<SoftmaxBackward0>)


## pytorch的rnn
- 首先就是输入是batch_size\*len_sencentces\*words_num  
所以Embedding算是预处理部分，如果需要训练则？？？？？？？？
- 主要公式$$h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})$$

In [9]:
batch_size = 2
sentence_len = 5
words_num = 10
words_len = 8

X = torch.randint(0,words_num,[batch_size,sentence_len])

X = nn.Embedding(words_num,words_len)(X)
#X = X.transpose(0,1) #batch_size放在第二维，则batch_first设置为False
print(X.shape)

hidden_size=32
num_layers=1
H = torch.zeros([num_layers,batch_size,hidden_size]) #可以不设置，则默认为0，这是单向rnn
#HH = torch.zeros([num_layers*2,batch_size,hidden_size]) #双向rnn，需要RNN中设置bidirectional=True

rnn = nn.RNN(
            input_size=words_len,

            #hidden_size，num_layers 都是对网络的设置，与输入数据无关，设置相对自由
            hidden_size=hidden_size,     # hidden层大小
            num_layers=num_layers,       # n个rnn层

            batch_first=True, #True则输入输出的batch在第一维，否则在第二维（参照上面MyRNN在hidden前的变形）

            bidirectional=False, #是否双向rnn
        )

out,outh = rnn(X,H) #out为h的集合
print(out.shape,outh.shape)

torch.Size([2, 5, 8])
torch.Size([2, 5, 32]) torch.Size([1, 2, 32])


In [10]:
for i in rnn.named_parameters():
    print(i[0],i[1].shape,type(i[1]))

weight_ih_l0 torch.Size([32, 8]) <class 'torch.nn.parameter.Parameter'>
weight_hh_l0 torch.Size([32, 32]) <class 'torch.nn.parameter.Parameter'>
bias_ih_l0 torch.Size([32]) <class 'torch.nn.parameter.Parameter'>
bias_hh_l0 torch.Size([32]) <class 'torch.nn.parameter.Parameter'>


## 验证

In [25]:
torch.manual_seed(1024)

batch_size = 2
sentence_len = 5
words_num = 10
words_len = 4

hidden_size=8
num_layers=1

X = torch.randint(0,words_num,(batch_size,sentence_len))
em = nn.Embedding(words_num,words_len)
X = em(X)
print(X.shape)

print('pytorch部分')
rnn = nn.RNN(
            input_size=words_len,

            hidden_size=hidden_size,
            num_layers=num_layers,

            batch_first=True,
        )
out,outh = rnn(X)
print(out.shape,outh.shape)
print(outh[0,0,:],'\n',out[0,0,:])
print('-------------------')


print('MyCNN部分')
class MyRNN(nn.Module):
    def __init__(self):
        super().__init__()


        self.hidden_layer_x = nn.Linear(words_len,hidden_size) #rnn关键部分
        self.hidden_layer_h = nn.Linear(hidden_size,hidden_size) #h用

        #注意这里无条件复制w、b，所以最好先判断两者形状相等再赋值
        params = list(rnn.parameters())
        assert self.hidden_layer_x.weight.shape == params[0].shape and \
                self.hidden_layer_h.weight.shape == params[1].shape and \
                self.hidden_layer_x.bias.shape == params[2].shape and \
                self.hidden_layer_h.bias.shape == params[3].shape,\
                print("shape error")

        self.hidden_layer_x.weight = params[0]
        self.hidden_layer_h.weight = params[1]
        self.hidden_layer_x.bias = params[2]
        self.hidden_layer_h.bias = params[3]

    def __init_hidden(self):
        return torch.zeros([num_layers,batch_size,hidden_size])

    def forward(self,x):
        self.hidden = self.__init_hidden()

        out = torch.zeros([sentence_len,batch_size,hidden_size])

        x = x.transpose(0,1)
        for i in range(x.shape[0]):
            a1 = self.hidden_layer_x(x[i])
            a2 = self.hidden_layer_h(self.hidden[0])
            self.hidden[0] = nn.Tanh()(a1+a2)


            out[i] = self.hidden[0]

        return out.transpose(0,1)

myrnn = MyRNN()
out = myrnn(X)
print(out.shape,myrnn.hidden.shape)
print(myrnn.hidden[0,0,:],'\n',out[0,0,:])

torch.Size([2, 5, 4])
pytorch部分
torch.Size([2, 5, 8]) torch.Size([1, 2, 8])
tensor([-0.7137,  0.5926, -0.6244,  0.6198, -0.1706, -0.5207, -0.0612,  0.4735],
       grad_fn=<SliceBackward0>) 
 tensor([-0.8456,  0.2111,  0.1764,  0.7712, -0.0146, -0.7987, -0.6467,  0.7652],
       grad_fn=<SliceBackward0>)
-------------------
MyCNN部分
torch.Size([2, 5, 8]) torch.Size([1, 2, 8])
tensor([-0.7137,  0.5926, -0.6244,  0.6198, -0.1706, -0.5207, -0.0612,  0.4735],
       grad_fn=<SliceBackward0>) 
 tensor([-0.8456,  0.2111,  0.1764,  0.7712, -0.0146, -0.7987, -0.6467,  0.7652],
       grad_fn=<SliceBackward0>)


# LSTM
## 基本理解
- rnn因为激活层是Tanh，显然，当前信息几乎不会对很远的计算产生影响，即只能短期记忆
- lstm中引入可以选择“记忆”和“当前”的信息对当前输出的占比 
## 具体介绍 
- 输入是H（类似于rnn的hidden作用），C（当前记忆）
- 一个单元分为记忆门，遗忘门，rnn门，输出门，其相当于四次rnn变换，只不过功能和激活函数不同
   - 记忆门i，Sigmoid激活，即当前rnn输出需要记哪些信息到记忆C中
   - 遗忘门f，Sigmoid激活，即原先记忆需要遗忘哪些信息
   - rnn门g，Tanh激活，就是rnn
   - 输出门o，Sigmoid激活，控制新记忆哪些作为新H
- 公式如下
$$
    \begin{array}{ll} \\
        i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
        f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
        g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
        o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
        c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
        h_t = o_t \odot \tanh(c_t) \\
    \end{array}
$$
## pytorch内部细节
- 将四个变换按维度一放在一起，同时计算四个门激活前的输出，输出顺序如上顺序，见如下代码

In [2]:
torch.manual_seed(1024)

batch_size = 2
sentence_len = 5
words_num = 10
words_len = 4

hidden_size=16
num_layers=1

X = torch.randint(0,words_num,(batch_size,sentence_len))
em = nn.Embedding(words_num,words_len)
X = em(X)
print(X.shape)

print('pytorch部分')
lstm = nn.LSTM(
            input_size=words_len,

            hidden_size=hidden_size,
            num_layers=num_layers,

            batch_first=True,
        )
H = torch.zeros([num_layers,batch_size,hidden_size])
C = torch.zeros([num_layers,batch_size,hidden_size])
out,(outh,outc) = lstm(X,(H,C)) #H、C默认为0
print(out.shape,outh.shape,outc.shape)
print(out[0,0,:])
print('-------------------')


print('MyLSTM部分')
class MyLSTM(nn.Module):
    def __init__(self):
        super().__init__()

        self.hidden_layer_x = nn.Linear(words_len,hidden_size*4) #4*32，即同时输出4组
        self.hidden_layer_h = nn.Linear(hidden_size,hidden_size*4) #8*32

        params = list(lstm.parameters())
        assert self.hidden_layer_x.weight.shape == params[0].shape and \
                self.hidden_layer_h.weight.shape == params[1].shape and \
                self.hidden_layer_x.bias.shape == params[2].shape and \
                self.hidden_layer_h.bias.shape == params[3].shape, \
                print("shape error")

        self.hidden_layer_x.weight = params[0]
        self.hidden_layer_h.weight = params[1]
        self.hidden_layer_x.bias = params[2]
        self.hidden_layer_h.bias = params[3]

    def __init_H_C(self):
        return torch.zeros([num_layers,batch_size,hidden_size]),torch.zeros([num_layers,batch_size,hidden_size])


    def forward(self,x):
        self.H,self.C = self.__init_H_C()
        #H是输出
        #C是记忆

        out = torch.zeros([sentence_len,batch_size,hidden_size])

        x = x.transpose(0,1)
        for i in torch.arange(x.shape[0]):

            a1 = self.hidden_layer_x(x[i])
            a2 = self.hidden_layer_h(self.H[0])
            a = a1+a2

            remember_gate = nn.Sigmoid()(a[:,:hidden_size*1]) #记住下面op什么信息
            forget_gate = nn.Sigmoid()(a[:,hidden_size*1:hidden_size*2]) #C遗忘什么信息
            op = nn.Tanh()(a[:,hidden_size*2:hidden_size*3])
            output_gate = nn.Sigmoid()(a[:,hidden_size*3:]) #最终输出
			
            self.C[0] = self.C[0]*forget_gate + remember_gate*op #新的记忆
            self.H[0] = nn.Tanh()(self.C)*output_gate #生成新的输出

            out[i] = self.H[0]

        return out.transpose(0,1)

mylstm = MyLSTM()
out = mylstm(X)
print(out.shape,mylstm.C.shape,mylstm.H.shape)
print(out[0,0,:])

torch.Size([2, 5, 4])
pytorch部分
torch.Size([2, 5, 16]) torch.Size([1, 2, 16]) torch.Size([1, 2, 16])
tensor([ 0.0842,  0.0372, -0.1389,  0.0300, -0.1425,  0.0223,  0.0017,  0.3239,
        -0.0413, -0.0924, -0.1835,  0.0621,  0.0484, -0.0894,  0.2133,  0.0063],
       grad_fn=<SliceBackward0>)
-------------------
MyLSTM部分
torch.Size([2, 5, 16]) torch.Size([1, 2, 16]) torch.Size([1, 2, 16])
tensor([ 0.0842,  0.0372, -0.1389,  0.0300, -0.1425,  0.0223,  0.0017,  0.3239,
        -0.0413, -0.0924, -0.1835,  0.0621,  0.0484, -0.0894,  0.2133,  0.0063],
       grad_fn=<SliceBackward0>)


## 优化
- 遗忘门=1-记忆门，减少运算次数
- 。。。

# transformers包

## pipeline

In [29]:
from transformers import pipeline
question_answerer = pipeline("question-answering") #内部有模型，用于回答问题的

context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
"""
question_answerer(question="What is extractive question answering?", context=context)

#中文版
from transformers import AutoModelForQuestionAnswering,AutoTokenizer,pipeline
model = AutoModelForQuestionAnswering.from_pretrained('uer/roberta-base-chinese-extractive-qa')
tokenizer = AutoTokenizer.from_pretrained('uer/roberta-base-chinese-extractive-qa')
zh_qa = pipeline("question-answering", model=model, tokenizer=tokenizer)
QA_input = {
    'question': "著名诗歌《假如生活欺骗了你》的作者是",
    'context': "普希金从那里学习人民的语言，吸取了许多有益的养料，\
        这一切对普希金后来的创作产生了很大的影响。这两年里，普希金创作了不少优秀的作品，如《囚徒》、\
            《致大海》、《致凯恩》和《假如生活欺骗了你》等几十首抒情诗，叙事诗《努林伯爵》，历史剧\
                《鲍里斯·戈都诺夫》，以及《叶甫盖尼·奥涅金》前六章。"
                }
zh_qa(QA_input)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/261M [00:00<?, ?B/s]

KeyboardInterrupt: 

## Tokenizer

In [33]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese") #https://huggingface.co/有很多模型
sen = "你好吗，吃饭了吗?"
tokens = tokenizer.tokenize(sen)
print(tokens)

sens = ["答案是不需要","完全不需要任何额外操作","多条数据和单条数据一样进行调用即可."]
res = tokenizer(
    sens, 
    padding="max_length", #不足补齐
    max_length=15,
    truncation=True #超过截断
    )
print(res)

['你', '好', '吗', '，', '吃', '饭', '了', '吗', '?']
{'input_ids': [[101, 5031, 3428, 3221, 679, 7444, 6206, 102, 0, 0, 0, 0, 0, 0, 0], [101, 2130, 1059, 679, 7444, 6206, 818, 862, 7583, 1912, 3082, 868, 102, 0, 0], [101, 1914, 3340, 3144, 2945, 1469, 1296, 3340, 3144, 2945, 671, 3416, 6822, 6121, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


# 文本预处理

## df处理

In [2]:
base_dir = "E:/DATA/feedback-prize-effectiveness/"

In [7]:
df = pd.read_csv(base_dir+"/train.csv")
df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [8]:
df[['discourse_type','discourse_effectiveness']].apply(pd.unique)

discourse_type             [Lead, Position, Claim, Evidence, Counterclaim...
discourse_effectiveness                   [Adequate, Ineffective, Effective]
dtype: object

In [9]:
print(len(pd.unique(df['essay_id'])))
for _,_,files in os.walk(base_dir+"/train"):
    print(len(files))
    break

4191
4191


In [7]:
base_dir = "E:/DATA/feedback-prize-effectiveness/"
df = pd.read_csv(base_dir+"/train.csv")

needed_col = ['essay_id','discourse_text','discourse_type','discourse_effectiveness']
df = df[needed_col]
df.columns=['id','text','type','ef']

typ = {'Lead':1,'Position':2, 'Claim':3, 'Evidence':4, 'Counterclaim':5, 'Rebuttal':6, 'Concluding Statement':7}
eff = {'Adequate':1, 'Ineffective':2, 'Effective':3}
df['type'] = df['type'].apply(lambda x:typ[x])
df['ef'] = df['ef'].apply(lambda x:eff[x])

#text处理
tk = AutoTokenizer.from_pretrained('bert-base-cased',use_fast=True)
tk.max_len=384
df['text'] = df['text'].apply(lambda x:tk(x,truncation=True))

df.iloc[0,1]

{'input_ids': [101, 8790, 117, 178, 112, 182, 7026, 117, 178, 112, 182, 1280, 1106, 1129, 2269, 1164, 1293, 1142, 1339, 1113, 7403, 1110, 170, 2379, 1657, 13199, 1137, 1191, 1175, 1110, 1297, 1113, 7403, 1115, 1189, 1122, 119, 1109, 1642, 1110, 1164, 1293, 9085, 1261, 170, 3439, 1104, 7403, 1105, 170, 1339, 1108, 1562, 1113, 1103, 5015, 119, 9085, 2144, 112, 189, 1221, 1191, 1103, 1657, 13199, 1108, 1687, 1118, 1297, 1113, 7403, 117, 1137, 1191, 1122, 1110, 1198, 170, 2379, 1657, 13199, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

# 开始

## 基本参数

In [24]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

base_dir = "E:/DATA/feedback-prize-effectiveness/"
modules_dir = './modules/'
if not os.path.exists(modules_dir):
    os.mkdir(modules_dir)
    print('modules_dir已创建')
else:
    for i in os.listdir(modules_dir):
        c_path = os.path.join(modules_dir, i)
        os.remove(c_path)
    print('modules_dir已存在，已将其清空')

input_len = 384

train_batch_size=64
test_batch_size=128
epochs=15

seed=101
def set_seed():
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_seed()

exp_name='exp1'
import shutil
if os.path.exists(exp_name):
    shutil.rmtree(exp_name)

cpu
modules_dir已存在，已将其清空


In [9]:
data_mode='debug'

## dataloader

In [25]:
tk = AutoTokenizer.from_pretrained('bert-base-cased',use_fast=True)
tk.max_len=input_len

class DfProc():
    def __init__(self, data_mode='debug'):

        self.data_mode = data_mode
        assert self.data_mode in ['debug','mini','all'],print("mode值错误")

        self.__Set_Df()


    def __Set_Df(self):

        #train
        df = pd.read_csv(base_dir+"/train.csv")

        #这里的策略是将type列作为输入文本的开头，则type列不再需要
        df['discourse_text'] = df['discourse_type']+ tk.sep_token + df['discourse_text'] #特殊符号

        #skf = StratifiedGroupKFold(5)
        #for i, (train_i,valid_i) in enumerate(skf.split(df,df['discourse_type'],groups=df['essay_id'])):
        #    df.loc[valid_i,'fold'] = i+1


        needed_col = ['essay_id','discourse_text','discourse_effectiveness']
        df = df[needed_col]
        df.columns=['id','text','ef']

        eff = {'Adequate':1, 'Ineffective':2, 'Effective':3}
        df['ef'] = df['ef'].apply(lambda x:eff[x])

        #text处理
        df['text'] = df['text'].apply(lambda x:tk(x,truncation=True))

        #train_df = df[df['fold']!=1]
        #valid_df = df[df['fold']==1]
        train_df = df


        if self.data_mode == 'all':
            self.train_df = train_df
            #self.valid_df = valid_df
        elif self.data_mode == 'debug':
            self.train_df = train_df[:2*train_batch_size]
            #self.valid_df = valid_df[:2*test_batch_size]
        else:
            self.train_df = train_df[:int(0.3*len(train_df))]
            #self.valid_df = valid_df[:int(0.3*len(valid_df))]

        #test
        df = pd.read_csv(base_dir+"/test.csv")

        df['discourse_text'] = df['discourse_type']+ tk.sep_token + df['discourse_text']

        needed_col = ['essay_id', 'discourse_text']
        df = df[needed_col]
        df.columns=['id','text']

        df['text'] = df['text'].apply(lambda x:tk(x,truncation=True))

        if self.data_mode == 'all':
            self.test_df = df
        elif self.data_mode == 'debug':
            self.test_df = df[:2*test_batch_size]
        else:
            self.test_df = df[:int(0.3*len(df))]

## model

In [27]:
def get_trainer(dfProc):
    args = TrainingArguments(
        'outputs', 
        learning_rate=8e-5, 
        warmup_ratio=0.1, 
        lr_scheduler_type='cosine', 
        fp16=True,
        evaluation_strategy="epoch", 
        per_device_train_batch_size=train_batch_size, 
        per_device_eval_batch_size=test_batch_size,
        num_train_epochs=epochs, 
        weight_decay=0.01, 
        report_to='none'
        )
    model = AutoModelForSequenceClassification.from_pretrained(
        'bert-base-cased', 
        num_labels=3
        )
    return Trainer(
        model, 
        args, 
        train_dataset=dfProc.train_df, 
        eval_dataset=dfProc.test_df,
        tokenizer=tk, 
        compute_metrics=lambda prey,y:nn.BCEWithLogitsLoss()(prey,y)
        )
trainer = get_trainer(DfProc())

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

KeyboardInterrupt: 