In [66]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import random_split, DataLoader,Dataset
import re
import torch.nn.functional as F
import gensim
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence,pad_packed_sequence

In [67]:
# Load the pretrained 50-dimensional wiki word2vec vectors (binary format).
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/Comment/wiki_word2vec_50.bin',
    binary=True,
)

In [68]:
# The wiki word2vec vocabulary and the training corpus do not contain one
# another, so a dedicated vocabulary is built from train.txt.
with open('data/Comment/train.txt','r',encoding='UTF-8') as f:
    # Every line is split on single whitespace characters; the first field and
    # the last piece of the split are discarded, the rest are the words.
    words = {w for line in f.readlines() for w in re.split(r'[\s]', line)[1:-1]}

# `words` becomes the sorted vocabulary; indices start at 1.
words = sorted(words)
word2ix = dict(zip(words, range(1, len(words) + 1)))
ix2word = {ix: w for w, ix in word2ix.items()}

# Index 0 is reserved for 'UNK': words that show up in the validation/test
# data (or at prediction time) but were never seen in the training file.
word2ix['UNK'] = 0
ix2word[0] = 'UNK'

In [69]:
# Base dataset: every sample keeps its own sentence length, so it cannot be
# batched by the default DataLoader collate function (see a custom collate_fn).
class MyComment2(Dataset):
    """Labelled comments read from a whitespace-separated text file.

    Each non-blank line is ``<int label> <word> <word> ...``. Words are mapped
    to indices through ``word2ix``; words missing from it map to index 0.

    Args:
        path: Path of the UTF-8 text file to read.
        word2ix: Mapping from word to integer index.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """

    def __init__(self,path,word2ix):
        self.word2ix=word2ix
        self.labels=[]
        self.comments=[]
        with open(path,'r',encoding='UTF-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing empty line) carry no sample.
                if not line.strip():
                    continue
                # Remove only the newline before splitting. On newline-terminated
                # lines this yields the same tokens as splitting first and
                # dropping the last piece, but it no longer loses the final word
                # of a last line that has no trailing newline.
                fields=re.split(r'[\s]',line.rstrip('\n'))
                self.labels.append(int(fields[0]))
                comment_dig=[word2ix.get(token,0) for token in fields[1:]]
                self.comments.append(torch.tensor(comment_dig,dtype=torch.int64))

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self,num):
        """Return ``(index tensor of dtype int64, int label)`` for sample ``num``."""
        return self.comments[num],self.labels[num]
    
# Collate step: returns a padded index tensor plus the true lengths, which are
# what a packed-sequence LSTM needs.
def mycollate_fn(batch):
    """Collate ``(index_tensor, label)`` samples into one padded batch.

    Samples are ordered by decreasing sequence length (stable for ties) and
    right-padded with 0 to the longest sequence.

    Args:
        batch: List of ``(1-D index tensor, int label)`` pairs.

    Returns:
        Tuple ``(comment, lengths, labels)``: the padded ``[batch, max_len]``
        tensor, a list with the unpadded length of every row, and a ``long``
        tensor of labels, all in the same (sorted) order.
    """
    # sorted() instead of list.sort(): the caller's list is left untouched.
    ordered=sorted(batch,key=lambda sample:len(sample[0]),reverse=True)
    lengths=[len(sample[0]) for sample in ordered]
    labels=torch.tensor([sample[1] for sample in ordered],dtype=torch.long)
    comment=pad_sequence([sample[0] for sample in ordered],batch_first=True,padding_value=0)
    return comment,lengths,labels

In [43]:
# Build the datasets and DataLoaders.
# Each batch is (padded word-index tensor, lengths, labels); the embedding
# lookup happens later, inside the model.
train_ds=MyComment2(path='data/Comment/train.txt',word2ix=word2ix)
train_loader=DataLoader(train_ds,batch_size=40,shuffle=True,collate_fn=mycollate_fn)

# Evaluation data is not shuffled: metrics are averaged per batch, so a fixed
# batch composition keeps validation/test numbers reproducible between runs.
val_ds=MyComment2(path='data/Comment/validation.txt',word2ix=word2ix)
val_loader=DataLoader(val_ds,batch_size=40,shuffle=False,collate_fn=mycollate_fn)
test_ds=MyComment2(path='data/Comment/test.txt',word2ix=word2ix)
test_loader=DataLoader(test_ds,batch_size=40,shuffle=False,collate_fn=mycollate_fn)


In [71]:
print(len(word2ix))
# Build the embedding matrix: row i holds the pretrained vector of ix2word[i].
# Words that the word2vec model does not know are initialised to a zero vector.
embed_dim=word2vec_model.vector_size  # width taken from the model instead of a literal 50
my_weight=[]
for i in range(len(word2ix)):
    word=ix2word[i]
    if word in word2vec_model.key_to_index:
        my_weight.append(torch.tensor(word2vec_model.get_vector(word)))
    else:
        my_weight.append(torch.zeros(embed_dim))
my_weight=torch.stack(my_weight)

53338


In [72]:
#可选是否要把长度信息提供给lstm
#可选使用hidden做分类还是output做分类
#可选是否对初始的矩阵进行调整

def accuracy(outputs, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    Args:
        outputs: 2-D tensor of per-class scores, one row per sample.
        labels: 1-D tensor of class indices, one per row of ``outputs``.

    Returns:
        A 0-dim tensor holding ``correct / number_of_rows``.
    """
    predicted = torch.max(outputs, dim=1)[1]
    n_correct = (predicted == labels).sum().item()
    return torch.tensor(n_correct / len(predicted))

class FeelingCatcher(nn.Module):
    """LSTM sentiment classifier on top of pretrained word embeddings.

    Pipeline: embedding -> multi-layer LSTM -> one readout vector per sequence
    -> three linear layers (tanh + dropout between them) -> 2 logits.

    Args:
        hid_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        renew: If true the embedding matrix is fine-tuned, otherwise frozen.
        length_change: If true, ``forward`` packs the batch with the true
            lengths so the LSTM never processes padding.
        who: Which LSTM result feeds the classifier:
            ``'hidden'``      - final hidden state of the top layer,
            ``'output'``      - mean of the per-step outputs,
            ``'output_last'`` - output of the last time step.
        weights: Optional ``[vocab, embed_dim]`` embedding matrix. Defaults to
            the module-level ``my_weight``.

    Raises:
        ValueError: If ``who`` is not one of the three supported values.
    """

    _VALID_WHO=('hidden','output','output_last')

    def __init__(self,hid_dim,n_layers, renew ,length_change, who, weights=None):
        # Returning a value from __init__ raises an unrelated TypeError, so an
        # invalid option is reported with an explicit exception instead.
        if who not in self._VALID_WHO:
            raise ValueError('who是指用谁进行判断，请在hidden，output，output_last中做选择')
        super(FeelingCatcher,self).__init__()

        if weights is None:
            weights=my_weight
        embed_dim=weights.shape[1]

        # `freeze` is what actually controls whether the embedding is trained;
        # assigning to `requires_grad_` only shadowed the method and left the
        # table frozen. The matrix is cloned so that fine-tuning does not
        # modify the shared source tensor in place.
        self.embedding=nn.Embedding.from_pretrained(weights.detach().clone(),freeze=not renew)

        # Inter-layer dropout is only defined for stacked LSTMs; passing it
        # with a single layer just triggers a warning.
        self.lstm=nn.LSTM(embed_dim,hid_dim,n_layers,batch_first=True,
                          dropout=0.2 if n_layers>1 else 0.0)
        # lstm h_n / c_n : [n_layers, batch, hid_dim]
        # lstm outputs   : [batch, seq_len, hid_dim]

        self.dropout=nn.Dropout(0.2)
        self.fc1=nn.Linear(hid_dim,256)
        self.fc2=nn.Linear(256,64)
        self.fc3=nn.Linear(64,2)

        self.length_change=length_change  # whether to use the true lengths
        self.who=who  # which LSTM result is classified

    def forward(self,comments,lengths):
        """Compute class logits for a padded batch.

        Args:
            comments: ``[batch, seq_len]`` tensor of word indices.
            lengths: True length of every row (list or 1-D tensor). Only used
                when ``length_change`` is true.

        Returns:
            ``[batch, 2]`` tensor of logits.
        """
        out=self.embedding(comments)

        if self.length_change:
            # pack_padded_sequence wants the lengths on the CPU.
            seq_lens=torch.as_tensor(lengths,dtype=torch.int64).cpu()
            packed=pack_padded_sequence(out,seq_lens,batch_first=True,enforce_sorted=False)
            packed,hidden=self.lstm(packed)  # hidden = (h_n, c_n)
            out=pad_packed_sequence(packed,batch_first=True)[0]  # [batch, seq_len, hid_dim]
        else:
            out,hidden=self.lstm(out)

        if self.who=='output_last':
            if self.length_change:
                # After unpacking, positions past a sequence's length are zero
                # padding, so the last real step has to be picked per row.
                last_step=(seq_lens-1).to(out.device)
                rows=torch.arange(out.size(0),device=out.device)
                out=out[rows,last_step]  # [batch, hid_dim]
            else:
                out=out[:,-1,:]  # [batch, hid_dim]
        elif self.who=='output':
            if self.length_change:
                # Padded steps contribute zeros to the sum; divide by the true
                # length so short sequences are not scaled down.
                denom=seq_lens.to(device=out.device,dtype=out.dtype).unsqueeze(1)
                out=torch.sum(out,dim=1)/denom
            else:
                out=torch.sum(out,dim=1)/out.shape[1]
        else:
            # 'hidden': hidden[0] is h_n (hidden[1] is the cell state c_n);
            # take the top layer.
            out=hidden[0][-1,:,:]

        out=self.dropout(torch.tanh(self.fc1(out)))
        out=self.dropout(torch.tanh(self.fc2(out)))
        out=self.fc3(out)

        return out   # [batch, 2]

    def step(self, batch, device):
        """Run one batch and return ``{'loss': tensor, 'acc': tensor}``."""
        comments, lengths, labels = batch
        comments=comments.to(device)
        labels=labels.to(device)

        out = self(comments,lengths)
        loss = F.cross_entropy(out, labels)
        acc = accuracy(out, labels)
        return {'loss': loss, 'acc': acc}

    def evaluate(self, loader,device):
        """Return the per-batch mean loss and accuracy over ``loader`` as floats."""
        outputs = [self.step(batch,device) for batch in loader]
        losses = [x['loss'] for x in outputs]
        losses = torch.stack(losses).mean()
        accs = [x['acc'] for x in outputs]
        accs = torch.stack(accs).mean()
        return {'loss': losses.item(), 'acc': accs.item()}

    def epoch_end(self, epoch, train_loss,result):
        """Print the training loss and the validation loss/accuracy of an epoch."""
        print("Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, train_loss, result['loss'], result['acc']))

In [73]:
def fit(epochs, max_lr, model, train_loader, val_loader, grad_clip=None, opt_func=torch.optim.Adam,device='cpu'):
    """Train ``model`` and validate it after every epoch.

    Args:
        epochs: Number of passes over ``train_loader``.
        max_lr: Learning rate handed to the optimizer.
        model: Object exposing ``parameters()``, ``train()``, ``eval()``,
            ``step(batch, device)`` (returns a dict with a ``'loss'`` tensor),
            ``evaluate(loader, device)`` (returns a dict with a ``'loss'``
            entry) and ``epoch_end(epoch, train_loss, result)``.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        grad_clip: If truthy, gradients are clipped element-wise to this value.
        opt_func: Optimizer factory called as
            ``opt_func(params, max_lr, weight_decay=1e-4)``.
        device: Device forwarded to ``model.step`` / ``model.evaluate``.

    Returns:
        ``(history_train, history_val)``: per-epoch mean training loss and
        per-epoch validation loss.
    """
    history_train = []
    history_val = []
    optimizer = opt_func(model.parameters(), max_lr, weight_decay=1e-4)

    for epoch in range(epochs):
        model.train()

        running_loss = 0.0
        n_batches = 0
        for batch in train_loader:
            optimizer.zero_grad()
            loss = model.step(batch, device)['loss']
            loss.backward()
            if grad_clip:
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)
            optimizer.step()
            running_loss += loss.item()
            n_batches += 1

        # Report the mean over the epoch rather than whatever the last batch
        # happened to produce; an empty loader yields NaN instead of crashing.
        train_loss = running_loss / n_batches if n_batches else float('nan')

        model.eval()
        with torch.no_grad():
            result = model.evaluate(val_loader, device)
        model.epoch_end(epoch, train_loss, result)

        history_train.append(train_loss)
        history_val.append(result['loss'])

    return history_train, history_val

In [69]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 3-layer LSTM with hidden size 128, packed variable-length input,
# classification from the last output step.
model = FeelingCatcher(
    hid_dim=128,
    n_layers=3,
    renew=True,
    length_change=True,
    who='output_last',
).to(device)

# One epoch with learning rate 0.01.
history_train, history_val = fit(
    epochs=1,
    max_lr=0.01,
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
)

KeyboardInterrupt: 

In [106]:
import jieba

def predict(x,path='comment.pth'):
    """Classify one review and print the verdict.

    The text is split on full-width commas, each piece is segmented with
    jieba, and the tokens are mapped to indices through ``word2ix`` (unknown
    tokens map to 0). A ``FeelingCatcher`` is rebuilt and its weights are
    loaded from ``path`` on every call.

    Args:
        x: The review text.
        path: Checkpoint file holding the model ``state_dict``.

    Returns:
        None. The review and the predicted verdict are printed.
    """
    # Tokenise exactly as before: segment every clause, join with spaces and
    # split on single spaces again.
    tokens=[]
    for clause in x.split('，'):
        tokens.extend(' '.join(jieba.cut(clause,cut_all=False)).split(' '))
    num_seg=[word2ix.get(token,0) for token in tokens]

    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # The input has to live on the same device as the model.
    num_seg=torch.tensor(num_seg,dtype=torch.long,device=device).unsqueeze(0)
    seg_len=[num_seg.size()[1]]

    model=FeelingCatcher(128,3,renew=True,length_change=True,who='output_last').to(device)
    model.load_state_dict(torch.load(path,map_location=device))
    # Inference mode: without eval() the dropout layers stay active and the
    # prediction for the same text can change from call to call.
    model.eval()

    with torch.no_grad():
        pred=torch.argmax(model(num_seg,seg_len)).item()
    if pred:
        print(x,'\n','是差评55555')
    else:
        print(x,'\n','是好评！')

In [112]:
# Try the classifier on two hand-written reviews, with a separator line
# printed between the results.
demo_reviews = [
    '整部片子全是变态。一个不良少女，一个弱智杀手，一个丧心病狂的警察。我怎么会看这么毁三观的片子。',
    '里昂只有一颗盆栽，不善言辞，爱喝牛奶。他不像，却真正是一个杀手。玛蒂达的到来，是包袱，也给里昂带来了生机。不过这种设定，注定是悲剧收场。里昂死后，玛蒂达将他盆栽的种子落地生根，里昂终于不再每日拿着手枪在椅子上不安地入睡，他落地了。娜塔莉波特曼太灵了，玛蒂达是如此特别。',
]
for position, review in enumerate(demo_reviews):
    if position:
        print('------------------------------------------------------------------------')
    predict(review)

整部片子全是变态。一个不良少女，一个弱智杀手，一个丧心病狂的警察。我怎么会看这么毁三观的片子。 
 是差评55555
------------------------------------------------------------------------
里昂只有一颗盆栽，不善言辞，爱喝牛奶。他不像，却真正是一个杀手。玛蒂达的到来，是包袱，也给里昂带来了生机。不过这种设定，注定是悲剧收场。里昂死后，玛蒂达将他盆栽的种子落地生根，里昂终于不再每日拿着手枪在椅子上不安地入睡，他落地了。娜塔莉波特曼太灵了，玛蒂达是如此特别。 
 是好评！


['评分超高的影片', '看完毫无感觉', '明明很一般啊', '嗯', '一定是我不懂得欣赏'] 
 是好评！
