In [36]:
import torch
import torch.nn as nn
from torch.optim import Adam
from transformers import AutoTokenizer,AutoModelForSequenceClassification 
from torch.utils.data import DataLoader
from torch.utils.tensorboard  import SummaryWriter

from sklearn.model_selection import train_test_split

import pandas as pd
from tqdm import tqdm


In [37]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = SummaryWriter()

In [38]:
df = pd.read_excel('E:\\workspacepythonlearn\\jd_comment_data.xlsx')
df.head()

Unnamed: 0,爬取时间(__time),爬取链接(__url),商品ID(product_id),评价时间(publish_time),评分（总分5分）(score),评价内容(content),评价者(author_name),评价者会员等级(author_level),商品sku(product_sku),评价标签(tags)
0,2019-03-08 00:50:34,https://sclub.jd.com/comment/productPageCommen...,4722324,1550631798,5,此用户未填写评价内容,j***e,注册会员,单机版 小D黑色,[]
1,2019-03-08 00:50:34,https://sclub.jd.com/comment/productPageCommen...,4722324,1550633151,5,此用户未填写评价内容,c***n,钻石会员,单机版 小D黑色,[]
2,2019-03-08 00:50:34,https://sclub.jd.com/comment/productPageCommen...,4722324,1550633330,3,此用户未填写评价内容,j***1,银牌会员,单机版 小D黑色,[]
3,2019-03-08 00:50:34,https://sclub.jd.com/comment/productPageCommen...,4722324,1550633401,5,此用户未填写评价内容,苗***4,钻石会员,单机版 小D黑色,[]
4,2019-03-08 00:50:33,https://sclub.jd.com/comment/productPageCommen...,4722324,1550633461,5,此用户未填写评价内容,J***3,注册会员,单机版 小D黑色,[]


In [39]:
df.columns

Index(['爬取时间(__time)', '爬取链接(__url)', '商品ID(product_id)', '评价时间(publish_time)',
       '评分（总分5分）(score)', '评价内容(content)', '评价者(author_name)',
       '评价者会员等级(author_level)', '商品sku(product_sku)', '评价标签(tags)'],
      dtype='object')

In [40]:
filterd =df['评价内容(content)'] !="此用户未填写评价内容" # 过滤掉没有评价内容的数据，结果是true或者false
data_df = df[filterd][['评价内容(content)','评分（总分5分）(score)']] #获取评价内容与评分


In [41]:
data_df.head()

Unnamed: 0,评价内容(content),评分（总分5分）(score)
15,一般般，一分钱一分货吧,1
18,商品质量很好，很满意，配送速度快啊，而且配送员态度也非常好。,4
19,。。。,5
22,刘慧敏提莫摸摸摸休息泽TCL退咯的一组婆婆破鼓规土局,5
25,还好还好还好还好红红火火好很好好,5


In [42]:
data=data_df.values
data

array([['一般般，一分钱一分货吧', 1],
       ['商品质量很好，很满意，配送速度快啊，而且配送员态度也非常好。', 4],
       ['。。。', 5],
       ...,
       ['没有色差，穿上很舒服，到货快。', 5],
       ['可以', 5],
       ['物有所值 客服贴心东西收到以后马上查看，发现与图片描述一致，超级喜欢，卖家发货 速度很快 ，，服务也很到位，给老板点个赞，下次还会来购买.........',
        5]], shape=(44422, 2), dtype=object)

In [43]:
train ,test =train_test_split(data)

In [44]:
tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-bert-wwm')

 

In [46]:
def warp_data(batch_data):
    comment,lablels = [],[]
    for bdate in batch_data:
        comment.append(bdate[0])
        lablels.append(int(bdate[1])-1)

    #输入数据
    input_data=tokenizer(comment,return_tensors='pt',padding=True,max_length=512)
    labels_data = torch.tensor(lablels)
    return input_data,labels_data

train_dl=DataLoader(train,batch_size=4,shuffle=True,collate_fn=warp_data)
test_dl=DataLoader(test,batch_size=4,shuffle=True,collate_fn=warp_data)

In [47]:
for item in test_dl:
    print(item)
    break


({'input_ids': tensor([[ 101, 2523, 4023,  778, 2523,  912, 2139, 3302, 1218,  738, 2523, 1962,
         7478, 2382, 4007, 2692,  106,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0],
        [ 101, 2523, 1962, 1452, 8024, 2523, 1962, 4692, 8024, 3344, 2094, 2523,
         1962, 8024, 1259, 6163, 4638,  738, 2523, 1962, 8024, 2571, 6853,  738,
         2523, 2571, 8024, 2145, 3302,  738, 2523, 1962,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0],
        [ 101, 6862, 2428, 2571, 8024, 6574, 7030, 1962, 8024, 5439, 3322, 1690,
         1285, 5277, 1400, 2571, 4638, 7607, 6629, 8024, 6820,



In [48]:
model_1 =AutoModelForSequenceClassification.from_pretrained("hfl/chinese-bert-wwm", num_labels=5)
model_2 =AutoModelForSequenceClassification.from_pretrained("hfl/chinese-bert-wwm", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-bert-wwm and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-bert-wwm and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
model_1 = model_1.to(device)
model_2 = model_2.to(device)
model_2.bert.trainable = False

In [50]:
loss_fn1 = nn.CrossEntropyLoss()
optimizer1=Adam(model_1.parameters(),lr=1e-4)

loss_fn2 = nn.CrossEntropyLoss()
optimizer1=Adam(model_2.parameters(),lr=1e-4)

In [None]:
model_1_train_loss_cnt  = 0

for epoch in range(3):
    pbar = tqdm(train_dl)
    for input_data,labels_data in pbar:
        datas =  {k:v.to(device) for k,v in input_data.items()}
        labels = labels_data.to(device)

        result = model_2(**datas)
        loss = loss_fn2(result.logits,labels)

        pbar.set_description(f"Epoch: {epoch} trainLoss: {loss.item():.4f}")

        writer.add_scalar("train_loss",loss.item(),model_1_train_loss_cnt)
        model_1_train_loss_cnt += 1

        loss.backward()
        optimizer1.step()

        model_2.zero_grad()


torch.save(model_2.state_dict(),"model_1.pt")



Epoch: 0 trainLoss: 0.9587:  42%|████▏     | 3503/8329 [01:44<02:24, 33.51it/s]


RuntimeError: The size of tensor a (626) must match the size of tensor b (512) at non-singleton dimension 1