In [41]:
import torch
import torch.nn as nn
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split

import pandas as pd
from tqdm import tqdm 

In [42]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = SummaryWriter()

In [43]:
df = pd.read_excel('/kaggle/input/jd_comment_with_label/jd_comment_data.xlsx')

df.head()

Unnamed: 0,爬取时间(__time),爬取链接(__url),商品ID(product_id),评价时间(publish_time),评分（总分5分）(score),评价内容(content),评价者(author_name),评价者会员等级(author_level),商品sku(product_sku),评价标签(tags)
0,2019-03-08 00:50:34,https://sclub.jd.com/comment/productPageCommen...,4722324,1550631798,5,此用户未填写评价内容,j***e,注册会员,单机版 小D黑色,[]
1,2019-03-08 00:50:34,https://sclub.jd.com/comment/productPageCommen...,4722324,1550633151,5,此用户未填写评价内容,c***n,钻石会员,单机版 小D黑色,[]
2,2019-03-08 00:50:34,https://sclub.jd.com/comment/productPageCommen...,4722324,1550633330,3,此用户未填写评价内容,j***1,银牌会员,单机版 小D黑色,[]
3,2019-03-08 00:50:34,https://sclub.jd.com/comment/productPageCommen...,4722324,1550633401,5,此用户未填写评价内容,苗***4,钻石会员,单机版 小D黑色,[]
4,2019-03-08 00:50:33,https://sclub.jd.com/comment/productPageCommen...,4722324,1550633461,5,此用户未填写评价内容,J***3,注册会员,单机版 小D黑色,[]


In [44]:
df.columns

Index(['爬取时间(__time)', '爬取链接(__url)', '商品ID(product_id)', '评价时间(publish_time)',
       '评分（总分5分）(score)', '评价内容(content)', '评价者(author_name)',
       '评价者会员等级(author_level)', '商品sku(product_sku)', '评价标签(tags)'],
      dtype='object')

In [45]:
filterd= df['评价内容(content)'] != "此用户未填写评价内容"
data_df = df[filterd][['评价内容(content)','评分（总分5分）(score)']]

In [46]:
data_df.head()

Unnamed: 0,评价内容(content),评分（总分5分）(score)
15,一般般，一分钱一分货吧,1
18,商品质量很好，很满意，配送速度快啊，而且配送员态度也非常好。,4
19,。。。,5
22,刘慧敏提莫摸摸摸休息泽TCL退咯的一组婆婆破鼓规土局,5
25,还好还好还好还好红红火火好很好好,5


In [47]:
data = data_df.values
data

array([['一般般，一分钱一分货吧', 1],
       ['商品质量很好，很满意，配送速度快啊，而且配送员态度也非常好。', 4],
       ['。。。', 5],
       ...,
       ['没有色差，穿上很舒服，到货快。', 5],
       ['可以', 5],
       ['物有所值 客服贴心东西收到以后马上查看，发现与图片描述一致，超级喜欢，卖家发货 速度很快 ，，服务也很到位，给老板点个赞，下次还会来购买.........',
        5]], dtype=object)

In [48]:
train ,test = train_test_split(data)

In [49]:
print(train.shape)
print(test.shape)

(33316, 2)
(11106, 2)


In [50]:
# 分词器
tokenzier = AutoTokenizer.from_pretrained('hfl/chinese-bert-wwm')

In [51]:
# 自定义DataLoader创建方法

def warp_data(batch_data):
    comments, lables = [],[]
    for bdate in batch_data:
        comments.append(bdate[0])
        lables.append(int(bdate[1])-1)  # 标签取值[0-4]
    
    # 转换模型输入数据
    input_data = tokenzier(comments, return_tensors='pt', padding=True, truncation=True, max_length=512)
    labels_data = torch.tensor(lables)
    
    return input_data, labels_data

train_dl = DataLoader(train, batch_size=20, shuffle=True, collate_fn = warp_data)
test_dl = DataLoader(test, batch_size=20, shuffle=False, collate_fn = warp_data)

In [12]:
# for item in test_dl:
#     print(item)
#     break


({'input_ids': tensor([[  101,  6132,  3302,  3119,  1168,   749,  8024,  3043,   677,  1343,
          7481,  3160,  2523,  5653,  3302,  8024,   817,  3419,  2141,  2669,
          6574,  7030,  1348,  1962,  8024,  7478,  2382,  4007,  2692,   511,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  6574,  7030,  2345,   510,   679,  5543,  4500,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  4717,  6132,  3119,  1168,   749,  8024,  1157,  1962,  1394,
          6716,  8024,  7481,  3160,   738,  5653,  3302,  8013,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
   

In [52]:
# model_1 模型微调 Supervised Fine Tuning
# model_2 迁移学习 Transfer Learning 冻结bert
model_1 = AutoModelForSequenceClassification.from_pretrained('hfl/chinese-bert-wwm', num_labels=5)
model_2 = AutoModelForSequenceClassification.from_pretrained('hfl/chinese-bert-wwm', num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-bert-wwm and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-bert-wwm and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
model_1 = model_1.to(device)
model_2 = model_2.to(device)
model_2.bert.trainable = False

In [54]:
# loss、optim
loss_fn1 = nn.CrossEntropyLoss()
optim1 = Adam(model_1.parameters(), lr=1e-4)

loss_fn2 = nn.CrossEntropyLoss()
optim2 = Adam(model_2.parameters(), lr=1e-3)


In [57]:
model1_train_loss_cnt = 0

for epoch in range(3):
    pbar = tqdm(train_dl)
    for input_data, labels_data in pbar:
        datas = { k:v.to(device) for k,v in input_data.items() }
        labels = labels_data.to(device)
        
        result = model_1(**datas)
        loss = loss_fn1(result.logits, labels)
        
        pbar.set_description(f'epoch:{epoch} train_loss:{loss.item():.4f}')

        writer.add_scalar("Fine Tuning Train Loss", loss, model1_train_loss_cnt)
        model1_train_loss_cnt += 1
        
        loss.backward()
        optim1.step()
        
        model_1.zero_grad()


torch.save(model_1.state_dict(),'model_1.pt')
        

epoch:0 train_loss:0.1528: 100%|██████████| 1666/1666 [06:26<00:00,  4.31it/s]
epoch:1 train_loss:0.6936: 100%|██████████| 1666/1666 [06:26<00:00,  4.31it/s]
epoch:2 train_loss:0.2600: 100%|██████████| 1666/1666 [06:27<00:00,  4.29it/s]


In [58]:
model2_train_loss_cnt = 0

for epoch in range(3):
    pbar = tqdm(train_dl)
    for input_data, labels_data in pbar:
        datas = { k:v.to(device) for k,v in input_data.items() }
        labels = labels_data.to(device)
        
        result = model_2(**datas)
        loss = loss_fn2(result.logits, labels)
        
        pbar.set_description(f'epoch:{epoch} train_loss:{loss.item():.4f}')

        writer.add_scalar("Transfer Learning Train Loss", loss, model2_train_loss_cnt)
        model2_train_loss_cnt += 1
        
        loss.backward()
        optim2.step()
        
        model_2.zero_grad()


torch.save(model_2.state_dict(),'model_2.pt')

epoch:0 train_loss:0.5336: 100%|██████████| 1666/1666 [06:28<00:00,  4.29it/s]
epoch:1 train_loss:0.0677: 100%|██████████| 1666/1666 [06:22<00:00,  4.36it/s]
epoch:2 train_loss:0.1532: 100%|██████████| 1666/1666 [06:27<00:00,  4.30it/s]


In [59]:
model_1.eval()
model_2.eval()
pbar = tqdm(test_dl)


correct1, correct2 = 0,0

for input_data, labels_data in pbar:
    datas = { k:v.to(device) for k,v in input_data.items() }
    labels = labels_data.to(device)

    with torch.no_grad():
        result1 = model_1(**datas)
        result2 = model_2(**datas)

    predict1 = torch.argmax(result1.logits, dim=-1)
    predict2 = torch.argmax(result2.logits, dim=-1)

    correct1 += (predict1 == labels).sum()
    correct2 += (predict1 == labels).sum()

100%|██████████| 556/556 [01:21<00:00,  6.81it/s]


In [None]:

model_1.load_state_dict()