### 1.数据处理

In [1]:
import jieba
import torch
from torch import nn
from tqdm import tqdm 

#### 1,读取原始文件数据

In [2]:
file_name = "../../dataset/中文外卖评论数据集.csv"
X = []
y = []
with open(file=file_name, mode="r", encoding="utf8") as f:
    f.readline()
    for line in f:
        line = line.strip()
        line_y = line[:1]
        line_y = line_y.strip()
        
        line_x = ",".join(line.split(",")[1:])
        X.append(line_x)
        y.append(line_y)  

In [3]:
len(X)

11987

In [4]:
len(y)

11987

In [5]:
# X

In [6]:
# y

#### 2,数据转为np

In [7]:

import numpy as np
X = np.array(X)
y = np.array(y, dtype=int)

In [8]:
# X

In [46]:
y

array([1, 1, 1, ..., 0, 0, 0])

#### 3,构建字典

In [10]:

words = { word for sentence in X for word in jieba.lcut(sentence)}

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/mc/t06t7bmj32zflsscnj8kphfwjl80v8/T/jieba.cache
Loading model cost 0.348 seconds.
Prefix dict has been built successfully.


In [11]:
# words

In [12]:
len(words)

11008

#### 4,增加UKN，PAD

In [13]:
words.add("<UNK>")
words.add("<PAD>")

In [14]:
dict_len =len(words)

####  5,词与序号互转

In [15]:
word2idx = {word:idx for idx, word in enumerate(words)}
idx2word = {idx:word for word,idx in word2idx.items()}

In [16]:
# idx2word

In [17]:
# word2idx

####  6, 转词序号 与 对齐每句评论

In [18]:
# 批量最大长度
sentence_max  = 0
sentence_max = max([len(jieba.lcut(x)) for x in X])
sentence_max = 20 if sentence_max >=20 else sentence_max

In [19]:
sentence_max

20

In [20]:
X_sentences = []
for sentence in X:
    idx_setenct =  [word2idx.get(word, word2idx.get("<UNK>")) for word in sentence]
    idx_setenct =  idx_setenct + [word2idx.get("<PAD>")]*sentence_max
    idx_setenct =  idx_setenct[:sentence_max]
    X_sentences += [idx_setenct]
    

In [21]:
# X_sentences

#### 9，数据切分

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X_sentences, y, test_size=0.2,random_state=0)

In [47]:
y_train

array([0, 0, 0, ..., 0, 0, 1])

In [23]:
len(X_train), len(X_test), len(y_train), len(y_test)

(9589, 2398, 9589, 2398)

In [24]:
# X_train

In [25]:
# y_train

In [26]:
# X_test

In [27]:
# y_test

#### 10，批量打包数据

In [28]:
from torch.utils.data import Dataset,DataLoader
import torch

class MyDataSet(Dataset):
    def __init__(self,X,y):
        super().__init__()
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        
        x = torch.tensor(data=x, dtype=torch.long)
#         x = x.contiguous().view(-1)
        y = torch.tensor(data=y, dtype=torch.long)
        
        return x, y 
    

#[seq_len, batch_size]
# X_sentences_all_tenser = torch.tensor(data=X_sentences_all, dtype=torch.long).T
# X_sentences_all_tenser.shape

In [29]:
train_dataset = MyDataSet(X=X_train, y=y_train)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)

test_dataset = MyDataSet(X=X_test, y=y_test)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=4, shuffle=True)

In [30]:
# for i,j in train_dataset:
#     print(i,j)

In [31]:
# test_dataset.__getitem__(0)

In [32]:
# for i,j in train_dataloader:
#     print(i,j)
#     print(i.shape)

### 2. 搭建模型

#### 2.1,模型定义

In [33]:
class Model(nn.Module):
    def __init__(self, dict_len=5000, embedding_dim=256, n_classes=2):
        super().__init__()
        # 词向量层
        self.embed = nn.Embedding(num_embeddings=dict_len, embedding_dim=embedding_dim)
        # 循环神经网络层
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=False)
        # 全连接层，转换输出
        self.out = nn.Linear(in_features=embedding_dim, out_features=n_classes)
    
    def forward(self, x):
#         print("输入原始x",x.shape)
        # [seq_len, batch_size] --> [seq_len, batch_size, embedding_dim]
        x = self.embed(x)
#         print("[seq_len, batch_size] --> [seq_len, batch_size, embedding_dim]",x.shape)
        
        
        # out: [seq_len, batch_size, embedding_dim]
        # hn: [ num_layers * num_directions, batch_size, embedding_dim]
        out, hn = self.rnn(x)
#         print("hn: [1, batch_size, embedding_dim]",hn.shape)
        
        
        # [seq_len, batch_size, embedding_dim] --> [batch_size, embedding_dim]
        x = torch.squeeze(input=hn, dim=0)
#         print("[seq_len, batch_size, embedding_dim] --> [batch_size, embedding_dim]",x.shape)
        
#         x = out[:, -1, :]  # 取最后一个时间步的输出
        #[batch_size, n_classes]
        x = self.out(x)
#         print("[batch_size, n_classes]",x.shape)
        
        return x

#### 2.2筹备训练

In [34]:

epochs = 100
learning_rate = 1e-3

model = Model(dict_len=dict_len, embedding_dim=256, n_classes=2)
loss_fn = nn.CrossEntropyLoss()
optimitzer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

In [35]:
model

Model(
  (embed): Embedding(11010, 256)
  (rnn): RNN(256, 256)
  (out): Linear(in_features=256, out_features=2, bias=True)
)

In [36]:
# W = torch.randint(low=0, high=15, size=(15, 4), dtype = torch.long)
# W.shape

In [37]:
# model(W)

### 3 训练过程

In [50]:
def get_acc(dataloader):
    # 模型设置为评估模式（BatchNorm, LayNorm,Dropout层
    # 在train模式和eval模式下，行为不一样的）
    model.eval()
    # 收集每个批量的损失
    accs = []
    # 构建一个无梯度的环境（底层不会默认自动创建计算图，节约资源）
    with torch.no_grad():
        for X, y in dataloader:
            y_pred = model(X.T)
            y_pred = y_pred.argmax(dim=-1)
#             print("acc_y_pred",y_pred)
            acc = (y_pred==y).to(dtype=torch.float32).mean().item()
#             print("acc_acc",acc)
            accs.append(acc)
    # 计算每个批量损失的平均值   
#     print(losses,"---------")
    final_acc = round(number=sum(accs) / len(accs),ndigits=5)
    return final_acc

In [51]:
def train():
    
    losses = []
    train_accs = []
    test_accs = []
    for epoch in range(epochs):
        model.train()
        
        for x, y in train_dataloader:
            
            # 正向传播
            y_pred = model(x.T)
            
            print("y_pred",y_pred.shape)

            print("y", y.shape)

            # 计算损失
#             print("x",x)
#             print("y_pred",y_pred)
#             print("y_pred.shape",y_pred.shape)
#             print("y.shape",y.shape)
            
            loss = loss_fn(y_pred, y)
            
            
            
            # 反向传播
            loss.backward()
            
            # 梯度优化
            optimitzer.step()
            
            # 梯度清空
            optimitzer.zero_grad()
            

            
        train_acc = get_acc(dataloader=train_dataloader)
        test_acc = get_acc(dataloader=test_dataloader)

        print(f"当前为第{epoch}轮，训练集准确率为{train_acc},测试集损失为{test_acc}")
        
        train_accs.append(train_acc)
        test_accs.append(test_acc)
        
    return train_accs, test_accs



In [52]:
train_accs, test_accs = train()
# train()

y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torch.Size([4])
y_pred torch.Size([4, 2])
y torc

KeyboardInterrupt: 

In [None]:
from matplotlib import pyplot as plt

In [None]:
plt.plot(train_accs)
plt.plot(test_accs)
plt.grid()