<a href="https://colab.research.google.com/github/zhihong1224/RNN_demo/blob/master/Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment

In [2]:
!gdown --id '1lz0Wtwxsh5YCPdqQ3E3l_nbfJT1N13V8' --output data.zip
!unzip data.zip
!ls

Downloading...
From: https://drive.google.com/uc?id=1lz0Wtwxsh5YCPdqQ3E3l_nbfJT1N13V8
To: /content/data.zip
45.1MB [00:00, 67.7MB/s]
Archive:  data.zip
  inflating: training_label.txt      
  inflating: testing_data.txt        
  inflating: training_nolabel.txt    
data.zip     testing_data.txt	 training_nolabel.txt
sample_data  training_label.txt


In [0]:
import warnings
# Suppress every warning for the rest of the session (note: this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Utils

In [0]:
import torch
import numpy as np
import pandas as pd
from torch import nn,optim
import torch.nn.functional as F

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def loading_training_data(path='training_label.txt'):
  """Read a training file and split every line into space-separated tokens.

  When `path` contains the substring 'training_label', the first token of each
  line is taken as the label, the second token is skipped, and the remaining
  tokens form the sentence.  For any other path each line is returned as a
  plain token list.

  Args:
    path: file to read.

  Returns:
    (x, y) for a labelled file, where x is a list of token lists and y is a
    list of label strings; otherwise just the list of token lists.
  """
  with open(path,'r') as f:
    rows=[raw.strip('\n').split(' ') for raw in f.readlines()]
  # Unlabelled data: every token on the line belongs to the sentence.
  if 'training_label' not in path:
    return rows
  # Labelled data: token 0 is the label, token 1 is skipped, the rest is text.
  x=[row[2:] for row in rows]
  y=[row[0] for row in rows]
  return x,y
def load_testing_data(path='testing_data.txt'):
  """Read the test file and return its sentences as token lists.

  Each line is expected to look like `<id>,<text>`.  Everything after the
  FIRST comma is kept as the sentence, so commas that are part of the text
  survive as tokens instead of being deleted (which previously left
  empty-string tokens behind).

  Args:
    path: file to read.  Defaults to 'testing_data.txt', the name of the
      file extracted from the dataset archive.

  Returns:
    A list with one entry per line; each entry is the list of
    space-separated tokens of that line's text.
  """
  # NOTE(review): every line is treated as a data row; if the file starts with
  # a header row it is tokenized as well - confirm against the data file.
  with open(path,'r') as f:
    lines=f.readlines()
  # partition(',') splits at the first comma only; a line without a comma yields ''.
  texts=[line.strip('\n').partition(',')[2].strip() for line in lines]
  return [sen.split(' ') for sen in texts]
def evaluation(outputs,labels):
  """Count how many thresholded predictions match the labels.

  Predictions >= 0.5 count as class 1, everything else as class 0.  The
  thresholding is done on a fresh tensor, so `outputs` is left untouched
  (previously it was overwritten in place with 0/1 values).

  Args:
    outputs: tensor of scores, same shape as `labels`.
    labels: tensor of 0/1 targets.

  Returns:
    Number of matching positions as a Python int.
  """
  # detach(): the metric must not record anything in the autograd graph.
  preds=(outputs.detach()>=0.5).to(labels.dtype)
  return torch.sum(torch.eq(preds,labels)).item()

In [0]:
# Load the labelled training set: tokenized sentences plus their labels (still strings here).
train_data,train_label=loading_training_data()

In [6]:
# Sanity check: dataset size, token count of one sentence, and one tokenized sample.
print(len(train_data),len(train_data[100]),train_data[2])
print(len(train_label),train_label[100])
# Longest sentence (in tokens) - a guide for choosing the padded sequence length.
print(max(map(len,train_data)))

200000 16 ['i', 'wish', 'i', 'could', 'go', 'and', 'see', 'duffy', 'when', 'she', 'comes', 'to', 'mamaia', 'romania', '.']
200000 0
39


# Data Preprocess

In [0]:
class Preprocess():
  """Builds a vocabulary from tokenized sentences and encodes them as id tensors.

  Attributes set by __init__:
    sentences:   the input sentences with every token lower-cased.
    idx_to_word: list mapping id -> word; '<pad>' is the last entry.
    word_to_idx: dict mapping word -> id (inverse of idx_to_word).
    vocab_size:  len(idx_to_word); every id produced is < vocab_size.
  """
  def __init__(self,sentences):
    # sentences: list of tokenized sentences (each a list of word strings)
    self.sentences=[[word.lower() for word in line] for line in sentences]
    # Build word_to_idx, idx_to_word and vocab_size.
    vocab={word for line in self.sentences for word in line}
    vocab.discard('<pad>')  # the padding token is appended exactly once below
    # Sorting makes the word->id mapping reproducible; the iteration order of a
    # set of strings can differ between interpreter sessions.
    self.idx_to_word=sorted(vocab)
    self.idx_to_word.append('<pad>')
    # '<pad>' receives the last valid id (vocab_size-1).  It was previously
    # re-assigned to vocab_size, which is out of range for an embedding table
    # with vocab_size rows.
    self.word_to_idx={word:idx for idx,word in enumerate(self.idx_to_word)}
    self.vocab_size=len(self.idx_to_word)
  def get_corpus(self,seq_len,device=device):
    """Encode all sentences as fixed-length rows of word ids.

    Sentences longer than `seq_len` are truncated; shorter ones are
    right-padded with the '<pad>' id.

    Args:
      seq_len: number of ids per row.
      device: accepted for interface compatibility; currently unused.

    Returns:
      Integer tensor of shape (number of sentences, seq_len).
    """
    pad_idx=self.word_to_idx['<pad>']
    results=[]
    for line in self.sentences:
      ids=[self.word_to_idx[word] for word in line[:seq_len]]
      ids+=[pad_idx]*(seq_len-len(ids))
      results.append(ids)
    return torch.tensor(results,dtype=torch.long)
  def labels_to_tensor(self,y):
    """Convert a list of labels (anything int() accepts) to an integer tensor."""
    y=[int(label) for label in y]
    return torch.tensor(y)

# Dataset

In [53]:
from torch.utils.data import TensorDataset,DataLoader
# Reload the labelled data and build the vocabulary from it.
train_data,train_y=loading_training_data()
preprocess=Preprocess(train_data)
vocab_size=preprocess.vocab_size
print(len(train_y))

# Encode sentences as fixed-length id rows and convert the labels to a tensor.
seq_len=20
train_all_labels=preprocess.labels_to_tensor(train_y)
train_all_corpus=preprocess.get_corpus(seq_len)
print(train_all_labels.shape)

# The first `portion` of the samples is held out for validation, the rest is for training.
portion=0.2
train_all=len(train_data)
valid_num=int(portion*train_all)

valid_corpus,train_corpus=train_all_corpus[:valid_num,:],train_all_corpus[valid_num:,:]
valid_labels,train_labels=train_all_labels[:valid_num],train_all_labels[valid_num:]

valid_dataset=TensorDataset(valid_corpus,valid_labels)
train_dataset=TensorDataset(train_corpus,train_labels)

# Both loaders shuffle and use four worker processes.
batch_size=400
train_iter=DataLoader(train_dataset,shuffle=True,batch_size=batch_size,num_workers=4)
valid_iter=DataLoader(valid_dataset,shuffle=True,batch_size=batch_size,num_workers=4)

200000
torch.Size([200000])


In [54]:
# Peek at one batch only, to check the shapes the loader yields.
for X,Y in train_iter:
  print(X.shape,Y.shape)
  break

torch.Size([400, 20]) torch.Size([400])


# Model

In [0]:
class Sentiment(nn.Module):
  """Two-layer LSTM binary classifier over sequences of word ids.

  Pipeline: embedding -> 2-layer LSTM (batch_first) -> output at the last
  time step -> dropout(0.5) -> linear layer to one unit -> sigmoid.

  Args:
    vocab_size: number of rows in the embedding table.
    embed_size: embedding dimension.
    hidden_size: LSTM hidden dimension.
  """
  def __init__(self,vocab_size,embed_size,hidden_size):
    super().__init__()
    self.embed=nn.Embedding(vocab_size,embed_size)
    self.lstm=nn.LSTM(embed_size,hidden_size,num_layers=2,batch_first=True)
    self.dropout=nn.Dropout(0.5)
    self.fc=nn.Linear(hidden_size,1)
    self.sigmoid=nn.Sigmoid()
  def forward(self,x):
    """Map word ids of shape (batch_size,seq_len) to probabilities of shape (batch_size,1)."""
    vectors=self.embed(x)          # (batch_size,seq_len,embed_size)
    states,_=self.lstm(vectors)    # (batch_size,seq_len,hidden_size); initial state defaults to zeros
    final_state=states[:,-1]       # output at the last time step: (batch_size,hidden_size)
    score=self.fc(self.dropout(final_state))  # (batch_size,1)
    return self.sigmoid(score)

# Training

In [0]:
def train(model,num_epochs,lr,train_iter,valid_iter,print_every=100):
  """Train `model` with Adam and binary cross-entropy, reporting progress.

  Args:
    model: module returning one probability per sample.
    num_epochs: number of passes over `train_iter`.
    lr: learning rate for Adam.
    train_iter: iterable of (inputs, labels) training batches.
    valid_iter: iterable of (inputs, labels) batches, passed to evaluate_acc
      after each epoch.
    print_every: print running loss/accuracy every this many batches.

  Returns:
    None; progress is printed.
  """
  model=model.to(device)
  criterion=nn.BCELoss()
  optimizer=optim.Adam(model.parameters(),lr=lr)

  for epoch in range(num_epochs):
    model.train()
    train_loss,train_acc,n=0.0,0.0,0
    for i,(X,Y) in enumerate(train_iter):
      X=X.to(device)
      Y=Y.to(device).float()
      # Flatten to 1-D instead of a bare squeeze(): squeeze() turns a batch of
      # one into a 0-d tensor, which no longer matches the target's shape.
      y_pred=model(X).reshape(-1)
      loss=criterion(y_pred,Y)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      train_loss+=loss.item()
      # Detach so the accuracy computation stays out of the autograd graph.
      train_acc+=evaluation(y_pred.detach(),Y)
      n+=Y.shape[0]
      if i%print_every==0:
        print('[ Epoch{}: {}/{}] loss:{:.3f} acc:{:.3f}'.\
            format(epoch+1,i,len(train_iter),loss.item(),train_acc/n))
    valid_acc=evaluate_acc(model,valid_iter)
    print('\nTrain | Loss:{:.5f} Train_Acc:{:.3f} Valid_Acc:{:.3f}'\
          .format(train_loss/len(train_iter),train_acc/n,valid_acc))


In [0]:
def evaluate_acc(model,valid_iter):
  """Compute classification accuracy of `model` over `valid_iter`.

  The model is switched to eval mode for the pass and then restored to
  whatever mode it was in before the call (previously it was always left in
  train mode, re-enabling dropout for callers that had it in eval mode).

  Args:
    model: module returning one probability per sample.
    valid_iter: iterable of (inputs, labels) batches.

  Returns:
    Fraction of correctly classified samples; 0.0 when `valid_iter` yields
    no samples.
  """
  was_training=model.training
  model.eval()
  acc,n=0.0,0
  try:
    with torch.no_grad():
      for X,Y in valid_iter:
        X=X.to(device)
        Y=Y.to(device)
        # Flatten to 1-D; a bare squeeze() would make a batch of one 0-dimensional.
        y_pred=model(X).reshape(-1)
        acc+=evaluation(y_pred,Y)
        n+=Y.shape[0]
  finally:
    # Restore the caller's mode even if evaluation fails part-way.
    model.train(was_training)
  return acc/n if n else 0.0

In [71]:
# Hyperparameters for this run.
num_epochs=10
lr=0.001
# Embedding dimension 1024, LSTM hidden dimension 256.
model=Sentiment(vocab_size,embed_size=1024,hidden_size=256)
train(model,num_epochs=num_epochs,lr=lr,train_iter=train_iter,valid_iter=valid_iter)

[ Epoch1: 0/400] loss:0.692 acc:0.512
[ Epoch1: 100/400] loss:0.494 acc:0.695
[ Epoch1: 200/400] loss:0.512 acc:0.729
[ Epoch1: 300/400] loss:0.451 acc:0.745

Train | Loss:0.49838 Train_Acc:0.755 Valid_Acc:0.790
[ Epoch2: 0/400] loss:0.383 acc:0.848
[ Epoch2: 100/400] loss:0.430 acc:0.827
[ Epoch2: 200/400] loss:0.374 acc:0.826
[ Epoch2: 300/400] loss:0.379 acc:0.826

Train | Loss:0.38884 Train_Acc:0.825 Valid_Acc:0.797
[ Epoch3: 0/400] loss:0.280 acc:0.902
[ Epoch3: 100/400] loss:0.316 acc:0.879
[ Epoch3: 200/400] loss:0.315 acc:0.877
[ Epoch3: 300/400] loss:0.307 acc:0.876

Train | Loss:0.29357 Train_Acc:0.875 Valid_Acc:0.790
[ Epoch4: 0/400] loss:0.211 acc:0.920
[ Epoch4: 100/400] loss:0.154 acc:0.928
[ Epoch4: 200/400] loss:0.195 acc:0.925
[ Epoch4: 300/400] loss:0.185 acc:0.922

Train | Loss:0.19810 Train_Acc:0.921 Valid_Acc:0.782
[ Epoch5: 0/400] loss:0.138 acc:0.960
[ Epoch5: 100/400] loss:0.111 acc:0.962
[ Epoch5: 200/400] loss:0.146 acc:0.959
[ Epoch5: 300/400] loss:0.140 acc: