# TextCNN

In [1]:
# 引入对应的包
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
import numpy as np 
import gensim.models.word2vec as w2v
from sklearn.model_selection import train_test_split
import pandas as pd 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from numpy import array
from numpy import asarray
from numpy import zeros

Using TensorFlow backend.


In [2]:
# TextCNN network definition
class textCNN(nn.Module):
    """CNN sentence classifier that treats an embedded sentence as a 1-channel image.

    The input token ids are embedded, reshaped to ``(batch, 1, max_len, dim)``
    and passed through four ``Conv2d(5x5, padding=2) -> BatchNorm2d -> ReLU ->
    MaxPool2d(2)`` stages, then flattened into a linear classifier.

    Args:
        args: dict with the keys
            ``'vocb_size'``        -- number of rows of the embedding table,
            ``'dim'``              -- embedding width,
            ``'n_class'``          -- number of output classes,
            ``'max_len'``          -- padded sentence length,
            ``'embedding_matrix'`` -- tensor used as the initial embedding weight.

    Raises:
        ValueError: if ``max_len`` or ``dim`` is smaller than 16, because the
            four 2x poolings would shrink that axis to zero.
    """

    def __init__(self,args):
        super(textCNN,self).__init__()
        vocb_size = args['vocb_size']
        n_class = args['n_class']
        # Kept on the instance so forward() does not depend on module globals.
        self.dim = args['dim']
        self.max_len = args['max_len']

        embedding_matrix = args['embedding_matrix']
        # Initialise the embedding table with the pre-trained word vectors.
        self.embeding = nn.Embedding(vocb_size,self.dim,_weight=embedding_matrix)
        self.conv1 = self._conv_block(1, 16)
        self.conv2 = self._conv_block(16, 32)
        self.conv3 = self._conv_block(32, 64)
        self.conv4 = self._conv_block(64, 128)

        # The convolutions preserve the spatial size (kernel 5, padding 2,
        # stride 1); each MaxPool2d(2) floors-halves both axes, so four of
        # them divide each axis by 16. For dim=100 and 32 <= max_len < 48
        # this evaluates to the previously hard-coded 1536.
        pooled_len = self.max_len // 16
        pooled_dim = self.dim // 16
        if pooled_len == 0 or pooled_dim == 0:
            raise ValueError(
                'max_len and dim must both be >= 16, got max_len={} dim={}'.format(
                    self.max_len, self.dim))
        self.out = nn.Linear(128 * pooled_len * pooled_dim, n_class)

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build one conv -> batch-norm -> ReLU -> 2x max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels,out_channels=out_channels,
                      kernel_size=5,stride=1,padding=2),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

    def forward(self,x):
        """Return class logits of shape ``(batch, n_class)`` for token ids ``x``.

        ``x`` must hold ``max_len`` ids per sample; the ``view`` below fails
        otherwise.
        """
        x = self.embeding(x)
        # Add the single input channel expected by Conv2d.
        x = x.view(x.size(0),1,self.max_len,self.dim)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        # Flatten everything except the batch axis for the classifier.
        x = x.view(x.size(0),-1)
        output = self.out(x)
        return output

In [3]:
# Build the vocabulary
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
import pandas as pd
def clean_doc(doc):
    """Split *doc* on whitespace and return its cleaned tokens as a list.

    Each token has ASCII punctuation removed; a token is kept only if what
    remains is purely alphabetic and is not in NLTK's English stop-word list.
    Case is left untouched, so the stop-word comparison is case-sensitive.
    """
    strip_punct = str.maketrans('', '', punctuation)
    english_stops = set(stopwords.words('english'))
    cleaned = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        if word.isalpha() and word not in english_stops:
            cleaned.append(word)
    return cleaned

def add_doc_to_vocab(doc,vocab):
    """Clean *doc* with ``clean_doc`` and add its tokens to *vocab* in place.

    *vocab* only needs an ``update`` method accepting an iterable of tokens
    (the caller passes a ``Counter``). Returns ``None``.
    """
    vocab.update(clean_doc(doc))

# Read the training sentences and flatten them into a single text blob,
# one sentence per line. str.join builds the string in one pass instead of
# the quadratic repeated `+=` concatenation.
file_train = pd.read_csv('C:/Users/WYX/Desktop/test/data/train.tsv', header=0, delimiter="\t", quoting=3)
text_train = "".join(sentence + '\n' for sentence in file_train["Sentence"])

# Same for the test sentences.
file_test = pd.read_csv('C:/Users/WYX/Desktop/test/data/test.tsv', header=0, delimiter="\t", quoting=3)
text_test = "".join(sentence + '\n' for sentence in file_test["Sentence"])


# Count token occurrences over train and test text together.
vocab = Counter()
add_doc_to_vocab(text_train,vocab)
add_doc_to_vocab(text_test,vocab)
print(len(vocab))
print(vocab.most_common(50))

# Keep tokens seen at least `min_occurane` times; with 0 every token is kept.
min_occurane = 0
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))


def save_list(lines, filename):
    """Write the strings in *lines* to *filename*, joined by newlines.

    No trailing newline is written. The file is opened in text mode with the
    platform default encoding and is overwritten if it exists.

    Args:
        lines: iterable of strings.
        filename: path of the file to write.
    """
    data = '\n'.join(lines)
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)
# Persist the kept vocabulary tokens, one per line.
save_list(tokens, 'C:/Users/WYX/Desktop/test/vocab.txt')

33167
[('I', 3404), ('tomorrow', 1874), ('u', 1016), ('going', 994), ('The', 961), ('may', 932), ('night', 819), ('go', 811), ('see', 718), ('time', 717), ('day', 704), ('Sunday', 673), ('Im', 659), ('amp', 652), ('Saturday', 637), ('like', 631), ('get', 619), ('tonight', 567), ('one', 543), ('Friday', 536), ('today', 477), ('game', 465), ('got', 439), ('back', 411), ('Day', 396), ('last', 393), ('think', 383), ('good', 378), ('want', 368), ('come', 368), ('know', 368), ('Monday', 361), ('still', 358), ('new', 346), ('Thursday', 336), ('make', 323), ('us', 322), ('show', 307), ('So', 295), ('RT', 294), ('If', 284), ('next', 281), ('first', 280), ('We', 276), ('dont', 270), ('love', 268), ('wait', 264), ('watch', 254), ('meet', 252), ('morning', 251)]
33167


In [4]:
# Train Word2Vec word embeddings on the cleaned corpus
from string import punctuation
from os import listdir
from gensim.models import Word2Vec
import pandas as pd

def load_doc(filename):
    """Return the entire contents of the text file *filename* as one string.

    The file is read in text mode with the platform default encoding.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

def doc_to_clean_lines(doc, vocab):
    """Turn *doc* into a list of token lists, one per line of text.

    Each line is split on whitespace, ASCII punctuation is stripped from every
    token, and only tokens contained in *vocab* are kept. A line with no
    surviving tokens contributes an empty list.

    Args:
        doc: text, possibly spanning several lines.
        vocab: container supporting ``in`` (a set gives O(1) lookups).

    Returns:
        list[list[str]] with one entry per line of *doc*.
    """
    # The translation table does not depend on the line; build it once.
    table = str.maketrans('', '', punctuation)
    clean_lines = []
    for line in doc.splitlines():
        stripped = (w.translate(table) for w in line.split())
        clean_lines.append([w for w in stripped if w in vocab])
    return clean_lines



# Reload the saved vocabulary as a set for fast membership tests.
vocab_filename = 'C:/Users/WYX/Desktop/test/vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# Collect the cleaned token lists of every training sentence ...
file_train = pd.read_csv('C:/Users/WYX/Desktop/test/data/train.tsv', header=0, delimiter="\t", quoting=3)
doc = list()
for sentence in file_train["Sentence"]:
    doc.extend(doc_to_clean_lines(sentence, vocab))

# ... followed by every test sentence.
file_test = pd.read_csv('C:/Users/WYX/Desktop/test/data/test.tsv', header=0, delimiter="\t", quoting=3)
for sentence in file_test["Sentence"]:
    doc.extend(doc_to_clean_lines(sentence, vocab))
sentences = doc
print('Total training sentences: %d' % len(sentences))


# Train 100-dimensional Word2Vec vectors; min_count=1 keeps every token.
# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings; gensim 4
# renamed them to `vector_size=` and `wv.key_to_index` -- confirm the pinned
# gensim version before upgrading.
model = Word2Vec(sentences, size=100, window=5, workers=8, min_count=1)
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

# Export the vectors in the plain-text word2vec format.
filename = 'C:/Users/WYX/Desktop/test/embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

Total training sentences: 14876
Vocabulary size: 33167


In [18]:
# Read the training and test sets and preprocess the text

# Text-cleaning function
def clean_doc(raw_review):
    """Return *raw_review* as a space-joined string of lower-case content words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case, split on whitespace and drop NLTK's English stop words.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so the result can
    # differ between machines.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    letters_only = re.sub(r"[^a-zA-Z]", " ", review_text)
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in letters_only.lower().split() if w not in stops]
    return " ".join(meaningful_words)

# Load the training set and clean every sentence.
file_train = pd.read_csv('C:/Users/WYX/Desktop/test/data/train.tsv', header=0, delimiter="\t", quoting=3)
train_docs = [clean_doc(sentence) for sentence in file_train["Sentence"]]

# Map the training labels to class ids.
y = file_train["Sentiment"]
NumberofSize = y.size
label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}
# An unexpected label used to be skipped silently, which left ytrain shorter
# than (and shifted against) Xtrain; fail loudly instead.
unknown_labels = set(y) - set(label_to_id)
if unknown_labels:
    raise ValueError('unexpected Sentiment labels: {}'.format(sorted(map(str, unknown_labels))))
ytrain = np.array([label_to_id[label] for label in y])

# Load the test set and clean every sentence.
file_test = pd.read_csv('C:/Users/WYX/Desktop/test/data/test.tsv', header=0, delimiter="\t", quoting=3)
test_docs = [clean_doc(sentence) for sentence in file_test["Sentence"]]
doc = test_docs

# Load the vocabulary file.
vocab_filename = 'C:/Users/WYX/Desktop/test/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# Longest cleaned training sentence, in tokens; used as the padded length.
max_len = max([len(s.split()) for s in train_docs])

# Convert the texts to padded sequences of word indices.
tokenizer = Tokenizer()
# Fit on BOTH corpora before encoding anything. fit_on_texts rebuilds
# word_index (ranked by frequency) on every call, so fitting on the test
# docs after the training set was already encoded renumbered the words and
# left Xtrain inconsistent with Xtest and with the embedding matrix built
# from tokenizer.word_index. The final word_index is the same as before.
tokenizer.fit_on_texts(train_docs)
tokenizer.fit_on_texts(test_docs)
# Training set
encoded_xtrain = tokenizer.texts_to_sequences(train_docs)
Xtrain = pad_sequences(encoded_xtrain,maxlen=max_len,padding='post')
# Test set
encoded_xtest = tokenizer.texts_to_sequences(test_docs)
Xtest = pad_sequences(encoded_xtest,maxlen=max_len,padding='post')


In [None]:
# Use the self-trained embedding
# Loader for a text-format word2vec file
def load_embedding(filename):
    """Load a text-format word2vec file into a ``{word: vector}`` dict.

    The first line of the file (the word2vec header) is skipped. Every other
    non-blank line is split on whitespace: the first field is the word, the
    remaining fields become a ``float32`` numpy array.

    Args:
        filename: path of the embedding text file.

    Returns:
        dict mapping each word (str) to its vector.
    """
    embedding = dict()
    # The word2vec text export is UTF-8; read it as such rather than with the
    # platform default encoding. Streaming avoids holding all lines at once.
    with open(filename, 'r', encoding='utf-8') as file:
        next(file, None)  # skip the header line
        for line in file:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline) -- nothing to parse.
                continue
            embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding
# Build the embedding-layer weight matrix
def get_weight_matrix(embedding, vocab, dim=None):
    """Build the embedding weight matrix for *vocab* from *embedding*.

    Args:
        embedding: dict mapping word -> vector.
        vocab: dict mapping word -> row index; indices are expected to lie in
            ``1..len(vocab)``, row 0 is left as zeros.
        dim: vector width. When omitted it is taken from the first vector in
            *embedding*, falling back to 100 if *embedding* is empty.

    Returns:
        numpy array of shape ``(len(vocab) + 1, dim)``; rows of words missing
        from *embedding* stay all-zero.
    """
    if dim is None:
        dim = len(next(iter(embedding.values()))) if embedding else 100
    vocab_size = len(vocab) + 1
    weight_matrix = zeros((vocab_size, dim))
    for word, i in vocab.items():
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

# Build the embedding matrix: load the saved word2vec text file and lay its
# vectors out by the tokenizer's word indices.
# NOTE(review): Keras' Tokenizer lower-cases by default, while the Word2Vec
# corpus above keeps the original case -- words that only occur capitalised
# would presumably get no vector here; verify the hit rate.
raw_embedding = load_embedding('C:/Users/WYX/Desktop/test/embedding_word2vec.txt')
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)

In [None]:
# Use the pretrained GloVe embedding (glove.840B.300d.txt, published by Stanford NLP)

def load_embedding(filename, dim=None):
    """Load a GloVe-style text file (no header line) into ``{word: vector}``.

    Each non-blank line is ``<word> <v1> ... <vdim>`` separated by single
    spaces. The last *dim* fields are the vector; everything before them is
    the word, so a word that itself contains spaces still parses.

    Args:
        filename: path of the embedding text file.
        dim: vector width. When omitted it is inferred from the first
            non-blank line as ``field_count - 1``.

    Returns:
        dict mapping each word (str) to a ``float32`` numpy array.

    Raises:
        ValueError: if the inferred width is smaller than 1.
    """
    embedding = dict()
    # Text mode yields str keys that match the tokenizer's str words; the
    # previous binary mode produced bytes keys that never matched. Streaming
    # avoids loading the whole multi-gigabyte file into memory.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue
            # Split on plain spaces only, so unusual whitespace characters
            # inside a word are not treated as separators.
            fields = line.rstrip().split(' ')
            if dim is None:
                dim = len(fields) - 1
                if dim < 1:
                    raise ValueError('cannot infer vector width from line: {!r}'.format(line))
            if len(fields) <= dim:
                # Too short to hold a word plus a full vector; skip it.
                continue
            word = ' '.join(fields[:-dim])
            embedding[word] = asarray(fields[-dim:], dtype='float32')
    return embedding

def get_weight_matrix(embedding, vocab, dim=None):
    """Build the embedding weight matrix for *vocab* from *embedding*.

    Args:
        embedding: dict mapping word -> vector.
        vocab: dict mapping word -> row index; indices are expected to lie in
            ``1..len(vocab)``, row 0 is left as zeros.
        dim: vector width. When omitted it is taken from the first vector in
            *embedding*, falling back to 100 if *embedding* is empty.

    Returns:
        numpy array of shape ``(len(vocab) + 1, dim)``; rows of words missing
        from *embedding* stay all-zero.
    """
    if dim is None:
        dim = len(next(iter(embedding.values()))) if embedding else 100
    vocab_size = len(vocab) + 1
    weight_matrix = zeros((vocab_size, dim))
    for word, i in vocab.items():
        # No default here: a default of 0 made the None check always pass.
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

# Load the GloVe vectors and lay them out by the tokenizer's word indices.
raw_embedding = load_embedding('C:/Users/WYX/Desktop/test/glove.840B.300d.txt')
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)

In [19]:
# Sanity check: print the (rows, width) shape of the embedding weight matrix.
print(embedding_vectors.shape)

(28777, 100)


In [20]:
# TextCNN hyper-parameters
vocb_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
n_class = 3
nb_words = vocb_size
# Take the embedding width from the matrix that was actually loaded rather
# than repeating the literal 100 in two places.
word_dim = embedding_vectors.shape[1]
args = {
    'vocb_size': vocb_size,
    'max_len': max_len,
    'n_class': n_class,
    'dim': word_dim,
    'embedding_matrix': torch.Tensor(embedding_vectors),
}
EPOCH = 10
test_epoch_size = 300
epoch_size = 1000  # used by the training loop as the mini-batch size
# NOTE(review): 1e-6 is far below Adam's default of 1e-3 and the logged loss
# barely moves -- confirm this learning rate is intentional.
LR = 0.000001
cnn = textCNN(args)
optimizer = torch.optim.Adam(cnn.parameters(),lr=LR)
loss_function = nn.CrossEntropyLoss()

In [22]:
# Mini-batch training loop. `epoch_size` is the batch size; a trailing
# partial batch is dropped by the integer division.
n_batches = len(Xtrain) // epoch_size
for epoch in range(EPOCH):
    for i in range(n_batches):
        start = i * epoch_size
        stop = start + epoch_size
        # Plain tensors carry autograd state themselves; the deprecated
        # Variable wrapper is not needed.
        b_x = torch.LongTensor(Xtrain[start:stop])
        b_y = torch.LongTensor(ytrain[start:stop])
        # The former `+ 0.0001` on the logits is dropped: adding the same
        # constant to every logit changes neither the softmax cross-entropy
        # nor the argmax.
        output = cnn(b_x)
        loss = loss_function(output,b_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print('now loss is: {}'.format(loss.item()))
        # Batch accuracy: fraction of samples whose top logit is the label.
        pred_y = output.argmax(dim=1)
        acc = (pred_y == b_y).sum().item()
        accuracy = acc / (b_y.size(0))
        print('now acc is: {}'.format(accuracy))

now loss is: 1.1255340576171875
now acc is: 0.397
now loss is: 1.1234824657440186
now acc is: 0.391
now loss is: 1.0989079475402832
now acc is: 0.441
now loss is: 1.1156374216079712
now acc is: 0.433
now loss is: 1.119517207145691
now acc is: 0.393
now loss is: 1.1422700881958008
now acc is: 0.355
now loss is: 1.1401242017745972
now acc is: 0.355


KeyboardInterrupt: 

In [None]:
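# Training was interrupted manually: the local machine is too slow for a full run, so it was stopped once the CNN was confirmed to run correctly.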