# 变长输入的FastText实现

In [1]:
import sys
import os
import re

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.util import ngrams

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch as t
from torch import nn
from torch.utils import data
from torch.utils.data import DataLoader

# English stop words held in a set, so the membership test in clean_text is a constant-time lookup.
stop_words = set(stopwords.words('english'))

Using TensorFlow backend.


In [2]:
def clean_text(s):
    """Normalise a raw review and append character trigrams of its words.

    Steps, in order:
      1. replace anything that looks like an HTML tag with a space;
      2. replace every character other than ASCII letters and the apostrophe
         with a space;
      3. lower-case and split on single spaces (runs of spaces therefore yield
         empty-string tokens, which are kept so the output is unchanged);
      4. drop tokens found in the module-level ``stop_words`` set;
      5. append, for each remaining token, all of its character 3-grams.

    Args:
        s: the raw review text.

    Returns:
        A single space-joined string: the filtered words first, followed by
        the character trigrams of each word in word order.
    """
    s = re.sub(r'<[^>]+>',' ',s)
    s = re.sub(r'[^a-zA-Z\']',' ',s)
    words = [w for w in s.lower().split(" ") if w not in stop_words]

    # Collect trigrams with extend(): rebuilding the list with `+` for every
    # word copies everything gathered so far and is quadratic on long reviews.
    trigrams = []
    for word in words:
        trigrams.extend("".join(gram) for gram in ngrams(word, 3))
    return " ".join(words + trigrams)

In [3]:
class Movie(data.Dataset):
    """Index-addressable dataset over features ``x`` and optional labels ``y``.

    When ``train`` is true, items are ``(x[index], y[index])`` pairs;
    otherwise only ``x[index]`` is returned.
    """

    def __init__(self, x, y=None, train=True):
        self.x, self.y, self.train = x, y, train

    def __len__(self):
        """Number of samples, taken from ``x``."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the sample at ``index``, paired with its label in train mode."""
        sample = self.x[index]
        if not self.train:
            return sample
        return sample, self.y[index]

In [4]:
embedding_size = 300  # dimensionality of each token embedding
hidden_size = 32  # not referenced by FastText; left in place for any cell that reads it
max_features = 10000  # vocabulary size (number of rows in the embedding table)


class FastText(nn.Module):
    """FastText-style classifier for variable-length token-id sequences.

    Each sequence is embedded, mean-pooled over its tokens, and passed through
    a two-layer sigmoid MLP that emits one probability per sequence.

    Args:
        vocab_size: number of rows in the embedding table
            (defaults to ``max_features``).
        embed_dim: width of each embedding vector
            (defaults to ``embedding_size``).
    """

    def __init__(self, vocab_size=max_features, embed_dim=embedding_size):
        super(FastText, self).__init__()
        self.embd = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, 128),
            nn.Sigmoid(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Score a batch of sequences.

        Args:
            x: iterable of 1-D integer tensors, one per sample; lengths may
               differ between samples.

        Returns:
            1-D float tensor with one probability per input sequence
            (an empty tensor when ``x`` is empty).
        """
        weight = self.embd.weight
        if len(x) == 0:
            # Allocate from the embedding weight so device and dtype match the model.
            return weight.new_empty(0)

        pooled = []
        for xi in x:
            if xi.numel() == 0:
                # The mean over zero tokens is NaN and would propagate into the
                # loss; represent an empty sequence by the zero vector instead.
                pooled.append(weight.new_zeros(weight.shape[1]))
            else:
                pooled.append(self.embd(xi).mean(0))

        # Stack once and run the MLP on the whole batch, rather than growing
        # the result with t.cat on every sample.
        return self.fc(t.stack(pooled)).squeeze(-1)


In [5]:
def train(model, x, y, criterion, optimizer, save_model_path=r'model.pkl', batch_size=50,epoch=2, display_iter=10, num_threads=8):
    """Train ``model`` on variable-length sequences and save its weights.

    The data is walked in order (no shuffling) in slices of ``batch_size``;
    the final slice may be shorter. The number of batches is derived from
    ``len(x)``, so every sample is used exactly once per epoch.

    Args:
        model: module called with a list of 1-D long tensors, returning one
            score per sequence.
        x: sequence of token-id sequences (lengths may differ).
        y: labels aligned with ``x``; must have the same length.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        save_model_path: destination for ``model.state_dict()``.
        batch_size: number of samples per optimisation step.
        epoch: number of passes over the data.
        display_iter: print the mean loss every ``display_iter`` batches.
        num_threads: value passed to ``torch.set_num_threads``.

    Raises:
        ValueError: if ``x`` and ``y`` differ in length.
    """
    if len(x) != len(y):
        raise ValueError('x and y must have the same length, got %d and %d' % (len(x), len(y)))

    t.set_num_threads(num_threads)
    model.train()

    num_samples = len(x)
    for epoch_iter in range(epoch):
        running_loss = 0.0
        for batch_idx, start in enumerate(range(0, num_samples, batch_size)):
            stop = start + batch_size

            # Build the batch: one long tensor per sequence, float labels for the loss.
            inputs = [t.as_tensor(seq, dtype=t.long) for seq in x[start:stop]]
            labels = t.as_tensor(y[start:stop], dtype=t.float)

            # Reset gradients accumulated by the previous step.
            optimizer.zero_grad()
            # reshape(-1) keeps a 1-D shape even for a batch of one, where
            # squeeze() would yield a 0-dim tensor that no longer matches labels.
            outputs = model(inputs).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()

            # Apply the parameter update.
            optimizer.step()

            # Report the mean loss over the last display_iter batches.
            running_loss += loss.item()
            if batch_idx % display_iter == display_iter-1:
                print('[%d,%5d] loss %.3f' % (epoch_iter+1, batch_idx+1, running_loss / display_iter))
                running_loss = 0.0

    # Only the state_dict (parameter name -> tensor mapping) is saved, not the module object.
    t.save(model.state_dict(), save_model_path)
    print('training finished!!!!!')

In [6]:
def predict(model, x, batch_size=50):
    """Score every sequence in ``x`` with ``model``.

    The input is processed in order in slices of ``batch_size`` (the last
    slice may be shorter), so the result has exactly one entry per element
    of ``x``. Inference runs in eval mode with gradient tracking disabled;
    the model's previous train/eval mode is restored afterwards.

    Args:
        model: module called with a list of 1-D long tensors, returning one
            score per sequence.
        x: sequence of token-id sequences (lengths may differ).
        batch_size: number of sequences per forward pass.

    Returns:
        1-D tensor of scores, detached from autograd (empty when ``x`` is empty).
    """
    was_training = model.training
    model.eval()
    chunks = []
    try:
        # no_grad avoids retaining an autograd graph for every scored batch.
        with t.no_grad():
            for start in range(0, len(x), batch_size):
                inputs = [t.as_tensor(seq, dtype=t.long) for seq in x[start:start + batch_size]]
                # reshape(-1) keeps single-sample batches 1-D so they can be concatenated.
                chunks.append(model(inputs).reshape(-1))
    finally:
        model.train(was_training)

    if not chunks:
        return t.Tensor()
    # Concatenate once at the end instead of re-copying the result per batch.
    return t.cat(chunks)

In [11]:
# Directory holding the TSV files. Defaults to the original local path; set the
# MOVIES_DATA_DIR environment variable to run the notebook on another machine.
data_path = os.environ.get('MOVIES_DATA_DIR', r'E:\kaggle\movies')
train_data_path = os.path.join(data_path,'labeledTrainData.tsv')
test_data_path = os.path.join(data_path,'testData.tsv')

# Both files are tab-separated with a header row.
train_df = pd.read_csv(train_data_path,header=0,sep='\t')
test_df = pd.read_csv(test_data_path,header=0,sep='\t')

# Add a cleaned 'text' column next to the raw 'review' column.
test_df['text'] = test_df.review.apply(clean_text)
train_df['text'] = train_df.review.apply(clean_text)

In [15]:
# Fit the vocabulary on the training text only, capped via num_words=max_features.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_df['text'])

# Encode both splits with the same fitted tokenizer (train first, then test).
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(frame['text']) for frame in (train_df, test_df)
)

train_x, test_x = list_tokenized_train, list_tokenized_test
train_y = np.array(train_df['sentiment'])
print(len(train_x), len(test_x), type(train_y))

25000 25000 <class 'numpy.ndarray'>


In [35]:
# Model, loss and optimizer (Adam with its default hyper-parameters).
net = FastText()
criterion = nn.BCELoss()
optimizer = t.optim.Adam(net.parameters())

# Run configuration.
epoch = 2
batch_size = 50
display_iter = 10
num_threads = 8
model_path = os.path.join(data_path,r'FastText.pkl')

# batch_size is passed explicitly so training and prediction use the same value;
# otherwise train() would silently fall back to its own default.
train(net, train_x, train_y, criterion=criterion, optimizer=optimizer, save_model_path=model_path,
      batch_size=batch_size, epoch=epoch, display_iter=display_iter, num_threads=num_threads)
result = predict(net, test_x, batch_size).detach().numpy()

[1,   10] loss 0.694
[1,   20] loss 0.695
[1,   30] loss 0.688
[1,   40] loss 0.688
[1,   50] loss 0.682
[1,   60] loss 0.682
[1,   70] loss 0.673
[1,   80] loss 0.670
[1,   90] loss 0.661
[1,  100] loss 0.648
[1,  110] loss 0.638
[1,  120] loss 0.628
[1,  130] loss 0.604
[1,  140] loss 0.592
[1,  150] loss 0.583
[1,  160] loss 0.554
[1,  170] loss 0.560
[1,  180] loss 0.511
[1,  190] loss 0.492
[1,  200] loss 0.500
[1,  210] loss 0.471
[1,  220] loss 0.443
[1,  230] loss 0.441
[1,  240] loss 0.435
[1,  250] loss 0.445
[1,  260] loss 0.396
[1,  270] loss 0.411
[1,  280] loss 0.372
[1,  290] loss 0.412
[1,  300] loss 0.396
[1,  310] loss 0.404
[1,  320] loss 0.346
[1,  330] loss 0.389
[1,  340] loss 0.423
[1,  350] loss 0.366
[1,  360] loss 0.327
[1,  370] loss 0.332
[1,  380] loss 0.356
[1,  390] loss 0.353
[1,  400] loss 0.361
[1,  410] loss 0.383
[1,  420] loss 0.364
[1,  430] loss 0.369
[1,  440] loss 0.393
[1,  450] loss 0.383
[1,  460] loss 0.363
[1,  470] loss 0.366
[1,  480] los

In [36]:
# Sanity check: the predictions should form a flat array with one score per test review.
result.shape

(25000,)

In [37]:
# Threshold the sigmoid scores at 0.5 to get 0/1 labels. The builtin int is used
# because the np.int alias was deprecated in NumPy 1.20 and removed in 1.24.
result = (result > 0.5).astype(int)
FastText_df = pd.DataFrame({'id':test_df['id'],'sentiment':result})
FastText_df.to_csv(os.path.join(data_path,'FastText_result_1.csv'),index=False)