In [55]:
import sys
import os
import re

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.util import ngrams

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch as t
from torch import nn
from torch.utils import data
from torch.utils.data import DataLoader

# English stop-word set from NLTK; clean_text uses it to filter tokens.
# Stored as a set so the per-token membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

### 清洗文本&&生成字符级n-gram特征
 - 先过滤掉 HTML 标记，再把英文字母和单引号以外的字符替换为空格，然后转为小写并去掉停用词
 - 用 nltk 的 ngrams 接口对每个词生成字符级 3-gram 特征，并追加到文本末尾

In [56]:
def clean_text(s):
    """Clean a raw review and append character-level 3-grams of its words.

    Steps: replace HTML tags with a space, replace every character other
    than ASCII letters and the apostrophe with a space, lower-case, split on
    whitespace and drop tokens that are in the module-level ``stop_words``
    set. For every remaining word its character 3-grams (built with
    ``nltk.util.ngrams``) are appended after the words; words shorter than
    three characters contribute no n-grams.

    Args:
        s: Raw review text (str).

    Returns:
        One space-joined string: the kept words followed by the 3-grams of
        each word, in word order.
    """
    s = re.sub(r'<[^>]+>', ' ', s)
    s = re.sub(r'[^a-zA-Z\']', ' ', s)
    # split() without an argument collapses runs of whitespace, so the
    # substitutions above do not leave empty-string "words" in the list
    # (split(" ") produced one for every doubled space).
    words = [w for w in s.lower().split() if w not in stop_words]
    tokens = list(words)
    for word in words:
        # Extend in place instead of rebuilding the whole list per word.
        tokens.extend(''.join(gram) for gram in ngrams(word, 3))
    return " ".join(tokens)

### 构造数据集
 - 根据train和predict分别生成含有label和不含label的dataset

In [57]:
class Movie(data.Dataset):
    """Dataset over encoded reviews, with or without labels.

    Args:
        x: Indexable collection of samples (one entry per review).
        y: Labels aligned with ``x``; required when ``train`` is True.
        train: When True, items are ``(x[i], y[i])`` pairs; when False,
            items are ``x[i]`` only.

    Raises:
        ValueError: If ``train`` is True but no labels were supplied.
    """

    def __init__(self, x, y=None, train=True):
        if train and y is None:
            # Fail at construction rather than with an opaque TypeError on
            # the first __getitem__ call inside a DataLoader.
            raise ValueError('y must be provided when train=True')
        self.x = self._positional(x)
        self.y = self._positional(y)
        self.train = train

    @staticmethod
    def _positional(values):
        """Return pandas objects as arrays so integer indexing is positional."""
        # pandas Series index by *label* with [], so a Series whose index is
        # not 0..n-1 (e.g. after filtering) would raise or return the wrong
        # row for the integer positions a DataLoader passes in.
        if hasattr(values, 'to_numpy'):
            return values.to_numpy()
        return values

    def __getitem__(self, index):
        if self.train:
            return self.x[index], self.y[index]
        return self.x[index]

    def __len__(self):
        return len(self.x)

### 写FastText的结构
 - embedding层
 - 对词向量求均值
 - 全连接

In [58]:
embedding_size = 300
# NOTE(review): hidden_size is not referenced by the model below, whose hidden
# layer width defaults to 128. Kept unchanged in case other cells read it.
hidden_size = 32
class FastText(nn.Module):
    """FastText-style classifier: embed tokens, average them, classify.

    Args:
        vocab_size: Number of rows in the embedding table. When None, the
            notebook-level ``max_features`` is used, so that global must be
            defined before the model is instantiated.
        embed_dim: Size of each embedding vector (defaults to
            ``embedding_size``).
        hidden_dim: Width of the hidden fully connected layer.

    The forward pass maps a ``(batch, seq_len)`` tensor of integer token ids
    to a ``(batch, 1)`` tensor of sigmoid outputs.
    """
    def __init__(self, vocab_size=None, embed_dim=embedding_size, hidden_dim=128):
        super().__init__()
        if vocab_size is None:
            vocab_size = max_features
        self.embd = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        embedded = self.embd(x)
        # Average over the sequence dimension; every position in the input
        # contributes equally to the mean.
        pooled = embedded.mean(dim=1)
        return self.fc(pooled)

### 训练函数

In [59]:
def train(model, trainloader, criterion, optimizer, save_model_path=r'model.pkl', epoch=2, display_iter=10, num_threads=8):
    """Train ``model`` and save its ``state_dict`` when training finishes.

    Args:
        model: Module called on a batch of integer inputs.
        trainloader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.
        save_model_path: Destination passed to ``torch.save``.
        epoch: Number of passes over ``trainloader``.
        display_iter: Print the mean loss every ``display_iter`` batches.
        num_threads: Value passed to ``torch.set_num_threads``.

    Returns:
        None. The parameters are written to ``save_model_path`` as a
        state_dict (a mapping from parameter name to tensor).
    """
    t.set_num_threads(num_threads)
    model.train()

    for epoch_iter in range(epoch):
        running_loss = 0.0
        # `batch` rather than `data`, so the imported torch.utils.data
        # module is not shadowed inside this function.
        for i, batch in enumerate(trainloader):
            inputs, labels = batch
            inputs = inputs.long()
            labels = labels.float()

            # Reset gradients accumulated by the previous step.
            optimizer.zero_grad()
            outputs = model(inputs)
            # Drop only the trailing feature dimension. A bare squeeze()
            # would also collapse a batch of size 1 to a 0-dim tensor,
            # which no longer matches the shape of `labels`.
            if outputs.dim() > labels.dim():
                outputs = outputs.squeeze(-1)
            loss = criterion(outputs, labels)
            loss.backward()

            # Apply the parameter update.
            optimizer.step()

            running_loss += loss.item()
            if i % display_iter == display_iter - 1:
                # Average over the window actually accumulated, not a
                # hard-coded 10 batches.
                print('[%d,%5d] loss %.3f' % (epoch_iter+1, i+1, running_loss / display_iter))
                running_loss = 0.0
    t.save(model.state_dict(), save_model_path)
    print('training finished!!!!!')

### 分批预测函数

In [60]:
def predict(model, testloader):
    """Run ``model`` over every batch of ``testloader`` and concatenate outputs.

    Inference runs in eval mode with gradient tracking disabled; the model's
    previous train/eval mode is restored afterwards.

    Args:
        model: Module called on a batch of integer inputs.
        testloader: Iterable yielding input batches (no labels).

    Returns:
        A tensor with the per-sample outputs of all batches concatenated
        along dim 0 (empty if ``testloader`` yields nothing).
    """
    was_training = model.training
    model.eval()
    chunks = []
    try:
        # No autograd graph is needed for inference; building one for every
        # batch only holds on to memory.
        with t.no_grad():
            for batch in testloader:
                out = model(batch.long())
                # Drop only the trailing feature dimension so a batch of
                # size 1 stays 1-D and can still be concatenated.
                if out.dim() > 1:
                    out = out.squeeze(-1)
                chunks.append(out)
    finally:
        model.train(was_training)
    if not chunks:
        return t.Tensor()
    # Concatenate once instead of growing the result batch by batch.
    return t.cat(chunks)

### 读入数据

In [49]:
# Directory holding the Kaggle TSV files. Set the MOVIES_DATA_DIR environment
# variable to point elsewhere; the original local path remains the fallback.
data_path = os.environ.get('MOVIES_DATA_DIR', r'E:\kaggle\movies')
train_data_path = os.path.join(data_path,'labeledTrainData.tsv')
test_data_path = os.path.join(data_path,'testData.tsv')
# Both files are tab-separated with a header row.
train_df = pd.read_csv(train_data_path,header=0,sep='\t')
test_df = pd.read_csv(test_data_path,header=0,sep='\t')
# Add a cleaned 'text' column (words plus character 3-grams) next to the raw review.
test_df['text'] = test_df.review.apply(clean_text)
train_df['text'] = train_df.review.apply(clean_text)

### 将词转换为序号值，并把每个文本填充/截断为等长

In [50]:
# Encode the cleaned text as fixed-length integer sequences.
# max_features is also read by FastText as the embedding table size.
max_features = 10000  # vocabulary size cap passed to the Tokenizer
max_len = 640   # length of every encoded text after padding/truncation
tokenizer = Tokenizer(num_words = max_features)
# The vocabulary is fitted on the training text only and reused for the test text.
tokenizer.fit_on_texts(train_df['text'])
list_tokenized_train = tokenizer.texts_to_sequences(train_df['text'])
list_tokenized_test = tokenizer.texts_to_sequences(test_df['text'])
# NOTE(review): pad_sequences is called with its default padding/truncating
# sides. clean_text appends the n-grams after the words, so confirm which end
# of an over-long review is meant to be kept.
train_x = pad_sequences(list_tokenized_train, maxlen=max_len)
test_x = pad_sequences(list_tokenized_test, maxlen=max_len)
train_y = train_df['sentiment']
print(train_x.shape,test_x.shape,train_y.shape)

(25000, 640) (25000, 640) (25000,)


### 定义损失函数，优化器，超参数，train和predict

In [51]:
# Seed torch so weight initialisation and DataLoader shuffling are repeatable
# across kernel restarts.
seed = 42
t.manual_seed(seed)

train_dataset = Movie(train_x, train_y)
test_dataset = Movie(test_x,train=False)

net = FastText()
criterion = nn.BCELoss()
optimizer = t.optim.Adam(net.parameters())
epoch = 2
batch_size = 50
display_iter = 10
num_threads = 8
model_path = os.path.join(data_path,r'FastText.pkl')

# Shuffle only the training data; test order must match test_df['id'] for the submission.
trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=False)
testloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, drop_last=False)

train(model=net, trainloader=trainloader, criterion=criterion, optimizer=optimizer,
      save_model_path=model_path, epoch=epoch, display_iter=display_iter, num_threads=num_threads)
# detach() is the supported way to drop autograd history before converting to NumPy.
result = predict(net, testloader).detach().numpy()

[1,   10] loss 0.695
[1,   20] loss 0.707
[1,   30] loss 0.694
[1,   40] loss 0.695
[1,   50] loss 0.694
[1,   60] loss 0.691
[1,   70] loss 0.693
[1,   80] loss 0.690
[1,   90] loss 0.693
[1,  100] loss 0.694
[1,  110] loss 0.689
[1,  120] loss 0.685
[1,  130] loss 0.682
[1,  140] loss 0.679
[1,  150] loss 0.682
[1,  160] loss 0.675
[1,  170] loss 0.673
[1,  180] loss 0.673
[1,  190] loss 0.668
[1,  200] loss 0.670
[1,  210] loss 0.660
[1,  220] loss 0.655
[1,  230] loss 0.644
[1,  240] loss 0.643
[1,  250] loss 0.632
[1,  260] loss 0.625
[1,  270] loss 0.601
[1,  280] loss 0.606
[1,  290] loss 0.596
[1,  300] loss 0.586
[1,  310] loss 0.557
[1,  320] loss 0.558
[1,  330] loss 0.539
[1,  340] loss 0.548
[1,  350] loss 0.520
[1,  360] loss 0.506
[1,  370] loss 0.533
[1,  380] loss 0.489
[1,  390] loss 0.497
[1,  400] loss 0.487
[1,  410] loss 0.481
[1,  420] loss 0.476
[1,  430] loss 0.457
[1,  440] loss 0.456
[1,  450] loss 0.427
[1,  460] loss 0.399
[1,  470] loss 0.426
[1,  480] los

### 输出到csv

In [53]:
# Threshold the predicted values at 0.5 to obtain 0/1 sentiment labels.
# The builtin int replaces np.int, an alias deprecated in NumPy 1.20 and
# removed in 1.24 (where it raises AttributeError).
result = np.array(result>0.5, dtype=int)
FastText_df = pd.DataFrame({'id':test_df['id'],'sentiment':result})
FastText_df.to_csv(os.path.join(data_path,'FastText_result_1.csv'),index=False)