In [7]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import random
import time
import multiprocessing as mp
import numpy as np

import mxnet as mx
from mxnet import nd, gluon, autograd

import gluonnlp as nlp

# Seed every random number generator the notebook relies on from a single
# constant, so changing the seed is a one-line edit.
SEED = 123
random.seed(SEED)     # Python standard-library RNG
np.random.seed(SEED)  # NumPy global RNG
mx.random.seed(SEED)  # MXNet RNG

In [2]:
class MeanPoolingLayer(gluon.HybridBlock):
    """Length-aware mean pooling over the time axis of encoder features."""

    def __init__(self, prefix=None, params=None):
        super().__init__(prefix=prefix, params=params)

    def hybrid_forward(self, F, data, valid_length):  # pylint: disable=arguments-differ
        """Average ``data`` over axis 0, counting only the valid steps.

        Parameters
        ----------
        F : module
            The ``mxnet.nd`` or ``mxnet.sym`` namespace supplied by Gluon.
        data :
            Encoder output, laid out as (T, N, C).
        valid_length :
            Number of valid time steps for each sequence in the batch.

        Returns
        -------
        The per-sequence sum over axis 0 divided by ``valid_length``.
        """
        # Mask the time steps that lie beyond each sequence's valid length.
        masked = F.SequenceMask(data,
                                sequence_length=valid_length,
                                use_sequence_length=True)
        # Collapse the time axis, then normalise by the true sequence length.
        summed_over_time = F.sum(masked, axis=0)
        length_column = F.expand_dims(valid_length, axis=1)
        return F.broadcast_div(summed_over_time, length_column)


class SentimentNet(gluon.HybridBlock):
    """Sentiment classifier: embedding -> encoder -> mean pooling -> dense head.

    ``embedding`` and ``encoder`` start out as ``None`` and must be assigned
    by the caller before the network is used.
    """

    def __init__(self, dropout, prefix=None, params=None):
        super().__init__(prefix=prefix, params=params)
        with self.name_scope():
            # Placeholders, filled in later from a language model.
            self.embedding = None
            self.encoder = None
            self.agg_layer = MeanPoolingLayer()
            # Output head: dropout followed by a single-unit dense layer.
            self.output = gluon.nn.HybridSequential()
            with self.output.name_scope():
                self.output.add(gluon.nn.Dropout(dropout))
                self.output.add(gluon.nn.Dense(1, flatten=False))

    def hybrid_forward(self, F, data, valid_length):  # pylint: disable=arguments-differ
        """Run the full pipeline and return the output of the dense head."""
        embedded = self.embedding(data)
        encoded = self.encoder(embedded)  # Shape(T, N, C)
        pooled = self.agg_layer(encoded, valid_length)
        return self.output(pooled)

In [3]:
# --- Model ---
# `dropout` is passed both to the language model loader and to SentimentNet.
dropout = 0
language_model_name = 'standard_lstm_lm_200'
pretrained = True

# --- Training (not used in the cells visible here; presumably read by a
# later training loop) ---
learning_rate = 0.005
batch_size = 32
epochs = 1
grad_clip = None
log_interval = 100

# --- Bucketing (not used in the cells visible here) ---
bucket_num = 10
bucket_ratio = 0.2

# --- Device on which the model is loaded and initialized ---
context = mx.cpu()

In [4]:
# Load the language model named by `language_model_name` for the 'wikitext-2'
# dataset, together with its vocabulary. `pretrained`, `context` and `dropout`
# come from the configuration cell.
lm_model, vocab = nlp.model.get_model(name=language_model_name,
                                      dataset_name='wikitext-2',
                                      pretrained=pretrained,
                                      ctx=context,
                                      dropout=dropout)

In [5]:
# Build the classifier and reuse the language model's embedding and encoder
# blocks in place of the `None` placeholders set in SentimentNet.__init__.
net = SentimentNet(dropout=dropout)
net.embedding = lm_model.embedding
net.encoder = lm_model.encoder
net.hybridize()
# Only the output head is initialized here; the embedding and encoder are the
# blocks taken from `lm_model`.
net.output.initialize(mx.init.Xavier(), ctx=context)
print(net)

SentimentNet(
  (embedding): HybridSequential(
    (0): Embedding(33278 -> 200, float32)
  )
  (encoder): LSTM(200 -> 200, TNC, num_layers=2)
  (agg_layer): MeanPoolingLayer(
  
  )
  (output): HybridSequential(
    (0): Dropout(p = 0, axes=())
    (1): Dense(None -> 1, linear)
  )
)


In [13]:
# The tokenizer takes as input a string and outputs a list of tokens.
# NOTE: this needs the spaCy "en" model to be installed. The recorded run of
# this cell failed with an OSError whose message suggests running
# `python -m spacy download en` first.
tokenizer = nlp.data.SpacyTokenizer('en')

# `length_clip` takes as input a list and outputs a list with maximum length 500.
length_clip = nlp.data.ClipSequence(500)

# Helper function to preprocess a single data point
def preprocess(x, threshold=0.5):
    """Convert one raw ``(text, label)`` sample into ``(token_ids, label)``.

    Parameters
    ----------
    x : tuple
        A ``(text, label)`` pair: the raw text and its numeric label.
    threshold : float, optional
        Labels strictly greater than this value become 1, all others 0.
        The default of 0.5 suits 0/1 indicator targets such as the
        ``target4`` column this notebook feeds in. The previous hard-coded
        cut-off of 5 mapped every such label to 0.

    Returns
    -------
    tuple
        ``(token_ids, label)`` where ``label`` is the int 0 or 1.
    """
    text, label = x
    label = int(label > threshold)
    # Tokenize, clip to the maximum length, then look the tokens up in the
    # vocabulary: a token index or a list of token indices is returned.
    token_ids = vocab[length_clip(tokenizer(text))]
    return token_ids, label

# Helper function for getting the length
def get_length(x):
    """Return the length of the first element of sample ``x`` as a float."""
    first_field = x[0]
    item_count = len(first_field)
    return float(item_count)

OSError: SpaCy Model for the specified language="en" has not been downloaded. You need to check the installation guide in https://spacy.io/usage/models. Usually, the installation command should be `python -m spacy download en`.

In [10]:
# NOTE(review): machine-specific absolute path. Point DATA_DIR at your own
# copy of the data before running.
DATA_DIR = 'C:/Users/nwang/Desktop/nlp/bert/data/q2_balance_113_5'
trainfile = DATA_DIR + '/train.tsv'
testfile = DATA_DIR + '/dev.tsv'

# Column names applied to both files, kept in one place so the train and dev
# frames cannot drift apart.
COLUMN_NAMES = ['text', 'target1', 'target2', 'target3', 'target4']

train_df = pd.read_csv(trainfile, sep='\t', names=COLUMN_NAMES)
train_df.head(5)
test_df = pd.read_csv(testfile, sep='\t', names=COLUMN_NAMES)
test_df.head(5)

Unnamed: 0,text,target1,target2,target3,target4
0,Setting up the new account was easy Is it poss...,1.0,0.0,0.0,0.0
1,Better customer service,0.0,0.0,0.0,1.0
2,It was so easy and quick,0.0,1.0,0.0,0.0
3,0,0.0,0.0,1.0,0.0
4,Clearly state options available given all cons...,0.0,0.0,0.0,1.0


Unnamed: 0,text,target1,target2,target3,target4
0,I think everything was just fine I was in and ...,0.0,1.0,0.0,0.0
1,None it was perfect Very easy,0.0,1.0,0.0,0.0
2,She was great,0.0,1.0,0.0,0.0
3,The service representative did all he she coul...,1.0,0.0,0.0,0.0
4,Everything was quick easy and personalized Thanks,0.0,1.0,0.0,0.0


In [11]:
# Pair each text with its `target4` value as a list of (text, label) tuples.
# Building the lists directly from zip() avoids the manual append loops and
# does not leave loop variables behind in the notebook namespace.
train_dataset = list(zip(train_df.text.values, train_df.target4.values))
test_dataset = list(zip(test_df.text.values, test_df.target4.values))

In [None]:
def preprocess_dataset(dataset, num_workers=None):
    """Preprocess every sample in ``dataset`` and record each sample's length.

    Parameters
    ----------
    dataset : iterable
        Raw samples, each one accepted by ``preprocess``.
    num_workers : int or None, optional
        Size of the worker pool. ``None`` lets ``multiprocessing`` choose;
        ``0`` forces in-process (serial) execution.

    Returns
    -------
    tuple
        ``(dataset, lengths)``: two ``gluon.data.SimpleDataset`` objects
        holding the preprocessed samples and the value of ``get_length`` for
        each of them, in the same order.
    """
    start = time.time()
    # Worker processes started with "spawn" or "forkserver" (the default on
    # Windows) have to import the mapped function from __main__. Functions
    # defined in notebook cells cannot be imported that way, so the pool never
    # completes. Only use a pool when workers are forked; otherwise run
    # serially.
    use_pool = num_workers != 0 and mp.get_start_method() == 'fork'
    if use_pool:
        with mp.Pool(num_workers) as pool:
            processed = pool.map(preprocess, dataset)
    else:
        processed = [preprocess(sample) for sample in dataset]
    # Taking a length is trivial, so it is done in-process rather than
    # shipping every sample to the workers a second time.
    lengths = gluon.data.SimpleDataset([get_length(sample) for sample in processed])
    dataset = gluon.data.SimpleDataset(processed)
    end = time.time()
    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

# Doing the actual pre-processing of the dataset
# NOTE(review): these lines rebind `train_dataset` / `test_dataset` from raw
# samples to preprocessed ones, so re-running this cell on its own would feed
# already-processed samples back into `preprocess`. Re-run the cell that
# builds the raw datasets first.
train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
test_dataset, test_data_lengths = preprocess_dataset(test_dataset)