In [3]:
import pandas as pd
import ast
import d2l
from sklearn.model_selection import train_test_split
from mxnet.gluon import data as gdata, loss as gloss, nn, rnn, utils as gutils
from mxnet import gluon, init, nd
from mxnet.contrib import text
import os
import tarfile

In [4]:
dat = pd.read_csv("tokenized_tweets.csv")

In [5]:
dat['party'][dat['party'] == -1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [6]:
dat = dat.drop("Unnamed: 0", axis=1)

In [7]:
dat["string"] = dat["filtered_w_stop_text"].apply(lambda x: ast.literal_eval(x))

In [8]:
dat["len"] = dat.string.apply(lambda x: len(x))

In [9]:
X_train, X_test, y_train, y_test = train_test_split(dat["string"], dat['party'], test_size=0.2, random_state=42)

In [10]:
train_sentences = [' '.join(s) for s in X_train]
test_sentences = [' '.join(s) for s in X_test]

In [11]:
def tokenize(sentences):
    return [line.split(' ') for line in sentences]

train_tokens = tokenize(train_sentences)
test_tokens = tokenize(test_sentences)

In [12]:
vocab = d2l.Vocab([tk for line in train_tokens for tk in line], min_freq=1)

In [13]:
max_len = max(dat.len)

In [14]:
def pad(x):
    if len(x) > max_len:
        return x[:max_len]
    else:
        return x + [vocab.unk] * (max_len - len(x))

train_features = nd.array([pad(vocab[line]) for line in train_tokens])
test_features = nd.array([pad(vocab[line]) for line in test_tokens])

In [15]:
batch_size = 64
train_set = gdata.ArrayDataset(train_features, nd.array(y_train))
test_set = gdata.ArrayDataset(test_features, nd.array(y_test))
train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
test_iter = gdata.DataLoader(test_set, batch_size)

In [22]:
class BiRNN(nn.Block):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Set Bidirectional to True to get a bidirectional recurrent neural
        # network
        self.encoder = rnn.LSTM(num_hiddens, num_layers=num_layers,
                                bidirectional=True, input_size=embed_size)
        self.decoder = nn.Dense(2)

    def forward(self, inputs):
        # The shape of inputs is (batch size, number of words). Because LSTM
        # needs to use sequence as the first dimension, the input is
        # transformed and the word feature is then extracted. The output shape
        # is (number of words, batch size, word vector dimension).
        embeddings = self.embedding(inputs.T)
        # Since the input (embeddings) is the only argument passed into
        # rnn.LSTM, it only returns the hidden states of the last hidden layer
        # at different time step (outputs). The shape of outputs is
        # (number of words, batch size, 2 * number of hidden units).
        outputs = self.encoder(embeddings)
        # Concatenate the hidden states of the initial time step and final
        # time step to use as the input of the fully connected layer. Its
        # shape is (batch size, 4 * number of hidden units)
        encoding = nd.concat(outputs[0], outputs[-1])
        #seq = self.sequential(encoding)
        #mlp_out = self.mlplayer(seq)
        #outs = self.decoder(mlp_out)
        outs = self.decoder(encoding)
        return outs

In [23]:
embed_size, num_hiddens, num_layers, ctx = 200, 100, 4, d2l.try_all_gpus()
net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers)
net.initialize(init.Xavier(), ctx=ctx)

In [18]:
glove_embedding = text.embedding.create(
    'glove', pretrained_file_name='glove.twitter.27B.200d.txt')

Downloading /home/ubuntu/.mxnet/embeddings/glove/glove.twitter.27B.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/glove/glove.twitter.27B.zip...


In [19]:
embeds = glove_embedding.get_vecs_by_tokens(vocab.idx_to_token)

In [24]:
net.embedding.weight.set_data(embeds)
net.embedding.collect_params().setattr('grad_req', 'null')

In [None]:
lr, num_epochs = 0.1, 20
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss = gloss.SoftmaxCrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)

training on [gpu(0)]


In [None]:
file_name = "net.params"
net.save_parameters(file_name)