In [18]:
import collections
import argparse
import time
import random
import glob
import multiprocessing as mp

import numpy as np

import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import HybridBlock
from mxnet.gluon.data import DataLoader

import gluonnlp as nlp


import d2l
from mxnet import gluon, init, nd
from mxnet.contrib import text
from mxnet.gluon import data as gdata, loss as gloss, nn

import os
import csv
import multiprocessing as mp
from gluonnlp import Vocab, data
from mxnet.gluon.data import ArrayDataset, SimpleDataset
from nltk import word_tokenize
import pandas as pd
import re

In [2]:
class QuoraDataset(ArrayDataset):
    """Quora insincere-questions competition data as an ``ArrayDataset``.

    Every sample is a two-element list ``[question_text, label]`` taken from
    columns 1 and 2 of the segment's CSV file (column 0 is dropped and the
    header row is skipped). ``label`` is an ``int``; rows that have no third
    column get ``None`` as the label.

    Parameters
    ----------
    segment : str
        Which file to load: ``'train'`` or ``'test'``. Any other value raises
        ``KeyError``.
    root_dir : str
        Directory that contains ``train.csv`` and ``test.csv``.
    """

    def __init__(self, segment, root_dir="../input/"):
        self._root_dir = root_dir
        self._segment = segment
        # File names are kept relative on purpose: os.path.join() throws away
        # root_dir when the second component is an absolute path, which made
        # the root_dir argument a no-op and tied the notebook to one machine.
        self._segments = {
            'train': 'train.csv',
            'test': 'test.csv'
        }

        super(QuoraDataset, self).__init__(self._read_data())

    def _read_data(self):
        """Read the selected CSV file and return a list of ``[text, label]``."""
        file_path = os.path.join(self._root_dir, self._segments[self._segment])
        samples = []
        with open(file_path, mode='r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter=',', quotechar='"')
            # The first line is the header, not data.
            next(reader, None)
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # newline); there is nothing to load from them.
                    continue
                # An unlabeled file (presumably the test split - confirm) has
                # no third column; keep the text and mark the label missing
                # instead of failing with IndexError.
                label = int(row[2]) if len(row) > 2 else None
                samples.append([row[1], label])
        return samples
    
    
    

In [35]:
# Load the 'train' segment; each item is a [question text, integer label] pair.
dataset = QuoraDataset('train')

In [36]:
def trans_data(dataset):
    """Split a dataset of ``(text, label)`` pairs into train and test parts.

    The first 70% of the samples (in their existing order, no shuffling) form
    the training part and the remainder forms the test part.

    Returns
    -------
    tuple
        ``((train_texts, train_labels), (test_texts, test_labels))`` where
        every element is a plain list.
    """
    cut = int(0.7*len(dataset))
    # Materialise the pairs once, then separate the two columns.
    pairs = [(sample[0], sample[1]) for sample in dataset]
    texts = [text for text, _ in pairs]
    targets = [target for _, target in pairs]
    return (texts[:cut], targets[:cut]), (texts[cut:], targets[cut:])

# Each result is a (texts, labels) tuple: the leading 70% for training, the rest held out.
train_data, test_data = trans_data(dataset)

In [45]:
def tokenize(sentences):
    """Split every sentence on single space characters.

    Splitting is done with ``str.split(' ')``, so consecutive spaces produce
    empty-string tokens and an empty sentence yields ``['']``.

    Returns a list with one token list per input sentence.
    """
    tokenized = []
    for sentence in sentences:
        tokenized.append(sentence.split(' '))
    return tokenized

# Index 0 of each split holds the question texts (index 1 holds the labels).
train_tokens = tokenize(train_data[0])
test_tokens = tokenize(test_data[0])

In [38]:
# Build the vocabulary from the flattened training tokens only, so no test
# tokens leak into it.
# NOTE(review): min_freq=5 presumably drops tokens seen fewer than 5 times -
# confirm against the installed d2l.Vocab.
vocab = d2l.Vocab([tk for line in train_tokens for tk in line], min_freq=5)

In [47]:
# Every index sequence is forced to exactly this many entries.
max_len = 200
def pad(x):
    """Return ``x`` truncated or right-padded to ``max_len`` entries.

    Sequences longer than ``max_len`` are cut to their first ``max_len``
    items; anything else is extended on the right with ``vocab.unk``.
    """
    if len(x) > max_len:
        return x[:max_len]
    shortfall = max_len - len(x)
    return x + [vocab.unk] * shortfall

# Mini-batch size shared by both loaders. It was used below without ever being
# defined in the notebook, so the cell failed with NameError on a fresh
# kernel. 64 is the value used by the d2l sentiment-analysis examples; tune
# as needed.
batch_size = 64

# Map tokens to vocabulary indices and force every sequence to max_len entries
# so the rows can be stacked into one NDArray per split.
train_features = nd.array([pad(vocab[line]) for line in train_tokens])
test_features = nd.array([pad(vocab[line]) for line in test_tokens])

# Pair the features with their labels (index 1 of each split).
train_set = gdata.ArrayDataset(train_features, train_data[1])
test_set = gdata.ArrayDataset(test_features, test_data[1])
# Only the training loader shuffles; evaluation order does not matter.
train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
test_iter = gdata.DataLoader(test_set, batch_size)

In [55]:
class TextCNN(nn.Block):
    """Convolutional text classifier with two embedding tables.

    The network looks every token index up in two separate embedding tables,
    runs several 1-D convolutions over the concatenated embeddings, applies
    max-over-time pooling to each, and feeds the pooled features through
    dropout into a 2-unit dense layer.

    Parameters
    ----------
    vocab_size : int
        Number of rows of each embedding table.
    embed_size : int
        Width of each embedding vector.
    kernel_sizes : sequence of int
        Kernel width of each convolution branch.
    num_channels : sequence of int
        Output channels of each branch, paired with ``kernel_sizes`` by
        position (``zip`` stops at the shorter of the two).
    **kwargs
        Forwarded to ``nn.Block``.
    """

    def __init__(self, vocab_size, embed_size, kernel_sizes, num_channels,
                 **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Second table, meant to be kept out of training. This class does not
        # freeze it itself; the caller is expected to set its grad_req.
        self.constant_embedding = nn.Embedding(vocab_size, embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Dense(2)
        # Max-over-time pooling has no weights, so one instance serves every
        # convolution branch.
        self.pool = nn.GlobalMaxPool1D()
        # One Conv1D branch per (channels, kernel width) pair.
        self.convs = nn.Sequential()
        for channels, width in zip(num_channels, kernel_sizes):
            branch = nn.Conv1D(channels, width, activation='relu')
            self.convs.add(branch)

    def forward(self, inputs):
        """Return the class scores produced by the dense layer for ``inputs``.

        ``inputs`` holds token indices; per the layer layout it is expected to
        be shaped (batch size, number of words).
        """
        # Look the tokens up in both tables and join the results along the
        # word-vector axis: (batch size, number of words, 2 * embed_size).
        first_embed = self.embedding(inputs)
        second_embed = self.constant_embedding(inputs)
        stacked = nd.concat(first_embed, second_embed, dim=2)
        # Conv1D expects the channel axis before the sequence axis, so move
        # the word-vector dimension in front of the word dimension.
        channels_first = stacked.transpose((0, 2, 1))
        # Each branch pools down to (batch size, channels, 1); flatten drops
        # the trailing axis before the branches are joined channel-wise.
        pooled = []
        for conv in self.convs:
            pooled.append(nd.flatten(self.pool(conv(channels_first))))
        encoding = nd.concat(*pooled, dim=1)
        # Dropout first, then the fully connected output layer.
        return self.decoder(self.dropout(encoding))

In [56]:
# Model hyper-parameters: embedding width plus one (kernel width, channels)
# pair per convolution branch.
embed_size = 100
kernel_sizes = [3, 4, 5]
nums_channels = [100, 100, 100]

# Device list returned by d2l; the network is initialised on all of them.
ctx = d2l.try_all_gpus()

net = TextCNN(len(vocab), embed_size, kernel_sizes, nums_channels)
net.initialize(init.Xavier(), ctx=ctx)

In [57]:
# Load 100-dimensional GloVe vectors and pick out one row per vocabulary
# token, in vocabulary order, so the matrix lines up with the model's
# embedding tables (embed_size is 100).
glove_embedding = text.embedding.create(
    'glove', pretrained_file_name='glove.6B.100d.txt')
embeds = glove_embedding.get_vecs_by_tokens(vocab.idx_to_token)

# Both tables start from the pretrained vectors. Previously only
# `net.embedding` was loaded and then frozen, which left
# `net.constant_embedding` randomly initialised and trainable - the opposite
# of what its name and the TextCNN definition describe.
net.embedding.weight.set_data(embeds)
net.constant_embedding.weight.set_data(embeds)
# `embedding` is fine-tuned during training; `constant_embedding` stays fixed.
net.constant_embedding.collect_params().setattr('grad_req', 'null')

### Train

In [58]:
# Optimisation settings.
lr = 0.001
num_epochs = 5

# Adam over every parameter collected from the network.
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss = gloss.SoftmaxCrossEntropyLoss()

# NOTE(review): d2l.train is an external helper; it presumably trains on
# train_iter and evaluates on test_iter each epoch - confirm against the
# installed d2l version.
d2l.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)


training on [cpu(0)]


KeyboardInterrupt: 

### Save model parameters

In [67]:
# Persist the current network weights; testCNN() reloads them from this same
# file name (relative to the working directory).
file_name = "textCNN.params"
net.save_parameters(file_name)

### Predict

In [3]:
def predict_textCNN(sentence):
    """Classify one sentence with the in-memory ``net``.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the
    result is split on whitespace, mapped to vocabulary indices and padded or
    truncated with ``pad`` exactly like the training features.

    Relies on the notebook-level ``net``, ``vocab`` and ``pad``.

    Parameters
    ----------
    sentence : str
        Raw text to classify.

    Returns
    -------
    str
        ``'1'`` when the predicted class is 1, otherwise ``'0'``.
    """
    cleaned = re.sub("[^a-zA-Z]", " ", sentence)
    tokens = cleaned.split()
    # Use the same vocab[...] lookup and the same fixed-length padding as the
    # feature-building cell. Without padding, an empty sentence or one shorter
    # than the widest convolution kernel cannot be run through the network,
    # and longer ones are shaped differently from the training data.
    indices = pad(vocab[tokens])
    features = nd.array(indices, ctx=d2l.try_gpu())
    # Add the batch axis: (1, sequence length).
    label = nd.argmax(net(features.reshape((1, -1))), axis=1)
    return '1' if label.asscalar() == 1 else '0'


In [62]:
def testCNN(sentence, params_file='textCNN.params'):
    """Classify one sentence with a ``TextCNN`` rebuilt from saved weights.

    A fresh network is constructed on every call and its parameters are
    loaded onto the CPU from ``params_file``. The sentence is cleaned,
    tokenised, indexed and padded the same way as the training features.

    Relies on the notebook-level ``vocab``, ``pad`` and ``TextCNN``.

    Parameters
    ----------
    sentence : str
        Raw text to classify.
    params_file : str
        Path of the parameter file written by ``net.save_parameters``.

    Returns
    -------
    str
        A user-facing message: the "bad guy" warning when the predicted class
        is 1, otherwise ``'Not bad :)'``.
    """
    # Must mirror the architecture used when the parameters were saved.
    embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
    # The first argument is the vocabulary *size*; passing the vocab object
    # itself (as before) does not give the embedding layers a valid row count.
    net = TextCNN(len(vocab), embed_size, kernel_sizes, nums_channels)

    net.load_parameters(params_file, ctx=mx.cpu())

    cleaned = re.sub("[^a-zA-Z]", " ", sentence)
    tokens = cleaned.split()
    # Same vocab[...] lookup and fixed-length padding as the training
    # features; this also keeps empty or very short sentences long enough for
    # the widest convolution kernel.
    indices = pad(vocab[tokens])
    features = nd.array(indices, ctx=mx.cpu())
    # Add the batch axis: (1, sequence length).
    label = nd.argmax(net(features.reshape((1, -1))), axis=1)
    return 'Bad guy detected!! Report to us?' if label.asscalar() == 1 else 'Not bad :)'