In [100]:
import argparse
import time
import random
import glob
import multiprocessing as mp

import numpy as np

import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import HybridBlock
from mxnet.gluon.data import DataLoader

import gluonnlp as nlp


import d2l
from mxnet import gluon, init, nd
from mxnet.contrib import text
from mxnet.gluon import data as gdata, loss as gloss, nn, utils as gutils

import os
import csv
import multiprocessing as mp
from gluonnlp import Vocab, data
from mxnet.gluon.data import ArrayDataset, SimpleDataset
from nltk import word_tokenize
import pandas as pd


In [74]:
class QuoraDataset(ArrayDataset):
    """This dataset provides access to Quora insincere data competition"""

    def __init__(self, segment, root_dir="../input/"):
        self._root_dir = root_dir
        self._segment = segment
        self._segments = {
            # We may change the file path
            'train': '/Applications/files/classes_homework/Berkeley_ieor/STAT157/project/train.csv',
            'test': '/Applications/files/classes_homework/Berkeley_ieor/STAT157/project/test.csv'
        }

        super(QuoraDataset, self).__init__(self._read_data())

    def _read_data(self):
        file_path = os.path.join(self._root_dir, self._segments[self._segment])
        with open(file_path, mode='r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter=',', quotechar='"')
            # ignore 1st line - which is header
            data = [tuple(row) for i, row in enumerate(reader) if i > 0]
            for i in range(len(data)):
                data[i] = data[i][1:3]
                data[i] = list(data[i])
                data[i][1] = int(data[i][1])
        return data
    
    
    

In [75]:
train_dataset = QuoraDataset('train')

In [76]:
def preprocess_quora(data, vocab):  
    max_l = 200  # 将每条评论通过截断或者补0，使得长度变成200

    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = d2l.get_tokenized_imdb(data)
    features = nd.array([pad(vocab.to_indices(x)) for x in tokenized_data])
    labels = nd.array([score for _, score in data])
    return features, labels




In [77]:
batch_size = 64
train_data, test_data = nlp.data.train_valid_split(train_dataset,valid_ratio = 0.3)
vocab = d2l.get_vocab_imdb(train_data)
train_iter = gdata.DataLoader(gdata.ArrayDataset(
    *preprocess_quora(train_data, vocab)), batch_size, shuffle=True)
test_iter = gdata.DataLoader(gdata.ArrayDataset(
    *preprocess_quora(test_data, vocab)), batch_size)





In [78]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'#batches:', len(train_iter)

X (64, 200) y (64,)


('#batches:', 14286)

In [79]:
class TextCNN(nn.Block):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels,
                 **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(len(vocab), embed_size)
        
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Dense(2)
       
        self.pool = nn.GlobalMaxPool1D()
        self.convs = nn.Sequential()  
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.add(nn.Conv1D(c, k, activation='relu'))
    
    def forward(self, inputs):
        
        embeddings = nd.concat(
            self.embedding(inputs), self.constant_embedding(inputs), dim=2)
        
        embeddings = embeddings.transpose((0, 2, 1))
        
        encoding = nd.concat(*[nd.flatten(
            self.pool(conv(embeddings))) for conv in self.convs], dim=1)
        
        outputs = self.decoder(self.dropout(encoding))
        return outputs


In [80]:
embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
ctx = d2l.try_all_gpus()
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)
net.initialize(init.Xavier(), ctx=ctx)


In [81]:
glove_embedding = text.embedding.create(
    'glove', pretrained_file_name='glove.6B.100d.txt', vocabulary=vocab)
net.embedding.weight.set_data(glove_embedding.idx_to_vec)
net.constant_embedding.weight.set_data(glove_embedding.idx_to_vec)
net.constant_embedding.collect_params().setattr('grad_req', 'null')

In [82]:
len(train_data)

914285

In [83]:
count_1= 0
for lis in train_data:
    if lis[1] == 0:
        count_1 += 1
count_1

857721

In [84]:
len(train_data)

914285

In [101]:

def _get_batch(batch, ctx):
    """Return features and labels on ctx."""
    features, labels = batch
    if labels.dtype != features.dtype:
        labels = labels.astype(features.dtype)
    return (gutils.split_and_load(features, ctx),
            gutils.split_and_load(labels, ctx), features.shape[0])

In [102]:
def evaluate_recall(data_iter, net, ctx=[mx.cpu()]):
    """Evaluate accuracy of a model on the given data set."""
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    recall_sum, n = nd.array([0]),0
    for batch in data_iter:
        features, labels, _ = _get_batch(batch, ctx)
        for X, y in zip(features, labels):
            y = y.astype('float32')
            recall_sum += (net(X).argmax(axis=1) == 1).sum().copyto(mx.cpu())
            n += y.size
        recall_sum.wait_to_read()
    return recall_sum.asscalar() / count_1

In [103]:
def train_acc_recall(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
    """Train and evaluate a model."""
    print('training on', ctx)
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, train_recall_sum,n, m, start = 0.0, 0.0, 0.0,0, 0, time.time()
        for i, batch in enumerate(train_iter):
            Xs, ys, batch_size = _get_batch(batch, ctx)
            ls = []
            with autograd.record():
                y_hats = [net(X) for X in Xs]
                ls = [loss(y_hat, y) for y_hat, y in zip(y_hats, ys)]
            for l in ls:
                l.backward()
            trainer.step(batch_size)
            train_l_sum += sum([l.sum().asscalar() for l in ls])
            n += sum([l.size for l in ls])
            train_acc_sum += sum([(y_hat.argmax(axis=1) == y).sum().asscalar()
                                 for y_hat, y in zip(y_hats, ys)])
            m += sum([y.size for y in ys])
            train_recall_sum += sum([(y_hat.argmax(axis=1) == 1).sum().asscalar()
                                 for y_hat, y in zip(y_hats, ys)])
            
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        test_recall = evaluate_recall(test_iter, net, ctx)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, train recall %.3f, test recall %.3f'
              'time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc_sum / m, test_acc, train_recall_sum / count_1, test_recall, 
                 time.time() - start))

In [104]:
lr, num_epochs = 0.001, 15
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss = gloss.SoftmaxCrossEntropyLoss()
train_acc_recall(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)



training on [cpu(0)]


KeyboardInterrupt: 