In [119]:
"""
TextCNN模型
1. setting arg
2. model
3. load data
4. train model


思考的逻辑是model（包括训练和预测），model需要什么参数，记下来，放置在arg，（是上到下吗）
在写程序的时候，应该要设置好arg，model，训练，出结果
"""

In [None]:
# 基本包的载入
import os
import re
import tqdm
import argparse
import time
import sys
import string
import operator
import random

# 基本数据处理包
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import nltk

# 特定的数据清洗包
import wordninja
from geopy.geocoders import Bing
import emoji

# torch, torchtext
from torch.nn import init
import torchtext.data as data
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from torchtext.vocab import Vectors

import torch
import torch.nn as nn
import torch.nn.functional as F

In [140]:
random.seed(1992)

def argsSetting():
    parser = argparse.ArgumentParser(description='TextCNN text classifier')
    # learning
    parser.add_argument('-lr', type=float, default=0.001, help='initial learning rate [default: 0.001]')
    parser.add_argument('-epochs', type=int, default=256, help='number of epochs for train [default: 256]')
    parser.add_argument('-batch-size', type=int, default=128, help='batch size for training [default: 128]')
    parser.add_argument('-log-interval', type=int, default=1,
                        help='how many steps to wait before logging training status [default: 1]')
    parser.add_argument('-test-interval', type=int, default=100,
                        help='how many steps to wait before testing [default: 100]')
    parser.add_argument('-save-dir', type=str, default='snapshot', help='where to save the snapshot')
    parser.add_argument('-early-stopping', type=int, default=1000,
                        help='iteration numbers to stop without performance increasing')
    parser.add_argument('-save-best', type=bool, default=True, help='whether to save when get best performance')
    
    # loading data
    parser.add_argument('-dataset-path', type=str, default= str(os.getcwd()), 
                    help='where the dataset locates [default: os.getcwd()]')
    parser.add_argument('-dataset-train', type=str, default= 'dataset_train.csv', 
                    help='the document of training set [default: dataset_train.csv]')
    parser.add_argument('-dataset-val', type=str, default= 'dataset_val.csv', 
                    help='the document of validation set [default: dataset_val.csv]')
    
    # model
    parser.add_argument('-dropout', type=float, default=0.5, help='the probability for dropout [default: 0.5]')
    parser.add_argument('-max-norm', type=float, default=3.0, help='l2 constraint of parameters [default: 3.0]')
    parser.add_argument('-embedding-dim', type=int, default=128, help='number of embedding dimension [default: 128]')
    parser.add_argument('-sentence-max-length', type=int, default= 128, help='max length of sentence as input [default: 128]')
    parser.add_argument('-min-freq', type=int, default= 5, help='minimal frequency allowed on vocabulary [default: 5]')
    parser.add_argument('-filter-num', type=int, default=100, help='number of each size of filter')
    parser.add_argument('-filter-sizes', type=str, default='3,4,5',
                        help='comma-separated filter sizes to use for convolution')
    parser.add_argument('-padding-sizes', type=str, default='1,2,2',
                        help='padding sizes to use for convolution')

    parser.add_argument('-static', type=bool, default=False, help='whether to use static pre-trained word vectors')
    parser.add_argument('-non-static', type=bool, default=False, help='whether to fine-tune static pre-trained word vectors')
    parser.add_argument('-multichannel', type=bool, default=False, help='whether to use 2 channel of word vectors')
    parser.add_argument('-pretrained-name', type=str, default='sgns.zhihu.word',
                        help='filename of pre-trained word vectors')
    parser.add_argument('-pretrained-path', type=str, default='pretrained', help='path of pre-trained word vectors')

    # device
    parser.add_argument('-device', type=int, default=-1, help='device to use for iterate data, -1 mean cpu [default: -1]')

    # option
    parser.add_argument('-snapshot', type=str, default=None, help='filename of model snapshot [default: None]')
    args = parser.parse_args(args=[])
    return args
    
# build args
args = argsSetting()

# fix args
args.dataset_path = 'D:/比赛/disaster/nlp-getting-started/'
args.dataset_train = 'dataset_train.csv'
args.dataset_val = 'dataset_val.csv'

args.min_freq = 5
args.batch_size = 64
args.sentence_max_length = 40
args.test_interval = 50
args.early_stopping = 500
args.pretrained_name = 'glove.840B.300d.txt'
args.pretrained_path = 'C:/Users/123/glove.840B.300d.txt/'
args.static = True

args.save_dir = 'D://比赛//disaster//nlp-getting-started//model_save//'

In [141]:
class TextCNN(nn.Module):
    def __init__(self, args):
        super(TextCNN, self).__init__()
        self.args = args

        class_num = args.class_num
        chanel_num = 1
        filter_num = args.filter_num
        filter_sizes = args.filter_sizes
        padding_sizes = args.padding_sizes

        vocabulary_size = args.vocabulary_size
        embedding_dimension = args.embedding_dim
        self.embedding = nn.Embedding(vocabulary_size, embedding_dimension)
        if args.static:
            self.embedding = self.embedding.from_pretrained(args.vectors, freeze=not args.non_static)
        if args.multichannel:
            self.embedding2 = nn.Embedding(vocabulary_size, embedding_dimension).from_pretrained(args.vectors)
            chanel_num += 1
        else:
            self.embedding2 = None
        self.convs = nn.ModuleList(
            [nn.Conv2d(chanel_num, filter_num, (size, embedding_dimension), padding = (padding, 0)) 
             for size, padding in zip(filter_sizes, padding_sizes)])
        self.dropout = nn.Dropout(args.dropout)
        self.fc = nn.Linear(len(filter_sizes) * filter_num, class_num)

    def forward(self, x):
        if self.embedding2:
            x = torch.stack([self.embedding(x), self.embedding2(x)], dim=1)
        else:
            x = self.embedding(x)
            x = x.unsqueeze(1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [F.max_pool1d(item, item.size(2)).squeeze(2) for item in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

In [142]:
# training
def train(train_iter, val_iter, model, args):
    if args.cuda:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        for batch in train_iter:
            feature, target = batch.text, batch.target
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()
            optimizer.zero_grad()
            logits = model(feature.data.permute(1,0))
            loss = F.cross_entropy(logits, target)
            loss.backward()
            optimizer.step()
            steps += 1
            if steps % args.log_interval == 0:
                corrects = (torch.max(logits, 1)[1].view(target.size()).data == target.data).sum()
                train_acc = 100.0 * corrects / batch.batch_size
                sys.stdout.write(
                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps,
                                                                             loss.item(),
                                                                             train_acc,
                                                                             corrects,
                                                                             batch.batch_size))
            if steps % args.test_interval == 0:
                val_acc = eval(val_iter, model, args)
                if val_acc > best_acc:
                    best_acc = val_acc
                    last_step = steps
                    if args.save_best:
                        print('Saving best model, acc: {:.4f}%\n'.format(best_acc))
                        save(model, args.save_dir, 'best', steps)
                else:
                    if steps - last_step >= args.early_stopping:
                        print('\nearly stop by {} steps, acc: {:.4f}%'.format(args.early_stopping, best_acc))
                        raise KeyboardInterrupt

def eval(data_iter, model, args):
    model.eval()
    corrects, avg_loss = 0, 0
    for batch in data_iter:
        feature, target = batch.text, batch.target
        if args.cuda:
            feature, target = feature.cuda(), target.cuda()
        logits = model(feature.data.permute(1,0))
        loss = F.cross_entropy(logits, target)
        avg_loss += loss.item()
        corrects += (torch.max(logits, 1)
                     [1].view(target.size()).data == target.data).sum()
    size = len(data_iter.dataset)
    avg_loss /= size
    accuracy = 100.0 * corrects / size
    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss,
                                                                       accuracy,
                                                                       corrects,
                                                                       size))
    return accuracy

def save(model, save_dir, save_prefix, steps):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_prefix = os.path.join(save_dir, save_prefix)
    save_path = '{}_steps_{}.pt'.format(save_prefix, steps)
    torch.save(model.state_dict(), save_path)


In [143]:
"""
function to load data
"""
def load_word_vectors(model_name, model_path):
    vectors = Vectors(name=model_name, cache=model_path)
    return vectors

def load_dataset(text_field, label_field, args, **kwargs):
    # load train.csv and val.csv
    train, val = TabularDataset.splits(
        path= args.dataset_path, # the root directory where the data lies
        train= args.dataset_train, validation= args.dataset_val,
        format='csv',
        skip_header=True,
        fields=[("text", text_field), ("target", label_field)])
    # load vocabulary and embedding
    if args.static and args.pretrained_name and args.pretrained_path:
        vectors = load_word_vectors(args.pretrained_name, args.pretrained_path)
        text_field.build_vocab(train, val, vectors=vectors, min_freq = args.min_freq)
    else:
        text_field.build_vocab(train, val, min_freq = args.min_freq)
    label_field.build_vocab(train, val)
    # produce iterator
    train_iter, val_iter = BucketIterator.splits(
                        (train, val),
                         batch_sizes=(args.batch_size, args.sentence_max_length),
                         sort_key=lambda x: len(x.text), # field sorted by len
                         **kwargs)
    return train_iter, val_iter

print('Loading data...')
tokenize = lambda x: x.split()
y_tokenize = lambda y: int(y)
text_field = Field(sequential=True, tokenize=tokenize, lower=True)
label_field = Field(sequential=False, tokenize = y_tokenize, use_vocab=False)
train_iter, val_iter = load_dataset(text_field, label_field, args, device=-1, repeat=False, shuffle=True)


args.vocabulary_size = len(text_field.vocab)
if args.static:
    args.embedding_dim = text_field.vocab.vectors.size()[-1]
    args.vectors = text_field.vocab.vectors
if args.multichannel:
    args.static = True
    args.non_static = True
args.class_num = len(label_field.vocab)
args.cuda = args.device != -1 and torch.cuda.is_available()
args.filter_sizes = [int(size) for size in args.filter_sizes.split(',')]
args.padding_sizes = [int(padding) for padding in args.padding_sizes.split(',')]

    

Loading data...


The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


Parameters:
	BATCH_SIZE=64
	CLASS_NUM=3
	CUDA=False
	DATASET_PATH=D:/比赛/disaster/nlp-getting-started/
	DATASET_TRAIN=dataset_train.csv
	DATASET_VAL=dataset_val.csv
	DEVICE=-1
	DROPOUT=0.5
	EARLY_STOPPING=500
	EMBEDDING_DIM=300
	EPOCHS=256
	FILTER_NUM=100
	FILTER_SIZES=[3, 4, 5]
	LOG_INTERVAL=1
	LR=0.001
	MAX_NORM=3.0
	MIN_FREQ=5
	MULTICHANNEL=False
	NON_STATIC=False
	PADDING_SIZES=[1, 2, 2]
	PRETRAINED_NAME=glove.840B.300d.txt
	PRETRAINED_PATH=C:/Users/123/glove.840B.300d.txt/
	SAVE_BEST=True
	SAVE_DIR=D://比赛//disaster//nlp-getting-started//model_save//
	SENTENCE_MAX_LENGTH=40
	SNAPSHOT=None
	STATIC=True
	TEST_INTERVAL=50
	VOCABULARY_SIZE=2851


In [None]:
print('Parameters:')
for attr, value in sorted(args.__dict__.items()):
    if attr in {'vectors'}:
        continue
    print('\t{}={}'.format(attr.upper(), value))

In [144]:
text_cnn = TextCNN(args)
if args.snapshot:
    print('\nLoading model from {}...\n'.format(args.snapshot))
    text_cnn.load_state_dict(torch.load(args.snapshot))

if args.cuda:
    torch.cuda.set_device(args.device)
    text_cnn = text_cnn.cuda()
try:
    train(train_iter, val_iter, text_cnn, args)
except KeyboardInterrupt:
    print('Exiting from training early')


Batch[50] - loss: 0.464242  acc: 75.0000%(48/64)
Evaluation - loss: 0.011517  acc: 79.1721%(1205/1522) 

Saving best model, acc: 79.1721%

Batch[100] - loss: 0.421368  acc: 76.5625%(49/64)
Evaluation - loss: 0.011213  acc: 80.1577%(1220/1522) 

Saving best model, acc: 80.1577%

Batch[150] - loss: 0.417345  acc: 81.2500%(52/64)
Evaluation - loss: 0.011494  acc: 77.8581%(1185/1522) 

Batch[200] - loss: 0.312370  acc: 85.9375%(55/64)
Evaluation - loss: 0.010746  acc: 80.2891%(1222/1522) 

Saving best model, acc: 80.2891%

Batch[250] - loss: 0.367827  acc: 79.6875%(51/64)
Evaluation - loss: 0.011330  acc: 80.6176%(1227/1522) 

Saving best model, acc: 80.6176%

Batch[300] - loss: 0.232155  acc: 92.1875%(59/64)
Evaluation - loss: 0.010999  acc: 80.8804%(1231/1522) 

Saving best model, acc: 80.8804%

Batch[350] - loss: 0.159873  acc: 95.3125%(61/64)
Evaluation - loss: 0.011562  acc: 80.8147%(1230/1522) 

Batch[400] - loss: 0.230375  acc: 90.6250%(58/64))
Evaluation - loss: 0.012137  acc: 78.9