In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import json
import time

In [2]:
class Dictionary:
    def __init__(self):
        # 0 reserved for padding char
        # 1 reserved for unknown char
        self.num_chars = 2
        self.dict = {}
        
    def add_string(self, string):
        for c in string:
            self.add_char(c)
            
    def add_char(self, c):
        if c not in self.dict:
            self.dict[c] = self.num_chars
            self.num_chars += 1
    
    def find_char(self, c):
        if c in self.dict:
            return self.dict[c]
        else:
            return 1
    
    # output shape: (len(ss))
    def prepare_input_sequence(self, s, seq_len=None):
        if seq_len is None:
            return torch.tensor([self.find_char(c) for c in s])
        else:
            t = torch.zeros([seq_len], dtype=torch.long)
            t[:len(s)] = torch.tensor([self.find_char(c) for c in s])
            return t

    # output shape: (seq_len)
    def prepare_tag_sequence(self, seq_len, entity_beg, entity_len):
        t = torch.zeros(seq_len)
        t[entity_beg:entity_beg+entity_len] = 1
        return t
    
    # output type: (in, targ), in.size() = targ.size() = (len(s))
    def prepare_example(self, s, entity_beg, entity_len):
        inputs = self.prepare_input_sequence(s)
        targets = self.prepare_tag_sequence(len(s), entity_beg, entity_len)
        return inputs, targets
    
    # input type:  array of [title, entity_beg, entity_len]
    # output type: (in, targ), in.size() = targ.size() = (max(len(records[0])), len(records))
    def prepare_examples(self, records):
        max_seq_len = max(len(r[0]) for r in records)
        input_tensor = torch.zeros([max_seq_len, len(records)], dtype=torch.long)
        target_tensor = torch.zeros([max_seq_len, len(records)])
        for i,r in enumerate(records):
            input_tensor[:, i] = self.prepare_input_sequence(r[0], max_seq_len)
            target_tensor[:, i] = self.prepare_tag_sequence(max_seq_len, r[1], r[2])
        return input_tensor, target_tensor
        
    
    def save(self, file):
        with open(file, 'w') as f:
            json.dump({ 'n': self.num_chars, 'd': self.dict }, f)
    def load(self, file):
        with open(file) as f:
            obj = json.load(f)
            self.num_chars = obj['n']
            self.dict = obj['d']

In [3]:
import pandas as pd

corpus = pd.read_csv("../../corpus/company-news.csv")

In [4]:
vocab = Dictionary()
if os.path.exists('vocab.db'):
    vocab.load('vocab.db')
else:
    for news_title in corpus['title']:
        vocab.add_string(news_title)
    vocab.save('vocab.db')
vocab.num_chars

5328

In [5]:
len(vocab.dict)

5326

In [24]:
torch.manual_seed(1)

class SeqClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_size=100, bidirectional=True,
                 num_layers=1, hidden_size=100):
        super(self.__class__, self).__init__()
        
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.num_dir = 2 if bidirectional else 1
        
        self.word_embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional
        )
        self.hidden_to_hidden = nn.Linear(hidden_size * self.num_dir, hidden_size * self.num_dir)
        self.hidden_to_tag = nn.Linear(hidden_size * self.num_dir, 1)
        self.hidden = self.init_hidden()
        
    def init_hidden(self, batch_size=1):
        return (torch.zeros(self.num_layers * self.num_dir, batch_size, self.hidden_size),
                torch.zeros(self.num_layers * self.num_dir, batch_size, self.hidden_size))
    
    # input shape: (seq_len, batch_size)
    def forward(self, seqs):
        batch_size = seqs.size()[1]
        seq_len = seqs.size()[0]
        
        self.hidden = self.init_hidden(batch_size)
        embeds = self.word_embedding(seqs).view(seq_len, batch_size, self.embedding_size)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        hidden = F.relu(self.hidden_to_hidden(F.relu(F.dropout(lstm_out, p=0.2))))
        tag_space = self.hidden_to_tag(F.dropout(hidden, p=0.2)).view(seq_len, batch_size)
        return tag_space

In [25]:
# 
model = torch.load(f"training-34-1760.6831183843315.model")
torch.save(model.state_dict(), 'mytraining.pt')
model = SeqClassifier(vocab_size=vocab.num_chars, embedding_size=32, hidden_size=64, num_layers=2)
model.load_state_dict(torch.load('mytraining.pt'))
loss_function = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

In [26]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

def pprint_tagged_sentence(model, sentence):
    from IPython.display import display, HTML

    s = vocab.prepare_input_sequence(sentence).view(-1, 1)
    pred = torch.sigmoid(model(s)).view(-1).tolist()
    def format_c(c, p):
        if p < 0.1:
            return f'<span style="color: #f20" title="{p}">{c}</span>'
        elif 0.1 <= p < 0.5:
            return f'<span style="color: #a80" title="{p}">{c}</span>'
        elif 0.5 <= p < 0.9:
            return f'<span style="color: #480" title="{p}">{c}</span>'
        else:
            return f'<span style="color: #2f0" title="{p}">{c}</span>'
    display(HTML(''.join([format_c(c, p) for p,c in zip(pred, sentence)])))
    
pprint_tagged_sentence(model, corpus.loc[200]['title'])

In [27]:
    example = corpus[99:100][['title', 'beg', 'length']]
    (inputs, targets) = vocab.prepare_examples(example.values.tolist())
    print(example)
    print('input:', [*inputs.size()], inputs.view(-1))
    tag_scores = model(inputs.view(-1,1))
    print('prediction:', [*tag_scores.size()], torch.sigmoid(tag_scores).view(-1))
    print('target', [*targets.size()], targets.view(-1))
    print('loss', loss_function(tag_scores.view(-1), targets.view(-1)).item())

                             title  beg  length
99  一点资讯获《互联网新闻信息服务管理规定》施行后首张新闻许可证    0       4
input: [30, 1] tensor([106, 160,  18,  55, 132, 178, 677, 560, 162, 226, 678,   3, 564, 189,
        190, 679, 680, 335, 681, 180, 682,  81, 338, 412, 683, 226, 678, 684,
        685, 546])
prediction: [30, 1] tensor([0.3490, 0.6412, 0.2448, 0.6518, 0.0067, 0.0311, 0.1744, 0.2229, 0.1829,
        0.0399, 0.1212, 0.1354, 0.1327, 0.0614, 0.0072, 0.0356, 0.1756, 0.0228,
        0.0031, 0.0049, 0.0086, 0.0061, 0.0107, 0.0089, 0.0031, 0.0037, 0.0030,
        0.0096, 0.0005, 0.0118], grad_fn=<ViewBackward>)
target [30, 1] tensor([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
loss 0.16238780319690704


In [28]:
# total 938838
training_size = 100000
testing_size = 200
samples = ['中信重工：拟设立两家子公司 注册资本各5000万元',
           '做工业气体的“氨监测师”,「海尔欣」光电用中红外技术实现高精氨逃逸监测',
           '独家丨获数百万元天使轮融资，芳汀思致力于将素质教育回归本源']
total_epoch = 100
batch_size = 10

last_printed = time.time()


for epoch in range(total_epoch):
    training_loss = 0.0
    
    for s in samples:
        pprint_tagged_sentence(model, s)
    
    for i in range(0, min(len(corpus), training_size), batch_size):
        # initialize model parameters
        model.zero_grad()
        model.hidden = model.init_hidden()
        
        # prepare examples
        dataframe = corpus[i:i+batch_size][['title', 'beg', 'length']]
        (inputs, targets) = vocab.prepare_examples(dataframe.values.tolist())
        
        # forward pass
        tag_scores = model(inputs)
        # print(dataframe, inputs, tag_scores)
        # compute and accumulate loss function
        loss = loss_function(tag_scores, targets)
        training_loss += loss.item()
        
        # backward pass
        loss.backward()
        optimizer.step()
        
        # print stats every 5 secs
        
        if int(i / batch_size) % 20 == 0:
            print(f"E:%03d/I:%05d - loss: %.06f" % (epoch, i, loss.item()))
        if time.time() - last_printed > 10:
            for s in samples:
                pprint_tagged_sentence(model, s)
            last_printed = time.time()

    print(f"ending epoch %03d, loss: %.08f" % (epoch, training_loss))
    
    torch.save(model, f"training-{epoch}-{training_loss}.model")


E:000/I:00000 - loss: 0.158783
E:000/I:00200 - loss: 0.150213
E:000/I:00400 - loss: 0.212268
E:000/I:00600 - loss: 0.182899
E:000/I:00800 - loss: 0.241765


E:000/I:01000 - loss: 0.121148
E:000/I:01200 - loss: 0.241238
E:000/I:01400 - loss: 0.242223


E:000/I:01600 - loss: 0.204247
E:000/I:01800 - loss: 0.200590
E:000/I:02000 - loss: 0.101381
E:000/I:02200 - loss: 0.157299


E:000/I:02400 - loss: 0.168960
E:000/I:02600 - loss: 0.159740
E:000/I:02800 - loss: 0.141414
E:000/I:03000 - loss: 0.164306


E:000/I:03200 - loss: 0.117698
E:000/I:03400 - loss: 0.146287
E:000/I:03600 - loss: 0.246683
E:000/I:03800 - loss: 0.345544


E:000/I:04000 - loss: 0.184913
E:000/I:04200 - loss: 0.139421
E:000/I:04400 - loss: 0.114850


E:000/I:04600 - loss: 0.153545
E:000/I:04800 - loss: 0.137067
E:000/I:05000 - loss: 0.235778
E:000/I:05200 - loss: 0.202105


KeyboardInterrupt: 

In [None]:
with torch.no_grad():
    (inputs, tags) = vocab.prepare_example(*corpus.loc[1][['title', 'beg', 'length']])
    print([*corpus.loc[1][['title', 'beg', 'length']]])
    print('input:', [*inputs.size()], inputs)
    tag_scores = model(inputs).view(-1)
    print('prediction:', [*tag_scores.size()], tag_scores.view(-1))
    print('target', [*tags.size()], tags.view(-1))
    print('loss', loss_function(tag_scores.view(1, -1), tags.view(1, -1)))

In [None]:
torch.multiprocessing.get_sharing_strategy()

In [None]:
torch.multiprocessing.get_all_sharing_strategies()

In [None]:
from IPython.display import display, HTML

display(HTML('<span style="color: #a80">helo</span>'))