In [1]:
# !pip install torchtext
# !pip install torch
import torch
from torch import nn
import time
import torchtext
import numpy as np
import pandas as pd
import torch.optim as optim
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from collections import defaultdict, Counter
import os
import sys
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

%config InlineBackend.figure_format = 'retina'
plt.style.use('seaborn')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from multiprocessing import cpu_count

cpu_num = cpu_count() # 自动获取最大核心数目
os.environ ['OMP_NUM_THREADS'] = str(cpu_num)
os.environ ['OPENBLAS_NUM_THREADS'] = str(cpu_num)
os.environ ['MKL_NUM_THREADS'] = str(cpu_num)
os.environ ['VECLIB_MAXIMUM_THREADS'] = str(cpu_num)
os.environ ['NUMEXPR_NUM_THREADS'] = str(cpu_num)
torch.set_num_threads(cpu_num)

### Loading the Text data

In [60]:
class TensorDataset(Dataset):
    def __init__(self, data_tensor, labels_tensor):
        self.data_tensor=data_tensor
        self.labels_tensor=labels_tensor

    def __getitem__(self, item):
        return self.data_tensor[item], self.labels_tensor[item]

    def __len__(self):
        return self.data_tensor.size(0)

class Data_Prepare:
    def __init__(self):
        # public variables
        self.EMBED_DIM=50
        self.vocab=self.load_glove_model('word_embedding/glove.6B.50d/glove.6B.50d.txt')

    def load_glove_model(self,File):
        print("Loading Glove Model")
        glove_model = {}
        with open(File,'r',encoding='utf-8') as f:
            for line in f:
                split_line = line.split()
                word = split_line[0]
                embedding = np.array(split_line[1:], dtype=np.float64)
                glove_model[word] = embedding
        print(f"{len(glove_model)} words loaded!")
        return glove_model

    def extract_word_labels(self, filepath):
        df=pd.read_csv(filepath,delimiter='\t',names=['Word','POS','NP','NER'],skiprows=[0])
        # df=self.word_embedding(df)
        # labels=self.onehot_encode(df.NER)

        sentences=[]
        labels=[]
        label=[]
        sentence=[]
        tag_dict={}
        for index, item in enumerate(set(df.NER)):
            tag_dict[item]=index
        for word,tag in zip(df.Word,df.NER):
            tag_id=tag_dict.get(tag)
            label.append(tag_id)
            sentence.append(word)
            if word =='.':
                labels.append(label)
                sentences.append(sentence)
                sentence=[]
                label=[]

        sentences_embeddings=self.sentence_embedding(sentences)

        data_tensor=torch.Tensor(sentences_embeddings)
        labels_tensor=torch.Tensor(np.array(labels))
        dataset=TensorDataset(data_tensor,labels_tensor)

        # word_embeddings=np.array(df.Embedding.to_list())
        # labels_tensor=torch.Tensor(np.array(labels))
        # data_tensor=torch.Tensor(word_embeddings)
        # dataset=TensorDataset(data_tensor,labels_tensor)

        return dataset

    def sentence_embedding(self, sentences):
        sents_embeddings=[]
        for sent in sentences:
            sent_embeddings=[]
            for word in sent:
                vector=self.vocab.get(str(word).lower())
                if vector is not None:
                    sent_embeddings.append(vector)
                else:
                    sent_embeddings.append(np.zeros(self.EMBED_DIM))
            sents_embeddings.append(sent_embeddings)
        return sents_embeddings


    def word_embedding(self,df):
        embeddings=[]
        for word in df.Word:
            vector=self.vocab.get(str(word).lower())
            if vector is not None:
                embeddings.append(vector)
            else:
                embeddings.append(np.zeros(self.EMBED_DIM))

        df['Embedding']=embeddings
        return df

    def onehot_encode(self, labels):
        labels_to_ids={k:v for v,k in enumerate(set(labels.to_list()))}
        result=[]
        for label in labels:
            vec=np.zeros(len(labels_to_ids))
            vec[labels_to_ids.get(label)]=1
            result.append(vec)
        return result

tran_path='data/train.conll'
dev_path='data/dev.conll'
test_path='data/test.conll'

data_prepare=Data_Prepare()

# train_X, train_y=data_prepare.extract_word_labels(tran_path)
# dev_X, dev_y=data_prepare.extract_word_labels(dev_path)
# test_X, test_y=data_prepare.extract_word_labels(test_path)
train_dataset=data_prepare.extract_word_labels(tran_path)
dev_dataset=data_prepare.extract_word_labels(dev_path)
test_dataset=data_prepare.extract_word_labels(test_path)

Loading Glove Model
400000 words loaded!


ValueError: expected sequence of length 9 at dim 1 (got 34)

In [4]:
tensor_dataloader = DataLoader(train_dataset,
                               batch_size=2,
                               shuffle=True,
                               num_workers=0)


In [19]:
class RNN(nn.Module):
    def __init__(self, params:dict):
        super(RNN,self).__init__()
        self.num_layers=params['num_layers']
        self.hidden_size=params['hidden_size']
        self.batch_size=params['batch_size']
        self.bilstm=nn.LSTM(input_size=params['input_size'],
                            hidden_size=params['hidden_size']//2,
                            num_layers=params['num_layers'],
                            batch_first=True,
                            dropout=params['dropout'],
                            bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.classifier=nn.Linear(params['hidden_size'],params['num_classes'])

        self.hidden = self.init_hidden()


    def forward(self,x):
        # self.hidden = self.init_hidden()
        out, _=self.bilstm(x)
        out=out.view(-1,self.hidden_size)
        out = self.classifier(out)
        return out

    def init_hidden(self):
        return (torch.zeros(self.num_layers*2,self.batch_size,self.hidden_size//2),
                torch.zeros(self.num_layers*2,self.batch_size,self.hidden_size//2))


params={
    'num_layers':1,
    'hidden_size': 100,
    'input_size':50,
    'learning_rate':0.01,
    'optimizer':'adam',
    'epochs':10,
    'batch_size':1,
    'num_classes':10,
    'dropout':0.1
}
rnn=RNN(params=params)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(),lr=params['learning_rate'])



In [6]:

train_mini_set=train_dataset[:10000]
train_mini_set=TensorDataset(train_mini_set[0],train_mini_set[1])

In [7]:
len(train_dataset)

204566

In [22]:


def process_bar(num, total):
    rate = float(num)/total
    ratenum = int(100*rate)
    r = '\rModel training:[{}{}]{}%'.format('*'*ratenum,' '*(100-ratenum), ratenum)
    sys.stdout.write(r)
    sys.stdout.flush()

min_valid_loss = np.inf

train_dataloader = DataLoader(train_mini_set,
                           batch_size=1,
                           shuffle=False,
                           num_workers=0)


dev_dataloader = DataLoader(dev_dataset,
                       batch_size=1,
                       shuffle=False,
                       num_workers=0)

step=int(0.01*len(train_mini_set))
for epoch in range(params['epochs']):
    train_loss=0.

    progress=0.
    for data, label in train_dataloader:
        progress+=1
        if torch.cuda.is_available():
            data, label = data.cuda(), label.cuda()
        optimizer.zero_grad()  # Clear the gradients
        predict=rnn(data) # Forward Pass
        loss=criterion(predict,label)  # Find the Loss
        loss.backward()  # Calculate gradients
        optimizer.step() # Update Weights
        train_loss +=loss.item()
        if progress % step==1:
            process_bar(progress,len(train_mini_set))

    valid_loss=0.
    rnn.eval()
    targets=[]
    golds=[]
    for data, label in dev_dataloader:
        if torch.cuda.is_available():
            data, label = data.cuda(), label.cuda()

        target=rnn(data)
        loss=criterion(target,label)
        valid_loss=loss.item()*data.size(0)

        targets.append(torch.argmax(target).item())
        golds.append(torch.argmax(label).item())

    f1_scores_macro=f1_score(golds,targets,average='macro')
    print("\nMacro average F1_score is{0}".format(f1_scores_macro))

    f1_score_micro=f1_score(golds,targets,average='micro')
    print("Micro average F1_score is{0}".format(f1_score_micro))


    print("\nEpochs: {0} / {1}: Training Loss:{2} \t\t Validation Loss:{3}"
          .format(epoch,params['epochs'],train_loss / len(train_dataloader), valid_loss/len(dev_dataset)))
    if min_valid_loss > valid_loss:
        print('Validation Loss Decreased({0:.6f}--->{1:.6f}) \t Saving The Model'.format(min_valid_loss,valid_loss))
        min_valid_loss = valid_loss
        # Saving State Dict
        torch.save(rnn.state_dict(), 'saved_model.pth')



Model training:[*************************************************************************************************** ]99%
Macro average F1_score is0.4590279463598904
Macro average F1_score is0.8549159509083506

Epochs: 0 / 10: Training Loss:0.20784056379302482 		 Validation Loss:2.6022185808163976e-05
Validation Loss Decreased(inf--->1.342146) 	 Saving The Model
Model training:[*************************************************************************************************** ]99%
Macro average F1_score is0.46293739620292385
Macro average F1_score is0.8554200515733758

Epochs: 1 / 10: Training Loss:0.21082767813749206 		 Validation Loss:2.4449518578977546e-05
Validation Loss Decreased(1.342146--->1.261033) 	 Saving The Model
Model training:[*************************************************************************************************** ]99%
Macro average F1_score is0.4623176871756421
Macro average F1_score is0.8559435407255171

Epochs: 2 / 10: Training Loss:0.21370962603976695 		 Val

KeyboardInterrupt: 

In [9]:
torch.argmax(rnn(train_dataset[2500][0].view(1,50))).item()


1

In [10]:
rnn(train_dataset[2500][0].view(1,50))

tensor([[-15.0669,  -1.4674,  -3.3839,  -3.5529,  -7.3759, -11.6590,  -5.4836,
          -8.9921,  -6.8117, -10.7845]], grad_fn=<AddmmBackward0>)

In [11]:
num=1300
if torch.argmax(rnn(train_dataset[num][0].view(1,50)))==torch.argmax(train_dataset[num][1]):
    print('correct')
else:
    print('wrong')

correct


In [12]:
num_correct=0
for data, label in dev_dataset:
    if torch.argmax(torch.Tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]))==torch.argmax(label):
        num_correct+=1
print('Accuracy(baseline) on the test set is : {0:5.4f}'.format(num_correct/len(dev_dataset)))

num_correct=0
for data, label in dev_dataset:
    if torch.argmax(rnn(data.view(1,50)))==torch.argmax(label):
        num_correct+=1
print('Accuracy(Model) on the test set is : {0:5.4f}'.format(num_correct/len(dev_dataset)))

Accuracy(baseline) on the test set is : 0.0179
Accuracy(Model) on the test set is : 0.8499


In [16]:
def test(model, device, test_loader):
    model.eval() # 必备，将模型设置为评估模式
    test_loss = 0
    correct = 0
    criterion = nn.CrossEntropyLoss()
    with torch.no_grad(): # 禁用梯度计算
        for data, target in test_loader: # 从数据加载器迭代一个batch的数据
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item() # sum up batch loss
            # print(output.max(1, keepdim=True)[1])
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.max(1, keepdim=True)[1]).sum().item() # 统计预测正确个数

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

test_dataloader = DataLoader(test_dataset,
                           batch_size=1,
                           shuffle=False,
                           num_workers=0)
test(model=rnn,device=device,test_loader=test_dataloader)


Test set: Average loss: 1.4603, Accuracy: 37821/46665 (81%)



In [14]:
rnn

RNN(
  (bilstm): LSTM(50, 50, batch_first=True, dropout=0.1, bidirectional=True)
  (classifier): Linear(in_features=100, out_features=10, bias=True)
)

In [15]:
embeddings=[]
EMBED_DIM=50
for word in tran_pd.Word:
    vector=vocab.get(str(word).lower())
    if vector is not None:
        embeddings.append(vector)
    else:
        embeddings.append(np.zeros(EMBED_DIM))

tran_pd['Embedding']=embeddings


NameError: name 'tran_pd' is not defined

In [None]:
tran_pd

In [None]:
labels=tran_pd.NER


def onehot_encode(labels):
    labels_to_ids={k:v for v,k in enumerate(set(labels.to_list()))}
    result=[]
    for label in labels:
        vec=np.zeros(len(labels_to_ids))
        vec[labels_to_ids.get(label)]=1
        result.append(vec)
    return result

labels_OHE=onehot_encode(labels) # label encoded in one hot format
len(labels_OHE)



In [None]:
tran_X=tran_pd.Embedding.to_list()
tran_y=labels_OHE

In [27]:
filepath='data/train.conll'

df=pd.read_csv(filepath,delimiter='\t',names=['Word','POS','NP','NER'],skiprows=[0])

In [18]:
len(df.loc[df.NER=='O'])/len(df)

0.8122147219543555

In [46]:
sentences=[]
labels=[]
label=[]
sentence=[]
tag_dict={}
for index, item in enumerate(set(df.NER)):
    tag_dict[item]=index
for word,tag in zip(df.Word,df.NER):
    tag_id=tag_dict.get(tag)
    label.append(tag_id)
    sentence.append(word)
    if word =='.':
        labels.append(label)
        sentences.append(sentence)
        sentence=[]
        label=[]

In [47]:
num=52
len(sentences[num]),len(labels[num])

(13, 13)

In [49]:
labels[0]

[7, 9, 1, 9, 9, 9, 1, 9, 9]

In [54]:
maxlen=0
for sent in sentences:
    if len(sent)>maxlen:
        maxlen=len(sent)
        if maxlen>1200:
            print(' '.join(sent))
print('max length of a sentence from train is {0}'.format(maxlen))

BRUSSELS 1996-08-23 Leading results in the Brussels Grand Prix athletics meeting on Friday : Women 's discus 1. Ilke Wyludda ( Germany ) 66.60 metres 2. Ellina Zvereva ( Belarus ) 65.66 3. Franka Dietzsch ( Germany ) 61.74 4. Natalya Sadova ( Russia ) 61.64 5. Mette Bergmann ( Norway ) 61.44 6. Nicoleta Grasu ( Romania ) 61.36 7. Olga Chernyavskaya ( Russia ) 60.46 8. Irina Yatchenko ( Belarus ) 58.92 Women 's 100 metres hurdles 1. Ludmila Engquist ( Sweden ) 12.60 seconds 2. Michelle Freeman ( Jamaica ) 12.77 3. Aliuska Lopez ( Cuba ) 12.85 4. Dionne Rose ( Jamaica ) 12.88 5. Brigita Bukovec ( Slovakia ) 12.95 6. Yulia Graudin ( Russia ) 12.96 7. Julie Baumann ( Switzerland ) 13.36 8. Patricia Girard-Leno ( France ) 13.36 9. Dawn Bowles ( U.S. ) 13.53 Men 's 110 metres hurdles 1. Allen Johnson ( U.S. ) 12.92 seconds 2. Colin Jackson ( Britain ) 13.24 3. Emilio Valle ( Cuba ) 13.33 4. Sven Pieters ( Belgium ) 13.37 5. Steve Brown ( U.S. ) 13.38 6. Frank Asselman ( Belgium ) 13.64 7. Hu

In [64]:
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r',encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model
vocab=load_glove_model('word_embedding/glove.6B.50d/glove.6B.50d.txt')

Loading Glove Model
400000 words loaded!


In [65]:
sentences
def sentence_embedding(sentences):
    sents_embeddings=[]
    for sent in sentences:
        sent_embeddings=[]
        for word in sent:
            vector=vocab.get(str(word).lower())
            if vector is not None:
                sent_embeddings.append(vector)
            else:
                sent_embeddings.append(np.zeros(50))
        sents_embeddings.append(sent_embeddings)
    return sents_embeddings
sents_embeddings=sentence_embedding(sentences)

In [69]:
torch.Tensor(sents_embeddings[:1])

tensor([[[ 2.8050e-01,  9.6134e-02, -4.0411e-01, -4.3212e-01, -2.1813e-01,
           3.9900e-01,  1.1994e-01, -5.8819e-01,  1.6138e-01, -9.0326e-01,
           6.8040e-01,  7.9212e-02, -2.0700e-01,  8.6111e-01,  1.0581e+00,
           5.9307e-01,  9.3522e-01, -1.1357e+00,  9.1209e-01, -7.3258e-01,
           2.9839e-01, -6.5858e-01, -5.4395e-01,  1.2593e-01,  9.7543e-03,
          -1.4733e+00,  9.9091e-01,  4.3032e-01, -9.9453e-01,  6.8398e-01,
           3.0013e+00,  8.9257e-01, -1.1240e+00,  3.7326e-01, -1.1387e+00,
          -1.0753e+00, -9.3879e-02,  1.1037e-01, -5.2445e-01, -3.6921e-01,
          -1.1846e-01,  1.0811e-01,  1.6898e+00, -1.2099e+00, -3.5203e-01,
           6.7106e-01, -3.0708e-01,  1.7115e+00,  2.9010e-01, -4.1397e-01],
         [-1.5761e-01, -1.3796e-01, -4.2215e-01,  2.5714e-01,  2.7350e-01,
           8.0252e-01,  7.5804e-01,  2.5174e-01, -1.1099e-02,  5.5110e-01,
          -1.5435e-01, -8.2309e-02,  8.4994e-02, -2.3917e-01,  8.5194e-02,
           4.7798e-01,  