In [1]:
import pandas as pd
import string 

'''
    单词预处理，将单词全部小写，并且去除标点符号
'''
def preprocessing(phrase):
    lower = [phras.lower() for phras in phrase]    # 将字母全部小写
    no_punct = [text.translate(str.maketrans('','',string.punctuation)) for text in lower]   # 去掉标点符号
    sp = [text.split() for text in no_punct]
    res = [' '.join(lis) for lis in sp]

    return res


def Get_Preprocess_Data(path):
    df = pd.read_csv(path,sep='\t')
    print("原始数据标签统计： ")
    print(df['gold_label'].value_counts())
    df = df[['gold_label','sentence1','sentence2']]
    print("")
    print("去除Nan非法制以及非法标签中... ")
    print("")
    df = df.dropna()
    df = df[df.gold_label.isin(['entailment','neutral','contradiction'])]
    print("处理后数据标签统计： ")
    print(df.gold_label.value_counts())


    df['sentence1'] = preprocessing(df['sentence1'])
    df['sentence2'] = preprocessing(df['sentence2'])
    
    return df

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset,TensorDataset
import numpy as np


# 定义数据读取类
class MyDataset(Dataset):
    """
    步骤一：继承paddle.io.Dataset类
    """
    def __init__(self,df,vocab_path='data/glove.6B/glove.6B.300d.txt',max_sentence_len=64,word_len=300):
        """
        步骤二：实现构造函数，定义数据读取方式，划分训练和测试数据集
        """
        super(MyDataset, self).__init__()
        
        self.vocab_path = vocab_path
        self.wordlen = word_len
        self.max_sentence_len = max_sentence_len
        _, _, self.word_to_vec_map = self.load_glove_embeddings()

        self.text1 = df['sentence1'].values
        self.text2 = df['sentence2'].values

        #self.data =np.nan_to_num(np.array([self.sentence_to_avg(text) for text in df['Phrase']]),nan=0)


        self.label = [['entailment','neutral','contradiction'].index(la) for la in df['gold_label']]


    def __getitem__(self, index):
        """
        步骤三:实现__getitem__方法,定义指定index时如何获取数据,并返回单条数据(训练数据，对应的标签)
        """

        data1,mask1 = self.sentence_to_avg(self.text1[index])
        convert_data1 = np.nan_to_num(np.array(data1),nan=0)
        d1 = torch.tensor(convert_data1,dtype=torch.float32)  

        data2,mask2 = self.sentence_to_avg(self.text2[index])
        convert_data2 = np.nan_to_num(np.array(data2),nan=0)
        d2 = torch.tensor(convert_data2,dtype=torch.float32)  

        l = torch.tensor(self.label[index],dtype=torch.long)

        return (d1,d2,mask1,mask2),l

    def __len__(self):
        """
        步骤四:实现__len__方法:返回数据集总数目
        """
        return len(self.label)
    

    # 将句子转换为向量
    def sentence_to_avg(self,sentence):
        words = sentence.lower().strip().split()
        
        se = np.zeros((self.max_sentence_len,self.wordlen))
        mask = np.zeros((self.max_sentence_len,))
        
        for i in range(min(self.max_sentence_len,len(words))):
            if words[i] in self.word_to_vec_map.keys():  # 如果不在词表里面，则该向量设置为全零
                se[i,:]= self.word_to_vec_map[words[i]]
            mask[i] = 1

        return se,mask

    # 加载GloVe词嵌入
    def load_glove_embeddings(self):
        with open(self.vocab_path, 'r', encoding='utf-8') as f:
            words = set()
            word_to_vec_map = {}
            
            for line in f:
                line = line.strip().split()
                curr_word = line[0]
                words.add(curr_word)
                word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float32)
            
            i = 1
            words_to_index = {}
            index_to_words = {}
            for w in sorted(words):
                words_to_index[w] = i
                index_to_words[i] = w
                i = i + 1
        return words_to_index, index_to_words, word_to_vec_map

In [3]:
train_df = Get_Preprocess_Data('../Datasets/snli_1.0/snli_1.0/snli_1.0_train.txt')
dev_df = Get_Preprocess_Data('../Datasets/snli_1.0/snli_1.0/snli_1.0_dev.txt')
test_df = Get_Preprocess_Data('../Datasets/snli_1.0/snli_1.0/snli_1.0_test.txt')


原始数据标签统计： 
entailment       183416
contradiction    183187
neutral          182764
-                   785
Name: gold_label, dtype: int64

去除Nan非法制以及非法标签中... 

处理后数据标签统计： 
entailment       183414
contradiction    183185
neutral          182762
Name: gold_label, dtype: int64
原始数据标签统计： 
entailment       3329
contradiction    3278
neutral          3235
-                 158
Name: gold_label, dtype: int64

去除Nan非法制以及非法标签中... 

处理后数据标签统计： 
entailment       3329
contradiction    3278
neutral          3235
Name: gold_label, dtype: int64
原始数据标签统计： 
entailment       3368
contradiction    3237
neutral          3219
-                 176
Name: gold_label, dtype: int64

去除Nan非法制以及非法标签中... 

处理后数据标签统计： 
entailment       3368
contradiction    3237
neutral          3219
Name: gold_label, dtype: int64


In [4]:
from importlib import import_module

model_name = 'ESIM'

x = import_module('models.' + model_name)
config = x.Config()
print('all class number : ',config.num_classes)

train_dataset = MyDataset(train_df,max_sentence_len=64)
train_loader = DataLoader(train_dataset,config.batch_size,shuffle=True)

dev_dataset = MyDataset(dev_df,max_sentence_len=64)
dev_loader = DataLoader(dev_dataset,config.batch_size,shuffle=True)

test_dataset = MyDataset(test_df,max_sentence_len=64)
test_loader = DataLoader(test_dataset,config.batch_size,shuffle=False)

all class number :  3


In [5]:
for X,y in dev_loader:
    print(X[0])
    print(X[1])
    print(X[2])
    print(X[3])
    print(y)
    break

X = [d.to(config.device) for d in X]

tensor([[[-0.2971,  0.0940, -0.0967,  ...,  0.0597, -0.2285,  0.2960],
         [-0.5182, -0.1381, -0.4119,  ...,  0.3734, -0.0302,  0.7301],
         [-0.3018, -0.1055,  0.1984,  ..., -0.4849, -0.3342,  0.3819],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.2971,  0.0940, -0.0967,  ...,  0.0597, -0.2285,  0.2960],
         [-0.1035,  0.1513,  0.3629,  ...,  0.3170, -0.4615, -0.1909],
         [-0.5316, -0.0945, -0.6418,  ...,  0.2645, -0.0780,  0.4768],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.2971,  0.0940, -0.0967,  ...,  0.0597, -0.2285,  0.2960],
         [-0.2978, -0.1326, -0.1451,  ...,  0

In [6]:
from train import train

model = x.Model(config).to(config.device)

train(config,model,train_loader,dev_loader,test_loader)

Epoch [1/50]


100%|██████████| 8584/8584 [11:58<00:00, 11.95it/s]


train loss : 0.7625 ,train acc:0.783 , dev loss : 0.7505,dev acc : 0.795 ,test acc : 0.800
saving model ...
Epoch [2/50]


  8%|▊         | 686/8584 [00:56<10:53, 12.09it/s]


KeyboardInterrupt: 