In [1]:
import pandas as pd

# Read the tab-separated training split and preview its first rows.
# read_table uses '\t' as its default delimiter.
df = pd.read_table('data/train.tsv/train.tsv')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [2]:
# Show the full, untruncated text of the first phrase (row label 0).
df.loc[0, 'Phrase']

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [3]:
import string 

'''
    单词预处理，将单词全部小写，并且去除标点符号
'''
def preprocessing(phrase):
    """Normalize a collection of phrases for word-vector lookup.

    Each phrase is lower-cased, stripped of every character in
    ``string.punctuation``, and has its whitespace collapsed so that words
    are separated by exactly one space with no leading/trailing blanks.

    Args:
        phrase: Iterable of phrases (for example a pandas Series or a list).
            Missing entries (``None`` / NaN) are mapped to the empty string;
            any other non-string value is converted with ``str()`` first.

    Returns:
        list[str]: One normalized string per input element, in input order.
    """
    # Build the deletion table once instead of once per phrase.
    punct_table = str.maketrans('', '', string.punctuation)

    res = []
    for text in phrase:
        if not isinstance(text, str):
            # NaN is the only common value that is not equal to itself; some
            # NA sentinels raise on truth-testing, so treat those as missing too.
            try:
                missing = text is None or bool(text != text)
            except (TypeError, ValueError):
                missing = True
            text = '' if missing else str(text)
        # split()/join collapses the gaps left behind by removed punctuation.
        res.append(' '.join(text.lower().translate(punct_table).split()))

    return res

# Replace the raw phrases with their normalized form, then preview the result.
df = df.assign(Phrase=preprocessing(df['Phrase']))
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,2,1,a series of escapades demonstrating the adage ...,2
2,3,1,a series,2
3,4,1,a,2
4,5,1,series,2


In [4]:
# Same phrase as before, now lower-cased and without punctuation.
df.loc[0, 'Phrase']

'a series of escapades demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amounts to much of a story'

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset,TensorDataset
import numpy as np


# 定义数据读取类
class MyDataset(Dataset):
    """Map-style ``torch.utils.data.Dataset`` of averaged GloVe phrase vectors.

    Every phrase in ``df['Phrase']`` is converted once, at construction time,
    into a single ``word_len``-dimensional vector (the mean of its word
    vectors); ``df['Sentiment']`` supplies the integer labels.
    """

    def __init__(self, df, vocab_path='data/glove.6B/glove.6B.300d.txt', word_len=300,
                 word_to_vec_map=None):
        """Build the feature matrix and label list.

        Args:
            df: Frame with a ``Phrase`` column (text) and a ``Sentiment``
                column (integer class ids).
            vocab_path: Path to a GloVe text file, one ``word v1 v2 ...`` entry
                per line. Only read when ``word_to_vec_map`` is not given.
            word_len: Dimensionality of the word vectors.
            word_to_vec_map: Optional pre-loaded ``{word: vector}`` mapping.
                Passing the mapping of an existing dataset avoids parsing the
                embedding file again for every split.
        """
        super(MyDataset, self).__init__()

        self.vocab_path = vocab_path
        self.wordlen = word_len
        if word_to_vec_map is None:
            _, _, word_to_vec_map = self.load_glove_embeddings()
        self.word_to_vec_map = word_to_vec_map

        # nan_to_num is kept as a safety net in case the embedding file itself
        # contains NaN entries; empty phrases no longer produce NaN.
        self.data = np.nan_to_num(
            np.array([self.sentence_to_avg(text) for text in df['Phrase']]),
            nan=0,
        )
        self.label = list(df['Sentiment'])

    def __getitem__(self, index):
        """Return ``(features, label)`` for one sample.

        ``features`` is a float32 tensor built from row ``index`` of the
        feature matrix; ``label`` is an int64 (long) scalar tensor.
        """
        d = torch.tensor(self.data[index], dtype=torch.float32)
        l = torch.tensor(self.label[index], dtype=torch.long)

        return d, l

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.data.shape[0]

    def sentence_to_avg(self, sentence):
        """Convert a sentence into the mean of its word vectors.

        Words missing from the vocabulary contribute a zero vector but still
        count toward the divisor. A sentence with no words (empty,
        whitespace-only, or a non-string such as NaN) yields the all-zero
        vector instead of dividing by zero.

        Args:
            sentence: Text to embed.

        Returns:
            numpy.ndarray of shape ``(word_len,)``.
        """
        avg = np.zeros(self.wordlen,)

        if not isinstance(sentence, str):
            return avg

        words = sentence.lower().strip().split()
        if not words:
            # Previously 0 / 0 -> NaN plus a RuntimeWarning.
            return avg

        for w in words:
            vec = self.word_to_vec_map.get(w)
            if vec is not None:
                avg += vec

        return avg / len(words)

    def load_glove_embeddings(self):
        """Parse the GloVe text file at ``self.vocab_path``.

        Returns:
            tuple: ``(words_to_index, index_to_words, word_to_vec_map)`` where
            indices start at 1 and follow the sorted order of the words, and
            ``word_to_vec_map`` maps each word to a float32 numpy vector.
        """
        word_to_vec_map = {}
        with open(self.vocab_path, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float32)

        words_to_index = {}
        index_to_words = {}
        for i, w in enumerate(sorted(word_to_vec_map), start=1):
            words_to_index[w] = i
            index_to_words[i] = w

        return words_to_index, index_to_words, word_to_vec_map

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a dev set; the fixed seed keeps the split reproducible.
# NOTE(review): this is a row-level random split, and rows sharing a SentenceId are
# sub-phrases of the same sentence, so near-duplicates can land in both splits and
# dev metrics may be optimistic — consider a split grouped by SentenceId.
train_df,dev_df = train_test_split(df,test_size=0.1,random_state=42)

In [7]:
# Training batches are reshuffled every epoch.
train_dataset = MyDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Evaluation does not need shuffling; a fixed order keeps dev runs comparable.
dev_dataset = MyDataset(dev_df)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False)

  avg = avg / len(words)


In [8]:
# Sanity-check one sample: index the dataset directly instead of iterating over
# every row (which built a tensor per example just to print the first one).
trains, labels = train_dataset[0]
print(trains)
print(labels)

tensor([ 2.1276e-03, -2.2197e-01,  6.4326e-02, -2.7800e-01, -6.1729e-02,
        -1.9566e-01, -1.9008e-01, -4.1553e-02, -1.0493e-01, -5.0115e-01,
         1.4418e-01, -6.9267e-02, -2.1731e-01,  1.1661e-02, -2.0700e-04,
         9.1448e-02, -1.3151e-01,  1.0881e-02,  1.4332e-01,  4.0839e-02,
         5.5388e-02, -1.0229e-01,  1.5621e-01, -1.4271e-02,  1.6602e-02,
         8.3729e-02,  1.1173e-01, -2.0185e-01, -1.3646e-02, -1.8593e-01,
         1.4688e-01,  1.5315e-01,  1.9309e-01,  1.2108e-01, -4.7660e-01,
        -1.2649e-01,  2.2400e-01,  1.9865e-01,  3.9129e-02,  2.7049e-02,
         6.7611e-02,  1.2576e-02, -1.3100e-01,  1.5875e-01, -5.0877e-02,
        -1.3839e-01, -6.2898e-02,  5.5798e-02, -4.2659e-02, -1.5957e-01,
        -2.6173e-03, -1.2671e-01, -7.8191e-02, -6.9357e-02,  9.5163e-02,
         1.0222e-01,  7.8807e-02,  1.7340e-01,  2.4614e-02, -9.5348e-02,
        -2.8437e-02, -7.5854e-02,  9.2298e-02,  1.0740e-01, -1.6900e-02,
        -2.2181e-01, -7.6209e-02, -2.9010e-01,  3.6

训练NN模型

In [9]:
from importlib import import_module

# Pick the architecture by name and import its module from the `models` package.
model_name = 'NN'

x = import_module(f'models.{model_name}')

# The module exposes a Config class holding the hyper-parameters.
config = x.Config()
print('all class number : ', config.num_classes)

all class number :  5


In [10]:
from train import train

# Instantiate the selected architecture and move it to the configured device.
model = x.Model(config).to(config.device)

# NOTE(review): dev_loader is passed twice — presumably once as the validation
# loader and once as the test loader; confirm against train.train's signature.
train(config,model,train_loader,dev_loader,dev_loader)

Epoch [1/50]


100%|██████████| 4390/4390 [00:24<00:00, 179.10it/s]


train loss : 0.9609 ,train acc:0.602 , dev loss : 0.9788,dev acc : 0.592 
saving model ...
Epoch [2/50]


100%|██████████| 4390/4390 [00:23<00:00, 188.29it/s]


train loss : 0.9047 ,train acc:0.622 , dev loss : 0.9430,dev acc : 0.608 
saving model ...
Epoch [3/50]


100%|██████████| 4390/4390 [00:23<00:00, 186.42it/s]


train loss : 0.8572 ,train acc:0.639 , dev loss : 0.9113,dev acc : 0.615 
saving model ...
Epoch [4/50]


100%|██████████| 4390/4390 [00:23<00:00, 184.84it/s]


train loss : 0.8034 ,train acc:0.664 , dev loss : 0.9019,dev acc : 0.625 
saving model ...
Epoch [5/50]


100%|██████████| 4390/4390 [00:23<00:00, 187.07it/s]


train loss : 0.7736 ,train acc:0.679 , dev loss : 0.8889,dev acc : 0.634 
saving model ...
Epoch [6/50]


100%|██████████| 4390/4390 [00:23<00:00, 187.25it/s]


train loss : 0.7400 ,train acc:0.693 , dev loss : 0.8775,dev acc : 0.642 
saving model ...
Epoch [7/50]


100%|██████████| 4390/4390 [00:23<00:00, 184.75it/s]


train loss : 0.7137 ,train acc:0.702 , dev loss : 0.8909,dev acc : 0.641 
Epoch [8/50]


100%|██████████| 4390/4390 [00:23<00:00, 185.67it/s]


train loss : 0.6842 ,train acc:0.714 , dev loss : 0.8965,dev acc : 0.640 
Epoch [9/50]


100%|██████████| 4390/4390 [00:23<00:00, 187.43it/s]


train loss : 0.6519 ,train acc:0.728 , dev loss : 0.8886,dev acc : 0.644 
Epoch [10/50]


100%|██████████| 4390/4390 [00:23<00:00, 188.31it/s]


train loss : 0.6438 ,train acc:0.734 , dev loss : 0.9026,dev acc : 0.642 
Epoch [11/50]


100%|██████████| 4390/4390 [00:23<00:00, 186.64it/s]


train loss : 0.6121 ,train acc:0.746 , dev loss : 0.8967,dev acc : 0.651 
Epoch [12/50]


100%|██████████| 4390/4390 [00:23<00:00, 186.04it/s]


train loss : 0.5975 ,train acc:0.749 , dev loss : 0.9429,dev acc : 0.645 
Epoch [13/50]


 17%|█▋        | 766/4390 [00:04<00:19, 186.61it/s]


KeyboardInterrupt: 

使用NN模型预测结果

In [11]:
# Read the tab-separated test split (no Sentiment column) and preview it.
# From here on `df` refers to the test data rather than the training data.
df = pd.read_table('data/test.tsv/test.tsv')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [12]:
from tqdm import tqdm

'''
    使用model模型预测test_loader标签
'''
def predict_test_data(config,model,test_loader):
    """Predict a class index for every sample served by ``test_loader``.

    The model is switched to evaluation mode and gradient tracking is
    disabled while predicting, so layers such as dropout or batch-norm behave
    deterministically and no autograd graph is built. The model's previous
    train/eval mode is restored before returning.

    Args:
        config: Object with a ``device`` attribute; each batch is moved there.
        model: Callable mapping a batch of features to per-class scores
            (assumed to be a ``torch.nn.Module`` — it must provide
            ``training``, ``eval()`` and ``train()``).
        test_loader: Iterable of ``(features, label)`` batches; the labels
            are ignored.

    Returns:
        list[int]: Predicted class indices in loader order.
    """
    pred = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X, _ in tqdm(test_loader):
                out = model(X.to(config.device))
                # argmax over the class dimension gives the predicted label index.
                pred.extend(out.argmax(dim=1).cpu().tolist())
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return pred

In [13]:
# Reload the test split and normalize its phrases the same way as the training data.
# MyDataset reads a Sentiment column, so attach a constant placeholder label (2);
# the prediction loop ignores it.
df = pd.read_table('data/test.tsv/test.tsv')
df = df.assign(Phrase=preprocessing(df['Phrase']), Sentiment=2)

In [14]:
# Keep the original row order so predictions line up with the submission file.
test_dataset = MyDataset(df)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

  avg = avg / len(words)


In [15]:
# Path of the checkpoint written during training.
model_name = 'output/NN_2023-02-17_13-04-32'
print('load_model name : ',model_name)

# Build a fresh model and restore the saved weights. map_location remaps the
# stored tensors onto the configured device, so a checkpoint saved on another
# device (e.g. GPU) can still be loaded here.
net = x.Model(config).to(config.device)
net.load_state_dict(torch.load(model_name, map_location=config.device))

test_pred = predict_test_data(config,net,test_loader)

load_model name :  output/NN_2023-02-17_13-04-32


100%|██████████| 2072/2072 [00:02<00:00, 914.44it/s] 


In [16]:
# Start from the sample submission and overwrite its Sentiment column with the
# model's predictions (one per row, in file order).
Submission_path = 'data/sampleSubmission.csv'
submission = pd.read_csv(Submission_path).assign(Sentiment=test_pred)

In [17]:
# Write the filled-in submission to disk; index=False keeps the row index out of the CSV.
submission.to_csv('NN_submission.csv',index=False)