导入需要的库

In [2]:
import nltk
import numpy as np
from collections import Counter
import json
import time
import torch
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms
import math
import chardet
import pandas as pd
from sklearn.model_selection import train_test_split

数据预处理

In [3]:
def get_dataset(values):
    dataset = []
    for i in range(len(values)):
        guid = str(int(values[i][0]))
        label = values[i][1]
        if type(label) != str and math.isnan(label):
            label = None
        file = path_text + guid + '.txt'
        with open(file, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
            if encoding == "GB2312":
                encoding = "GBK"
        text = ''
        try:
            with open(file, encoding=encoding) as fp:
                for line in fp.readlines():
                    line = line.strip('\n')
                    text += line
        except UnicodeDecodeError:
            try:
                with open(file, encoding='ANSI') as fp:
                    for line in fp.readlines():
                        line = line.strip('\n')
                        text += line
            except UnicodeDecodeError:
                print('UnicodeDecodeError')
        dataset.append({
            'text': text,
            'label': label,
            'img': path_text + guid + '.jpg',
        })
    return dataset

path_train = 'train.txt'
path_test = 'test_without_label.txt'
path_text = 'data/'

写入到json文件中

In [4]:
train_data_val = pd.read_csv(path_train)
test_data = pd.read_csv(path_test)
# 划分训练集、验证集
train_data, val_data = train_test_split(train_data_val, test_size=0.2)
train_set = get_dataset(train_data.values)
val_set = get_dataset(val_data.values)
test_set = get_dataset(test_data.values)

with open('data_json/train.json', 'w', encoding="utf-8") as f:
    json.dump(train_set, f)

with open('data_json/val.json', 'w', encoding="utf-8") as f:
    json.dump(val_set, f)

with open('data_json/test.json', "w", encoding="utf-8") as f:
    json.dump(test_set, f)

In [5]:
with open('data_json/train.json','r') as f:
    train_data = json.load(f)
with open('data_json/val.json','r') as f:
    val_data = json.load(f)
with open('data_json/test.json','r') as f:
    test_data = json.load(f)

In [6]:
def process_list(data, flag):
    text_or_label = 'text' if flag == 1 else 'label'
    return [item[text_or_label] for item in data]

# text 和 label 分别存储
train_text_list = process_list(train_data, 1)
val_text_list = process_list(val_data, 1)
train_labels = process_list(train_data, 0)
val_labels = process_list(val_data, 0)

构建词汇表

In [7]:
words = Counter() 
i=0
for text in train_text_list:
    words_list = nltk.word_tokenize(text) 
    words.update(words_list)  
    train_text_list[i] = words_list
    i+=1

words = {k:v for k,v in words.items() if v>1}
words = sorted(words, key=words.get,reverse=True)
words = ['_PAD'] + words
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

In [8]:
i=0
for text in train_text_list:  
    train_text_list[i] = [word2idx[word] if word in word2idx else 0 for word in text]
    i+=1

i=0
for text in val_text_list:
    val_text_list[i] = [word2idx[word] if word in word2idx else 0 for word in nltk.word_tokenize(text)]
    i+=1

In [9]:
def padding(text_list, seq_len):
    features = np.zeros((len(text_list), seq_len),dtype=int)
    i=0
    for text in text_list:
        features[i, -len(text):] = np.array(text)[:seq_len]
        i+=1
    return features

train_text = padding(train_text_list, 200)
val_text = padding(val_text_list, 200)

In [10]:
def change_label(data_labels):
  i = 0
  for label in data_labels:
    if(data_labels[i]=='negative'):
      data_labels[i]=0
    elif(data_labels[i]=='positive'):
        data_labels[i]=1
    elif(data_labels[i]=='neutral'):
        data_labels[i]=2
    i+=1
  return data_labels

train_labels = np.array(change_label(train_labels))
val_labels = np.array(change_label(val_labels))

消融实验：仅文本训练

In [11]:
batch_size = 128

train_Data = TensorDataset(torch.from_numpy(train_text), torch.from_numpy(train_labels))
val_Data = TensorDataset(torch.from_numpy(val_text), torch.from_numpy(val_labels))

train_loader = DataLoader(train_Data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_Data, shuffle=True, batch_size=batch_size)

In [12]:
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        return F.max_pool1d(x, kernel_size=x.shape[2]) 
    

class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels = 2*embed_size, 
                          out_channels = c, 
                          kernel_size = k))
        self.decoder = nn.Linear(sum(num_channels), 3)
        self.dropout = nn.Dropout(0.3)

    def forward(self, inputs):
        embeddings = torch.cat((
            self.embedding(inputs), 
            self.constant_embedding(inputs)), dim=2)
        embeddings = embeddings.permute(0, 2, 1)
        encoding = torch.cat([
            self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

embed_size, kernel_sizes, nums_channels = 200, [2, 3, 4], [100, 100, 100]
net = TextCNN(words, embed_size, kernel_sizes, nums_channels)

In [13]:
def train(train_iter, test_iter, net, loss, optimizer,  num_epochs):
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y.to(torch.int64))
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()
            else:
                if('is_training' in net.__code__.co_varnames):
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

In [17]:
lr = 0.0005
num_epochs = 10
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
print("text-only training start")
train(train_loader, val_loader, net, loss, optimizer, num_epochs)

text-only training start
epoch 1, loss 0.9825, train acc 0.556, test acc 0.633, time 26.9 sec
epoch 2, loss 0.3883, train acc 0.660, test acc 0.649, time 39.2 sec
epoch 3, loss 0.2196, train acc 0.731, test acc 0.660, time 38.1 sec
epoch 4, loss 0.1417, train acc 0.785, test acc 0.670, time 26.3 sec
epoch 5, loss 0.0968, train acc 0.841, test acc 0.676, time 25.3 sec
epoch 6, loss 0.0667, train acc 0.880, test acc 0.687, time 28.6 sec
epoch 7, loss 0.0478, train acc 0.903, test acc 0.694, time 44.2 sec
epoch 8, loss 0.0340, train acc 0.937, test acc 0.705, time 34.3 sec
epoch 9, loss 0.0252, train acc 0.948, test acc 0.707, time 30.9 sec
epoch 10, loss 0.0192, train acc 0.954, test acc 0.711, time 42.8 sec


消融实验：仅图片训练

In [18]:
class ToDataset(Dataset):
    def __init__(self,main_dir):
        self.dataset=[]
        i=0
        for data in main_dir:
          self.dataset.append([data['img'],data['label'],i])
          i+=1

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        img,label,i=self.dataset[index]
        img_data=self.data_process(Image.open(img))
        if(label=='negative'):
          label = 0
        elif(label=='positive'):
          label = 1
        elif(label=='neutral'):
          label = 2
        elif(label is None):
          label = -1
        return img_data,label

    def data_process(self,x):
        return transforms.Compose(
        [
            transforms.Resize((256,256)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.5,0.5,0.5],
                std=[0.5,0.5,0.5],
            ),
        ]
    )(x)

In [19]:
train_loader = DataLoader(dataset=ToDataset(train_data), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=ToDataset(val_data), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=ToDataset(test_data), batch_size=batch_size)

In [20]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=3,   
                out_channels=16, 
                kernel_size=5,  
                stride=1, 
                padding=2, 
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4)
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,   
                out_channels=32,
                kernel_size=5,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4) 
        )
        self.output = nn.Linear(in_features=32*16*16, out_features=3)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)        
        x = x.view(x.size(0), -1)
        output = self.output(x)
        return output

In [22]:
cnn = CNN()
optimizer = torch.optim.Adam(cnn.parameters(), lr=lr)
print("photo-only training start")
train(train_loader, val_loader, cnn, loss, optimizer, num_epochs)

photo-only training start
epoch 1, loss 0.9144, train acc 0.591, test acc 0.599, time 246.8 sec
epoch 2, loss 0.4454, train acc 0.597, test acc 0.599, time 180.9 sec
epoch 3, loss 0.2936, train acc 0.597, test acc 0.596, time 191.8 sec
epoch 4, loss 0.2179, train acc 0.600, test acc 0.599, time 188.0 sec
epoch 5, loss 0.1728, train acc 0.602, test acc 0.600, time 179.0 sec
epoch 6, loss 0.1414, train acc 0.603, test acc 0.604, time 154.6 sec
epoch 7, loss 0.1193, train acc 0.611, test acc 0.607, time 146.9 sec
epoch 8, loss 0.1016, train acc 0.635, test acc 0.609, time 147.3 sec
epoch 9, loss 0.0882, train acc 0.633, test acc 0.611, time 137.0 sec
epoch 10, loss 0.0758, train acc 0.654, test acc 0.615, time 132.9 sec


多模态融合模型

In [35]:
def multiModel(data,data_loader):
  def predictText(net, vocab, sentence):
      device = list(net.parameters())[0].device
      sentence = torch.tensor([word2idx[word] if word in word2idx else 0 for word in sentence], device=device)
      label = torch.argmax(net(sentence.view((1, -1))), dim=1)
      if(label.item()==0):
        return 'negative'
      elif(label.item()==1):
        return 'positive'
      else:
        return 'neutral'
      
  predict_text=[]
  for i in range(len(data)):
    if(len(nltk.word_tokenize(data[i]['text']))>=4):
      predict_text.append(predictText(net, words, nltk.word_tokenize(data[i]['text'])))
    else:
      predict_text.append('neutral')

  predict=[]
  with torch.no_grad():
      for X, y in data_loader:
          if isinstance(cnn, torch.nn.Module):
              predict.append(cnn(X).argmax(dim=1))

  predict_pic=[]
  for batch in predict:
    for i in batch:
      if(i==0):
        predict_pic.append('negative')
      elif(i==1):
        predict_pic.append('positive')
      elif(i==2):
        predict_pic.append('neutral')

  predict_final=[]
  for i in range(len(data)):
    if(predict_text[i]=='positive' and predict_pic[i]=='positive'):
      predict_final.append('positive')
    elif(predict_text[i]=='positive' and predict_pic[i]=='neutral'):
      predict_final.append('positive')
    elif(predict_text[i]=='positive' and predict_pic[i]=='negative'):
      predict_final.append('positive')
    elif(predict_text[i]=='negative' and predict_pic[i]=='positive'):
      predict_final.append('negative')
    elif(predict_text[i]=='negative' and predict_pic[i]=='neutral'):
      predict_final.append('negative')
    elif(predict_text[i]=='negative' and predict_pic[i]=='negative'):
      predict_final.append('negative')
    elif(predict_text[i]=='neutral' and predict_pic[i]=='positive'):
      predict_final.append('positive')
    elif(predict_text[i]=='neutral' and predict_pic[i]=='negative'):
      predict_final.append('negative')
    else:
      predict_final.append('neutral')

  return predict_final


In [37]:
val_predict = multiModel(val_data,val_loader)
acc_count = 0
for i in range(len(val_data)):
  if val_predict[i] == val_data[i]["label"]:
    acc_count += 1
print(acc_count/len(val_data))

0.6875


In [29]:
test_predict = multiModel(test_data,test_loader)
test_data_file = pd.read_csv("test_without_label.txt")['guid'].values
with open('test_without_label.txt','w') as f:
  f.write('guid,tag\n')
  for i in range(len(test_data_file)):
    f.write(str(test_data_file[i])+','+str(test_predict[i])+'\n')
f.close()