In [42]:
from gensim.models import word2vec
from gensim.models import Word2Vec
import numpy as np
import os
from torch import nn
import torch
from torch.utils import data
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
import pandas as pd

In [46]:
def read_data(path, data_mode):
    with open(path, 'r') as f:
        sentences = f.readlines()
        if data_mode == "train_with_label":
            x = [sentence.strip("\n").split(" ")[2:] for sentence in sentences]
            y = [sentence[0] for sentence in sentences]
            return x, y
        elif data_mode =="train_no_label":
            x = [sentence.strip("\n").split(" ") for sentence in sentences]
            return x
        else:
            x = [(" ".join(sentence.strip("\n").split(",")[1:])).split(" ") for sentence in sentences[1:]]
            return x  

In [3]:
# w2v.py
# 训练word embedding的model
def train_word2vec():
    print("loading data...")
    train_x, _ = read_data("DATA/hw4/training_label.txt", "train_with_label")
    train_x_no_label = read_data("DATA/hw4/training_nolabel.txt", "train_no_label")
    test_x = read_data("DATA/hw4/testing_data.txt","test")
    
    print("train word2vec model...")
    # list累加--> concat
    x = train_x + train_x_no_label + test_x
    model = word2vec.Word2Vec(x, size=250, window=10, min_count=5, workers=12, iter=20, sg=1)
    
    print("saving model...")
    model.save("DATA/hw4/models/w2v_all.model")

In [4]:
train_word2vec()

loading data...
train word2vec model...
saving model...


## preprocess data

In [5]:
# preprocess.py
from gensim.models import Word2Vec
class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path):
        self.sentences = sentences
        self.sen_len = sen_len
        self.embedding = Word2Vec.load(w2v_path)
        self.embedding_dim = self.embedding.vector_size
        self.word2idx = {}
        self.embedding_matrix = []
    
    # 补充embedding，例如PAD, UNK
    def add_embedding(self, word):
        vector = torch.empty(self.embedding_dim,1)
        # 每次的值都不一样
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.embedding_matrix.append(vector)
    
    # 制作embedding matrix
    def make_embedding(self):
        for i, word in enumerate(self.embedding.wv.vocab):
            self.word2idx[word] = len(self.word2idx)
            self.embedding_matrix.append(self.embedding[word])
        
        # add <PAD> and <UNK> to embedding
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        
        # 从二维list转为tensor
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        return self.embedding_matrix
    
    # 把sentence截取或补全成一样长度
    def pad_sentence(self, sentence):
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        return sentence
    
    # 把训练集中每个sentence都转换成word embedding的形式
    def sentence_word2idx(self):
        # sentence_list中存的是每个sentence中word转为idx的list
        # 其实也就是one hot的形式
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            sentence_idx = []
            for word in sen:
                if word not in self.word2idx.keys():
                    word = "<UNK>"
                sentence_idx.append(self.word2idx[word])
            sentence_idx = self.pad_sentence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    
    # 把 y 转成int
    def labels_to_tensor(self, y):
        y = [int(label) for label in y]
        return torch.LongTensor(y)

## Dataset

In [19]:
# data.py
# 需实现__init__(), __getitem__(), __len__()
class TwitterDataset(data.Dataset):
    def __init__(self, X, y):
        self.data = X
        self.label = y
    
    def __getitem__(self, idx):
        if self.label is None: return self.data[idx]
        else: return self.data[idx], self.label[idx]
    
    def __len__(self):
        return len(self.data)

## Model

In [7]:
# model.py
class LSTM_Net(nn.Module):
    # 此处的embedding是embedding matrix
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super(LSTM_Net, self).__init__()
        # embedding layer
        self.embedding = torch.nn.Embedding(embedding.size(0), embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        
        self.embedding.weight.requires_grad = False if fix_embedding else True
        self.embedding_dim = embedding.size(1)
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential(nn.Dropout(dropout),
                                         nn.Linear(hidden_dim, 1),
                                         nn.Sigmoid())
    
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        # x 的 dimension （batch, seq_len, hidden_size）
        # 取 LSTM 最后一层的hidden state
        x = x[:, -1, :]
        x = self.classifier(x)
        return x

## Train

In [37]:
def evaluation(outputs, labels):
    # outputs 是 probability（float, 0.0 ~ 1.0）
    # labels 是 float( 0 or 1)
    outputs[outputs>=0.5] = 1
    outputs[outputs<0.5] = 0
    correct = torch.sum(torch.eq(outputs, labels)).item()

    return correct

In [38]:
# train.py
def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
    # 全部的参数
    total = sum(p.numel() for p in model.parameters())
    # 需要模型训练的参数
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("\n start training, parameter total:{}, trainable:{}\n".format(total, trainable))
    
    
    # 定义损失函数, BCELoss --> Binary cross entropy loss
    criteration = nn.BCELoss()
    t_batch = len(train) # 此处train的len，即代表train中有几个batch
    v_batch = len(valid) # 同上
    # optimizer定义为Adam
    optimizer = optim.Adam( model.parameters(), lr=lr)
    # 记录模型训练
    total_loss, total_acc, best_acc = 0, 0, 0
    
    
    for epoch in range(n_epoch):
        # 模型设为train模式
        model.train()
        total_loss, total_acc = 0, 0
        # 做 training
        for i, (inputs, labels) in enumerate(train):
            # device 为 cuda， inputs转为 torch.cuda.LongTensor
            inputs = inputs.to(device, dtype=torch.long)
            # device 为 cuda，inputs转为 torch.cuda.FloatTensor, 因为要给criteration，所以需要是float
            labels = labels.to(device, dtype=torch.float)
            
            # 保证每个batch训练前 loss.backward()的gradient归零，否则会自动累加
            optimizer.zero_grad()
            # 把 input 喂给模型
            outputs = model(inputs)
            # 把outputs最外层dimension去掉，以喂给criteration
            outputs = outputs.squeeze()
            # 把outputs给criteration，计算当前loss
            loss = criteration(outputs, labels)
            # 计算当前loss的gradient
            loss.backward()
            # 更新训练模型参数
            optimizer.step()
            # 计算此时模型的acc
            correct = evaluation(outputs, labels)
            
            
            total_loss += loss.item()
            total_acc += (correct / batch_size)
            
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f}% '.format(
                epoch+1, i+1, t_batch, loss.item(), correct*100/batch_size), end='\r')
        print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/t_batch, total_acc/t_batch*100))
        
        
        # 做 validation
        model.eval()
        with torch.no_grad():
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long)
                labels = labels.to(device, dtype=torch.float)
                
                outputs = model(inputs)
                outputs = outputs.squeeze()
                loss = criteration(outputs, labels)
                correct = evaluation(outputs, labels)
                
                total_acc += (correct / batch_size)
                total_loss += loss.item()
            
            print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
            
            # 如果 total_acc 比 best_acc 要好，更新best_acc
            if total_acc > best_acc:
                best_acc = total_acc
                torch.save(model, "{}/ckpt.model".format(model_dir))
                print('saving model with acc {:.3f}'.format(total_acc/v_batch*100))
        print('-----------------------------------------------')

## Main

In [10]:
os.environ["CUDA_VISIBLE_DEVICES"]='0'
torch.cuda.get_device_name()

'TITAN RTX'

In [39]:
# main.py
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# 超参数
fix_embedding = True
batch_size = 128
sen_len = 30 # 每个句子30个字
epoch = 5
lr = 0.001
model_dir = "DATA/hw4/models/"


# 读取数据
print("loading data...")
train_x, y = read_data("DATA/hw4/training_label.txt", "train_with_label")
train_x_no_label = read_data("DATA/hw4/training_nolabel.txt", "train_no_label")
test_x = read_data("DATA/hw4/testing_data.txt","test")

# 对数据做预处理
print("preprocess data...")
w2v_path = "DATA/hw4/models/w2v_all.model"
preprocess = Preprocess(train_x, sen_len, w2v_path)
# 得到embedding即embedding matrix
embedding = preprocess.make_embedding()
# 把train_x中每条数据（即每个句子）中每个词转换成idx的形式，每个句子相当于一条数据
train_x = preprocess.sentence_word2idx()
# 把 y 转换成 int 形式
y = preprocess.labels_to_tensor(y)


# 把data分为training data和validation data
print("build dataloader...")
X_train, X_val, y_train, y_val = train_x[:190000], train_x[190000:], y[:190000], y[190000:]

# 把data做成dataset供dataloader使用
train_dataset = TwitterDataset(X = X_train, y = y_train)
val_dataset = TwitterDataset(X = X_val, y = y_val)

# 把dataset放到dataloader里面，转成batch of tensors
train_loader = torch.utils.data.DataLoader( dataset = train_dataset,
                                            batch_size = batch_size,
                                            shuffle = True,
                                            num_workers = 8)
val_loader = torch.utils.data.DataLoader(  dataset = val_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 8)

# 制作一个model
print("building model...")
model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=250, num_layers=1, dropout=0.5,
                fix_embedding=fix_embedding)
# device 如果是“cuda”，model就使用GPU来训练
model = model.to(device)

training( batch_size, epoch, lr, model_dir, train_loader, val_loader, model, device)

device: cuda
loading data...
preprocess data...




build dataloader...
building model...

 start training, parameter total:14447001, trainable:502251

[ Epoch1: 1485/1485 ] loss:0.350 acc:30.469% 
Train | Loss:0.50145 Acc: 74.080
Valid | Loss:0.42819 Acc: 79.638 
saving model with acc 79.638
-----------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


[ Epoch2: 1485/1485 ] loss:0.440 acc:32.031% 
Train | Loss:0.41722 Acc: 80.865
Valid | Loss:0.41070 Acc: 80.311 
saving model with acc 80.311
-----------------------------------------------
[ Epoch3: 1485/1485 ] loss:0.370 acc:32.812% 
Train | Loss:0.39673 Acc: 82.109
Valid | Loss:0.39789 Acc: 81.201 
saving model with acc 81.201
-----------------------------------------------
[ Epoch4: 1485/1485 ] loss:0.350 acc:33.594% 
Train | Loss:0.37913 Acc: 82.943
Valid | Loss:0.39271 Acc: 81.250 
saving model with acc 81.250
-----------------------------------------------
[ Epoch5: 1485/1485 ] loss:0.406 acc:31.250% 
Train | Loss:0.36120 Acc: 83.960
Valid | Loss:0.39216 Acc: 81.784 
saving model with acc 81.784
-----------------------------------------------


## Test

In [41]:
# test.py
# 对testing_data.txt做预测
def testing(test_loader, model, device):
    # 在evaluation模式下进行预测
    model.eval()
    ret_output = []
    with torch.no_grad():
        for i, inputs in enumerate(test_loader):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs>=0.5] = 1
            outputs[outputs<0.5] = 0
            print(outputs)
            print(outputs.int())
            print(outputs.int().tolist())
            print(a)
            ret_output += outputs.int().tolist()
    return ret_output

In [49]:
# predict and write to csv file
print("loading testing data...")
test_x = read_data("DATA/hw4/testing_data.txt","test")

# test时，也要记得做embedding处理
preprocess = Preprocess(test_x, sen_len, w2v_path)
embedding = preprocess.make_embedding()
test_x = preprocess.sentence_word2idx()

test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size = batch_size,
                                          shuffle=False,
                                          num_workers = 8)

print("loading model...")
model = torch.load(os.path.join(model_dir, "ckpt.model"))
outputs = testing(test_loader, model, device)

# 写到csv中
tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))], "label":outputs})
print("saving csv...")
tmp.to_csv(os.path.join(model_dir, "predict.csv"), index=False)
print("Finish predicting")

loading testing data...




loading model...
tensor([0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
        1., 1., 0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0.,
        0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 0.,
        0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
        1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0.,
        1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1.,
        1., 0.], device='cuda:0')
tensor([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
        1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0

NameError: name 'a' is not defined