In [3]:
import json
import pdb
import numpy as np
import random
from nltk.corpus import stopwords
import torch
import torch.nn as nn
import torch.nn.functional as F
from copy import deepcopy

In [8]:
class average_meter(object):
    """Running weighted-average tracker.

    Attributes:
        val: the value most recently passed to ``update``.
        sum: weighted total of every value seen since the last ``reset``.
        count: total weight (number of samples) seen since the last ``reset``.
        avg: ``sum / count`` as of the last ``update``.
    """

    def __init__(self):
        # All state is initialised in one place so reset() and construction agree.
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, counted ``n`` times."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

def load_glove(path = 'GloVe.json'):
    """Load a JSON-serialised embedding table.

    Args:
        path: location of the JSON file to read. Defaults to ``'GloVe.json'``
            in the current working directory.

    Returns:
        Whatever object the JSON document decodes to.
    """
    # Open the caller-supplied path; previously the argument was ignored and
    # the literal 'GloVe.json' was always opened.
    with open(path, 'r', encoding='utf8') as fp:
        return json.load(fp)

def load_model(file_path = "./data/glove.twitter.27B.200d.txt", dim = 200):
    """Read a whitespace-separated embedding text file into a dict.

    Each usable line is ``token v1 v2 ... v<dim>``. Lines that do not split
    into exactly ``dim + 1`` whitespace-separated fields are skipped.

    Args:
        file_path: path of the embedding text file (read as UTF-8).
        dim: number of vector components per token. Defaults to 200.

    Returns:
        dict mapping token -> list of ``dim`` floats, plus one extra entry
        under the key ``"< UNK >"`` holding a vector drawn uniformly from
        [-0.25, 0.25) for out-of-vocabulary words.

    Raises:
        ValueError: if a line has the right field count but a component
            cannot be parsed as a float.
    """
    glove = {}
    with open(file_path, "r", encoding='utf-8') as f:
        for line in f:
            items = line.split()
            # token + dim components; anything else is skipped.
            if len(items) != dim + 1:
                continue
            glove[items[0]] = [float(component) for component in items[1:]]
    # Random (unseeded) vector shared by every unknown word.
    glove["< UNK >"] = np.random.uniform(-0.25, 0.25, dim).tolist()
    return glove

def load_train(file_path = "./data/ISEAR.txt"):
    """Read a ``|``-separated file of labelled sentences.

    Every line is split on ``"|"``; field 0 is taken as the label and field 1
    as the text (kept verbatim, including a trailing newline when it is the
    last field on the line).

    Returns:
        (texts, labels) as two parallel lists.
    """
    texts, labels = [], []
    with open(file_path, "r", encoding='utf-8') as handle:
        for record in handle:
            fields = record.split("|")
            labels.append(fields[0])
            texts.append(fields[1])
    return texts, labels

def load_dev(file_path = "./data/test.txt"):
    """Read a ``|``-separated evaluation file.

    Every line is split on ``"|"``; field 0 is taken as the label and field 2
    as the text (field 1 is ignored). The text is kept verbatim.

    Returns:
        (texts, labels) as two parallel lists.
    """
    texts, labels = [], []
    with open(file_path, "r", encoding='utf-8') as handle:
        for record in handle:
            fields = record.split("|")
            labels.append(fields[0])
            texts.append(fields[2])
    return texts, labels

def split_train_test(data_X, data_Y, test_ratio):
    """Split parallel sequences into a leading train part and a trailing dev part.

    The input order is preserved (no shuffling is performed here), so shuffle
    beforehand if the data is ordered.

    Args:
        data_X: sequence of inputs.
        data_Y: sequence of labels, parallel to ``data_X``.
        test_ratio: fraction (0..1) of the samples that goes to the held-out
            part; its size is ``int(len(data_X) * test_ratio)``.

    Returns:
        (train_x_raw, train_y_raw, dev_x_raw, dev_y_raw)

    Raises:
        ValueError: if ``test_ratio`` is outside [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1")

    test_set_size = int(len(data_X) * test_ratio)
    # The held-out part is the *last* test_set_size samples. The previous
    # slicing gave the train part only test_ratio of the data and handed the
    # remainder to dev.
    split_at = len(data_X) - test_set_size

    train_x_raw = data_X[:split_at]
    train_y_raw = data_Y[:split_at]
    dev_x_raw = data_X[split_at:]
    dev_y_raw = data_Y[split_at:]

    return train_x_raw, train_y_raw, dev_x_raw, dev_y_raw


# Lookup table from a lowercase English contraction (apostrophe included) to
# its expanded form. Read by contractionfunction(), which returns the
# expansion for an exact key match and the input unchanged otherwise.
contractions_dict = {
    "i'm" : "i am",
    "i'll" : "i will",
    "i'd" : "i would",
    "i've" : "i have",
    "you're" : "you are",
    "you'll" : "you will",
    "you'd" : "you would",
    "you've" : "you have",
    "she's" : "she is",
    "she'll" : "she will",
    "he's" : "he is",
    "he'll" : "he will",
    "he'd" : "he would",
    "they're" : "they are",
    "they'll" : "they will",
    "they'd" : "they would",
    "that's" : "that is",
    "that'll" : "that will",
    "that'd" : "that would",
    "who's" : "who is",
    "who'll" : "who will",
    "who'd" : "who would",
    "what's" : "what is",
    "what're" : "what are",
    "what'll" : "what will",
    "what'd" : "what would",
    "where's" : "where is",
    "where'll" : "where will",
    "where'd" : "where would",
    "when's" : "when is",
    "when'll" : "when will",
    "when'd" : "when would",
    "why's" : "why is",
    "why'll" : "why will",
    "why'd" : "why would",
    "how's" : "how is",
    "how'll" : "how will",
    "how'd" : "how would",
    "would've" : "would have",
    "should've" : "should have",
    "could've" : "could have",
    "might've" : "might have",
    "must've" : "must have",
    "isn't" : "is not",
    "aren't" : "are not",
    "wasn't" : "was not",
    "weren't" : "were not",
    "haven't" : "have not",
    "hasn't" : "has not",
    "hadn't" : "had not",
    "won't" : "will not",
    "wouldn't" : "would not",
    "don't" : "do not",
    "doesn't" : "does not",
    "didn't" : "did not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "shouldn't" : "should not",
    "mightn't" : "might not",
    "mustn't" : "must not"
}

def contractionfunction(s):
    """Expand ``s`` if it is a key of ``contractions_dict``; otherwise return it unchanged."""
    return contractions_dict.get(s, s)

def preprocess(sentence_list):
    """Lower-case, strip punctuation, drop stop words and tokenise sentences.

    NOTE: a later cell of this notebook defines ``preprocess`` again; in a
    top-to-bottom run that later definition replaces this one.

    Args:
        sentence_list: list of raw sentence strings. The list is modified in
            place: each string is replaced by its list of tokens.

    Returns:
        The same list object, now holding one token list per sentence.
    """
    # Set membership is O(1); the stop-word list used to be scanned linearly
    # for every token.
    stop = set(stopwords.words('english'))
    char_replace = {",",".","/",";","'","[","]","\\","!","@","#","$","%","^","&","*","(",")","-","_","=","+","<",">","?",":","\"","{","}","|"}
    # One translation table turns every listed character into a space in a
    # single pass instead of one str.replace call per character.
    to_space = str.maketrans({char: " " for char in char_replace})
    for i in range(len(sentence_list)):
        cleaned = sentence_list[i].lower().translate(to_space)
        sentence_list[i] = [word for word in cleaned.split() if word not in stop]
    return sentence_list

def sort_key(a):
    """Sort key: the element at index 1 of ``a`` (the count in a ``(label, count)`` pair)."""
    second = a[1]
    return second

def generate_label2id(y_data):
    """Build a two-way label/id mapping ordered by label frequency.

    Labels are ranked by how often they occur in ``y_data`` (most frequent
    first; ties keep first-seen order) and numbered 0, 1, 2, ...

    Returns:
        A single dict holding both directions: ``label -> id`` and
        ``id -> label``.
    """
    # Frequency table, keyed in first-seen order.
    counts = {}
    for label in y_data:
        counts[label] = counts.get(label, 0) + 1

    # Stable sort by descending count.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    label2id = {}
    for idx, (label, _count) in enumerate(ranked):
        label2id[label] = idx
        label2id[idx] = label
    return label2id

def get_type_glove(unk):
    """Classify a token by how many of its characters are digits.

    Returns:
        1.0 if every character is a digit,
        2.0 if more than half (but not all) of the characters are digits,
        3.0 if it has at least one digit but no more than half,
        0.0 if it has no digits, or is empty.
    """
    # An empty token has no digits; returning early avoids dividing by
    # len(unk) == 0 below.
    if not unk:
        return 0.0
    digits = sum(1 for c in unk if c.isdigit())
    if unk.isdigit():
        return 1.0
    if digits / len(unk) > 0.5:
        return 2.0
    if digits > 0:
        return 3.0
    return 0.0


In [5]:
import torch.utils.data as datasets

class emotion_dataset(datasets.Dataset):
    """Pairs each tokenised sentence with its label.

    Indexing returns ``(word_lists[i], label_lists[i])``; the length is the
    number of sentences.
    """

    def __init__(self, word_lists, label_lists):
        # Both sequences are stored as given (no copy, no conversion).
        self.word_lists, self.label_lists = word_lists, label_lists

    def __len__(self):
        return len(self.word_lists)

    def __getitem__(self, index):
        words = self.word_lists[index]
        label = self.label_lists[index]
        return words, label

In [6]:
def data_process(data, label2id, GloVe, max_len=80, emb_dim=200):
    """Collate a batch of ``(tokens, label)`` samples into padded tensors.

    Each token becomes its embedding from ``GloVe`` (or the ``"< UNK >"``
    vector when the token is missing) with one extra component appended: the
    value of ``get_type_glove(token)``. Sentences are zero-padded on the right
    to ``max_len`` rows; longer sentences are truncated to ``max_len`` tokens.

    Args:
        data: sequence of samples; ``sample[0]`` is the token list and
            ``sample[1]`` the label.
        label2id: mapping from label to integer class id.
        GloVe: mapping from token to a list of ``emb_dim`` floats; must hold
            ``"< UNK >"`` if any token is out of vocabulary.
        max_len: number of token rows per sentence. Defaults to 80.
        emb_dim: embedding size before the extra feature. Defaults to 200.

    Returns:
        x_tensor: float tensor of shape ``(batch, max_len, emb_dim + 1)``.
        y_tensor: long tensor of shape ``(batch,)`` with class ids.
        data_len: list with the number of tokens actually written per sample.
    """
    b_size = len(data)
    x_tensor = torch.zeros(b_size, max_len, emb_dim + 1)
    y_tensor = torch.zeros(b_size).long()
    data_len = []

    for b_index in range(b_size):
        # Truncate: writing past row max_len - 1 used to raise IndexError.
        words = data[b_index][0][:max_len]
        label = data[b_index][1]
        for w_index, word in enumerate(words):
            base = GloVe[word] if word in GloVe else GloVe["< UNK >"]
            # Build a new list so the shared embedding is never mutated.
            features = list(base) + [get_type_glove(word)]
            x_tensor[b_index, w_index] = torch.FloatTensor(features)
        y_tensor[b_index] = label2id[label]
        data_len.append(len(words))
    return x_tensor, y_tensor, data_len

In [9]:
# Load the embedding table once into the notebook-level global `Glove`.
# Later cells read it: the redefined preprocess() checks hyphenated words
# against it, and the DataLoader collate lambdas pass it to data_process().
print('loading Glove dictionary')
# Alternative loader (JSON file) kept for reference; the text-file loader below is used.
#Glove = load_glove()
Glove = load_model()

loading Glove dictionary


In [35]:
def preprocess(sentence_list, vocab=None):
    """Lower-case, strip punctuation, split unknown compounds, drop stop words.

    Unlike plain punctuation stripping, ``-`` (and ``&``) are not blanked
    outright: a hyphenated token is kept whole when it is present in the
    vocabulary and split at its hyphens only when it is not.

    Args:
        sentence_list: list of raw sentence strings. The list is modified in
            place: each string is replaced by its list of tokens.
        vocab: container supporting ``in`` used to decide whether a hyphenated
            token is known. Defaults to the notebook-level ``Glove`` table.

    Returns:
        The same list object, now holding one token list per sentence.
    """
    if vocab is None:
        vocab = Glove
    # Set membership is O(1); the stop-word list used to be scanned linearly
    # for every token.
    stop = set(stopwords.words('english'))
    char_replace = {",",".","/",";","'","[","]","\\","!","@","#","$","%","^","*","(",")","_","=","+","<",">","?",":","\"","{","}","|"}
    to_space = str.maketrans({char: " " for char in char_replace})
    for i in range(len(sentence_list)):
        cleaned = sentence_list[i].lower().translate(to_space)
        tokens = []
        for word in cleaned.split():
            if '-' in word and word not in vocab:
                # Split only this unknown compound. Previously one unknown
                # hyphenated word removed the hyphens from every word in the
                # sentence, including compounds the vocabulary does contain.
                tokens.extend(word.replace('-', " ").split())
            else:
                tokens.append(word)
        sentence_list[i] = [word for word in tokens if word not in stop]
    return sentence_list

In [36]:
# --- Run configuration (read by the training cells below) ---
n_epochs = 20
lr = 0.001
TRAIN_BATCH_SIZE = 32
DEV_BATCH_SIZE = 96
# Use the first CUDA device when available, otherwise the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print('loading train and test data')
# Both splits are read with load_train (label = field 0, text = field 1);
# load_dev, which reads the text from field 2, is not used here.
train_x_raw, train_y_raw = load_train(file_path = "./data/train.txt")
dev_x_raw, dev_y_raw = load_train(file_path = "./data/test.txt")
# preprocess() tokenises in place and returns the same list object.
train_x_raw = preprocess(train_x_raw)
dev_x_raw = preprocess(dev_x_raw)


# Label ids come from the training labels only, so a dev label that never
# occurs in the training data raises KeyError in the loop below.
label2id = generate_label2id(train_y_raw)

# NOTE(review): train_y / dev_y are not read by any cell visible here; the
# datasets below carry the raw labels and data_process() maps them to ids.
train_y = []
dev_y = []
for label in train_y_raw:
    train_y.append(label2id[label])
for label in dev_y_raw:
    dev_y.append(label2id[label])

print('preparing Dataset')
train_dataset = emotion_dataset(train_x_raw,train_y_raw)
dev_dataset = emotion_dataset(dev_x_raw,dev_y_raw)

# data_process is the collate function: it embeds and pads each batch.
# NOTE(review): shuffle=False on the training loader means batches are served
# in file order every epoch; confirm that is intended.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=False, collate_fn= lambda x: data_process(x, label2id, Glove))
dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=DEV_BATCH_SIZE, shuffle=False, collate_fn= lambda x: data_process(x, label2id, Glove))

loading train and test data
preparing Dataset


In [37]:
def train_cnn(model, train_loader, criterion, optimizer, epoch, DEVICE):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        model: module to train; put into training mode here.
        train_loader: iterable of batches where ``batch[0]`` is the input
            tensor and ``batch[1]`` the target tensor (further items are
            ignored).
        criterion: loss callable taking ``(pred, y)``.
        optimizer: optimizer stepped once per batch.
        epoch: unused; kept for signature compatibility.
        DEVICE: device the batch tensors are moved to.

    Returns:
        Average loss over all samples seen (0 if the loader is empty).
    """
    train_loss = average_meter()
    model.train()
    for batch in train_loader:
        x = batch[0].to(DEVICE)
        y = batch[1].to(DEVICE)
        optimizer.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        pred = model(x)
        loss = criterion(pred, y)
        train_loss.update(loss.item(), x.size(0))
        loss.backward()
        optimizer.step()
    return train_loss.avg
        
def validate_cnn(model, dev_loader, criterion, optimizer, epoch, DEVICE):
    """Evaluate the model on a loader without updating it.

    Args:
        model: module to evaluate; put into eval mode here.
        dev_loader: iterable of batches where ``batch[0]`` is the input
            tensor and ``batch[1]`` the target tensor (further items are
            ignored).
        criterion: loss callable taking ``(pred, y)``.
        optimizer: unused; kept for signature compatibility.
        epoch: unused; kept for signature compatibility.
        DEVICE: device the batch tensors are moved to.

    Returns:
        (mean loss, accuracy). Accuracy is a 0-dim tensor when at least one
        batch was seen, and ``0.0`` for an empty loader.
    """
    valid_loss = average_meter()
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for batch in dev_loader:
            x = batch[0].to(DEVICE)
            y = batch[1].to(DEVICE)
            # Call the module itself rather than .forward() so registered hooks run.
            pred = model(x)
            loss = criterion(pred, y)
            valid_loss.update(loss.item(), x.size(0))
            # Index of the highest score along dim 1 is the predicted class.
            pred = torch.max(pred, 1)[1]
            correct += (pred == y).float().sum()
            total += y.shape[0]
    # Guard the empty-loader case, which used to divide by zero.
    accuracy = correct / total if total else 0.0
    return valid_loss.avg, accuracy

In [38]:
class Text_CNN(nn.Module):
    """Multi-width convolutional text classifier.

    One convolution per filter width slides over the token axis of a
    ``(batch, seq_len, emb_dim)`` input, each spanning the full embedding
    width; the outputs are max-pooled over time, concatenated and passed
    through three linear layers.

    Args:
        filter_sizes: token-window heights, one convolution per entry.
        num_filters: output channels of every convolution.
        n_classes: size of the output layer.
        emb_dim: per-token feature size of the input.
        dropout: dropout probability applied to the pooled features.

    The defaults reproduce the previously hard-coded configuration, so
    ``Text_CNN()`` builds the same architecture as before.
    """

    def __init__(self, filter_sizes=(3, 5, 7, 8), num_filters=256, n_classes=7,
                 emb_dim=201, dropout=0.1):
        super(Text_CNN, self).__init__()
        # The kernels are 2-D (K tokens x emb_dim) and the input is made 4-D
        # in forward(), so this must be Conv2d. Conv1d with a 2-tuple kernel
        # builds the same weight shape but rejects 4-D input on current
        # PyTorch.
        self.convs1 = nn.ModuleList([nn.Conv2d(1, num_filters, (K, emb_dim)) for K in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.tanh = nn.Tanh()
        self.fc1 = nn.Linear(len(filter_sizes)*num_filters, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, n_classes)

    def forward(self, x):
        """Map ``(batch, seq_len, emb_dim)`` to ``(batch, n_classes)`` scores.

        ``seq_len`` must be at least the largest filter size.
        """
        x = x.unsqueeze(1)  # (batch, 1, seq_len, emb_dim)
        # Each conv yields (batch, num_filters, seq_len - K + 1, 1).
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]
        # Max over the time axis -> (batch, num_filters) per filter size.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.tanh(x)
        x = self.fc2(x)
        x = self.tanh(x)
        output = self.fc3(x)
        return output

In [39]:
# --- Train the CNN classifier ---
print('preparing Model')
model = Text_CNN().to(DEVICE)
print('setingt optimization method')
# Earlier variant that passed only trainable parameters to the optimizer.
#optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Multiply the learning rate by 0.9 on every scheduler.step() (once per epoch below).
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
criterion = nn.CrossEntropyLoss()

# Best validation accuracy so far and a deep copy of the model that reached it.
best_acc = 0
acc_model = None

for epoch in range(n_epochs):
    torch.cuda.empty_cache()
    train_loss = train_cnn(model, train_dataloader, criterion, optimizer, epoch, DEVICE)
    torch.cuda.empty_cache()
    scheduler.step()
    valid_loss, valid_acc = validate_cnn(model, dev_dataloader, criterion, optimizer, epoch, DEVICE)
    # Snapshot only on a strict improvement, so ties keep the earlier model.
    if valid_acc > best_acc:
        acc_model = deepcopy(model)
        best_acc = valid_acc
    # Progress line: percent of epochs done, zero-padded epoch index, losses, accuracy.
    print("{}%, Epoch {}, train loss {:.8f}, validate loss {:.8f}, acc {:.4f}".format(100*(epoch+1)/n_epochs,str(epoch).zfill(2),train_loss,valid_loss,valid_acc))

preparing Model
setingt optimization method


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


5.0%, Epoch 00, train loss 1.40038791, validate loss 1.30254424, acc 0.5320
10.0%, Epoch 01, train loss 0.79001944, validate loss 1.42092471, acc 0.5400
15.0%, Epoch 02, train loss 0.37258132, validate loss 1.58403343, acc 0.5600
20.0%, Epoch 03, train loss 0.22670102, validate loss 1.71302145, acc 0.5640
25.0%, Epoch 04, train loss 0.10596524, validate loss 1.72197108, acc 0.6000
30.0%, Epoch 05, train loss 0.05746516, validate loss 1.67223561, acc 0.6160
35.0%, Epoch 06, train loss 0.03412827, validate loss 1.87733748, acc 0.6020
40.0%, Epoch 07, train loss 0.02110620, validate loss 1.82281574, acc 0.6080
45.0%, Epoch 08, train loss 0.01592538, validate loss 1.89111884, acc 0.6100
50.0%, Epoch 09, train loss 0.01294396, validate loss 1.91849159, acc 0.6100
55.0%, Epoch 10, train loss 0.01207913, validate loss 1.93716675, acc 0.6080
60.0%, Epoch 11, train loss 0.01093729, validate loss 1.95263207, acc 0.6180
65.0%, Epoch 12, train loss 0.01054615, validate loss 1.94655485, acc 0.6040


In [40]:
def train_lstm(model, train_loader, criterion, optimizer, epoch, DEVICE):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        model: module to train; put into training mode here.
        train_loader: iterable yielding ``(x, y, lengths)`` triples; the
            lengths are not used.
        criterion: loss callable taking ``(pred, y)``.
        optimizer: optimizer stepped once per batch.
        epoch: unused; kept for signature compatibility.
        DEVICE: device the batch tensors are moved to.

    Returns:
        Average loss over all samples seen (0 if the loader is empty).
    """
    train_loss = average_meter()
    model.train()
    for x, y, _lengths in train_loader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)

        optimizer.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        pred = model(x)
        loss = criterion(pred, y)
        train_loss.update(loss.item(), x.size(0))
        loss.backward()
        optimizer.step()
    return train_loss.avg
        
def validate_lstm(model, dev_loader, criterion, optimizer, epoch, DEVICE):
    """Evaluate the model on a loader without updating it.

    Args:
        model: module to evaluate; put into eval mode here.
        dev_loader: iterable yielding ``(x, y, lengths)`` triples; the
            lengths are not used.
        criterion: loss callable taking ``(pred, y)``.
        optimizer: unused; kept for signature compatibility.
        epoch: unused; kept for signature compatibility.
        DEVICE: device the batch tensors are moved to.

    Returns:
        (mean loss, accuracy). Accuracy is a 0-dim tensor when at least one
        batch was seen, and ``0.0`` for an empty loader.
    """
    valid_loss = average_meter()
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for x, y, _lengths in dev_loader:
            x = x.to(DEVICE)
            y = y.to(DEVICE)
            # Call the module itself rather than .forward() so registered hooks run.
            pred = model(x)
            loss = criterion(pred, y)
            valid_loss.update(loss.item(), x.size(0))
            # Index of the highest score along dim 1 is the predicted class.
            pred = torch.max(pred, 1)[1]
            correct += (pred == y).float().sum()
            total += y.shape[0]
    # Guard the empty-loader case, which used to divide by zero.
    accuracy = correct / total if total else 0.0
    return valid_loss.avg, accuracy

In [41]:
class Text_LSTM(nn.Module):
    """Two-layer bidirectional LSTM classifier over 201-feature token vectors.

    The final hidden states of the top layer (both directions) are
    concatenated and passed through ``fc1 -> tanh -> fc2 -> log_softmax`` to
    give 7 log-probabilities per sample.
    """

    def __init__(self):
        super().__init__()

        hidden = 128
        self.lstm = nn.LSTM(
            input_size=201,
            hidden_size=hidden,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.5,
        )
        # Two directions are concatenated, hence 2 * hidden input features.
        self.fc1 = nn.Linear(2 * hidden, 64)
        self.fc2 = nn.Linear(64, 7)
        self.tanh = nn.Tanh()
        self.relu = F.relu            # kept as an attribute; not used in forward()
        self.softmax = F.log_softmax

    def forward(self, x):
        """
        input x: [batch_size, seq_len, 201]
        h_n: [2 * num_layers, batch_size, hidden_size]
        returns: [batch_size, 7] log-probabilities
        """
        # Only the final hidden states are needed; per-step outputs and the
        # cell state are discarded.
        _, (h_n, _) = self.lstm(x)

        # Last two entries of h_n belong to the top layer: forward, then backward.
        last_fw, last_bw = h_n[-2], h_n[-1]
        features = torch.cat((last_fw, last_bw), dim=-1)

        hidden = self.tanh(self.fc1(features))
        return self.softmax(self.fc2(hidden), dim=-1)

In [42]:
# --- Train the LSTM classifier (rebinds model/optimizer/scheduler/best_acc/acc_model) ---
print('preparing Model')
model = Text_LSTM().to(DEVICE)
print('setingt optimization method')
# The learning rate is hard-coded to 0.01 here; the notebook-level `lr` is not used.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Multiply the learning rate by 0.9 on every scheduler.step() (once per epoch below).
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
# NOTE(review): Text_LSTM.forward already applies log_softmax and
# CrossEntropyLoss applies it again; confirm this combination is intended.
criterion = nn.CrossEntropyLoss()

# Best validation accuracy so far and a deep copy of the model that reached it.
best_acc = 0
acc_model = None

for epoch in range(n_epochs):
    torch.cuda.empty_cache()
    train_loss = train_lstm(model, train_dataloader, criterion, optimizer, epoch, DEVICE)
    torch.cuda.empty_cache()
    scheduler.step()
    valid_loss, valid_acc = validate_lstm(model, dev_dataloader, criterion, optimizer, epoch, DEVICE)
    # Snapshot only on a strict improvement, so ties keep the earlier model.
    if valid_acc > best_acc:
        acc_model = deepcopy(model)
        best_acc = valid_acc
    # Progress line: percent of epochs done, zero-padded epoch index, losses, accuracy.
    print("{}%, Epoch {}, train loss {:.8f}, validate loss {:.8f}, acc {:.4f}".format(100*(epoch+1)/n_epochs,str(epoch).zfill(2),train_loss,valid_loss,valid_acc))

preparing Model
setingt optimization method
5.0%, Epoch 00, train loss 1.55199053, validate loss 1.42535348, acc 0.4600
10.0%, Epoch 01, train loss 1.19342085, validate loss 1.22011291, acc 0.5620
15.0%, Epoch 02, train loss 1.04817154, validate loss 1.20364040, acc 0.5760
20.0%, Epoch 03, train loss 0.90413894, validate loss 1.23944250, acc 0.5860
25.0%, Epoch 04, train loss 0.79814343, validate loss 1.24439689, acc 0.5740
30.0%, Epoch 05, train loss 0.70340649, validate loss 1.27905473, acc 0.5980
35.0%, Epoch 06, train loss 0.62085598, validate loss 1.33950342, acc 0.6040
40.0%, Epoch 07, train loss 0.52250162, validate loss 1.42534290, acc 0.6160
45.0%, Epoch 08, train loss 0.43869680, validate loss 1.60194561, acc 0.5900
50.0%, Epoch 09, train loss 0.36563341, validate loss 1.57683130, acc 0.6060
55.0%, Epoch 10, train loss 0.31148861, validate loss 1.74726898, acc 0.6100
60.0%, Epoch 11, train loss 0.27279553, validate loss 1.71855588, acc 0.6140
65.0%, Epoch 12, train loss 0.222