In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import numpy as np
import pickle as pkl
import copy

# Data reading, normalization and padding
1. read collected data
2. normalize the data and pad them to the same size
3. convert data to torch.tensor objects and send them to the device
4. split the data into training set (70%) and testing set (30%)

In [2]:
# read data collected
# NOTE(review): pickle.load can execute arbitrary code -- only load this file from a trusted source.
# The context manager closes the file even if unpickling raises.
with open("data for training and testing/XYW_read data.pkl", 'rb') as f:
    X_sentence, Y_sentence, W_sentence = pkl.load(f)

In [3]:
# padding and normalization
#
# Input layout used below: X_sentence[sentence][word][syllable] is a sequence whose first
# `factor_length` entries are the numerical features and whose last entry is the nucleus type.
# Y_sentence and W_sentence are indexed the same way ([sentence][word][syllable]).
Azure_vowels_list = ["iy", "ih", "ey", "eh", "ae", "aa", "ao", "uh", "ow", "uw", "ah", "ay", "aw", "oy", "ax", "er"]
# nucleus type -> integer code; codes 1..16 follow the list order, 0 is the padding label "none"
type_to_code = {vowel: code for code, vowel in enumerate(Azure_vowels_list, start=1)}
type_to_code["none"] = 0

factor_length = 12 # number of numerical features
max_syllables = 17 # every word is padded to this many syllables

# One entry per word (words of all sentences are flattened into a single list).
X_trans_P = []     # numerical features, centred on the sentence average
X_type_P = []      # nucleus type labels
X_type_Pcode = []  # nucleus type labels as integer codes
Y_trans_P = []     # per-syllable entries taken from Y_sentence
W_trans_P = []     # per-syllable entries taken from W_sentence
for sentence_ind in range(len(X_sentence)):
    sentence_words = X_sentence[sentence_ind]
    # average of each numerical feature over every syllable of the sentence
    tp_sentence_features = [syllable[:factor_length] for word in sentence_words for syllable in word]
    tp_sentence_average = np.average(tp_sentence_features, axis=0)

    for word_ind in range(len(sentence_words)):
        word = sentence_words[word_ind]
        # normalization: subtract the sentence average from every syllable's features
        X_trans_P.append([np.array(syllable[:factor_length]) - tp_sentence_average for syllable in word])
        X_type_P.append([syllable[-1] for syllable in word])
        Y_trans_P.append(list(Y_sentence[sentence_ind][word_ind]))
        W_trans_P.append(list(W_sentence[sentence_ind][word_ind]))

# Pad every word to `max_syllables` and remember its true length in seq_l.
seq_l = []
for i in range(len(X_trans_P)):
    n_syllables = len(X_trans_P[i])
    if n_syllables > max_syllables:
        # a negative pad length would silently leave a ragged list that cannot become a tensor
        raise ValueError("word %d has %d syllables, more than max_syllables=%d"
                         % (i, n_syllables, max_syllables))
    n_pad = max_syllables - n_syllables
    seq_l.append(n_syllables)

    # a fresh row per padded syllable (no shared list objects)
    X_trans_P[i] = X_trans_P[i] + [[0]*factor_length for _ in range(n_pad)]
    Y_trans_P[i] = Y_trans_P[i] + [0]*n_pad
    W_trans_P[i] = W_trans_P[i] + [0]*n_pad
    X_type_P[i] = X_type_P[i] + ["none"]*n_pad

    X_type_Pcode.append([type_to_code[nucleus_type] for nucleus_type in X_type_P[i]])

In [4]:
# convert data to torch.tensor objects and send them to the device
# (first CUDA device when one is available, CPU otherwise)
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# numerical features and weights are stored as float32, codes/labels/lengths as int64
X_feature = torch.tensor(np.array(X_trans_P)).to(device=device, dtype=torch.float32)
X_type = torch.tensor(X_type_Pcode).to(device=device, dtype=torch.int64)
Y = torch.tensor(Y_trans_P).to(device=device, dtype=torch.int64)
W = torch.tensor(W_trans_P).to(device=device, dtype=torch.float32)
seq_T = torch.tensor(seq_l).to(device=device, dtype=torch.int64)

In [5]:
# Split the words into a training set and a testing set.
train_set_prop = 0.7
split_seed = 0  # fixed seed: the same train/test partition on every run, so results are comparable
dataset = torch.utils.data.TensorDataset(X_feature, X_type, seq_T, Y, W)
train_size = int(len(dataset)*train_set_prop)
test_size = len(dataset) - train_size  # remainder, so the two sizes always add up to len(dataset)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=torch.Generator().manual_seed(split_seed))
print(len(train_dataset))

99430


# Initializing and Training the model
Please specify the variables "use_all_features", "factor_size", "n_head", "b_features_ph", and "n_layer".
1. If the model is trained using all the numerical and categorical features, set use_all_features = True. If the model is trained using only numerical features, set use_all_features = False.
2. If only numerical features are used (i.e., use_all_features = False), set "factor_size" to the number of leading features to use (the first n). The first six features are measurements over the syllable, while the next six features are measurements over the nucleus. Models in the paper used factor_size=6 and factor_size=12.
3. "n_head", "b_features_ph", and "n_layer" are the number of heads, the number of features per head, and the number of layers of the transformer encoder.

In [6]:
# specify model details
use_all_features = False # set this to True if nucleus type is used, and False if only numerical features are used
factor_size = 12 # the number of numerical features (the first n) to use if the model takes numerical features only

# other parameters (the embedding width is later computed as n_head*b_features_ph)
n_head = 5 # number of heads
b_features_ph = 6 # number of transformed features per head
n_layer = 3 # number of layers of the transformer model

In [7]:
# specify training details
max_iter = 100  # number of iterations (epochs)
batch_size = 200

# learning-rate schedule handed to StepLR below
# NOTE(review): with gamma = 1 the scheduler leaves the rate unchanged; the roll-back test in
# the training loop also requires tp_lr<1e-5, so it looks unreachable with these values -- confirm.
lr = 0.001
step_size = 2
gamma = 1
tp_lr = lr  # running copy of the learning rate, maintained by the training loop

# initialize the model
# Both variants expose Config and TransModel; they differ in the module they come from and in
# how many numerical features the model takes.
if use_all_features==False:
    from self_attention_numerical import *
    tp_factor_size = factor_size
else:
    from self_attention_all_features import *
    tp_factor_size = 12

n_embed = n_head*b_features_ph
dropout = 0
config = Config(
    vocab_size=3,
    n_embed=n_embed,
    dropout=dropout,
    n_layer=n_layer,
    block_size=17,
    forward_expansion=3,
    n_head=n_head,
    fctor_size=tp_factor_size,
    n_type=len(type_to_code),
)

model = TransModel(config).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)

# bookkeeping shared with the training loop
best_model = None          # copy of the model with the lowest epoch loss so far
tp_min_loss = 999999999    # lowest epoch loss so far
loss_recorder = []         # loss of every batch
past_records = []          # loss of every epoch
tp_ct = 0                  # number of batches processed
lr_count = 2

In [None]:
# Train for max_iter epochs.
# After each epoch: record the mean batch loss, keep a copy of the best model so far, step the
# scheduler, and -- when the loss has stopped improving and the learning rate has become very
# small -- roll back to the best model and restart with a 10x larger learning rate.
print("=> start training")
# shuffle=True draws a new order on every pass, so a single loader serves all epochs
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
for epoch in range(max_iter):
    accs = []
    tp_rec = []
    for batch_X, batch_X_type, batch_X_l, batch_Y, batch_W in train_loader:
        tp_ct += 1
        model.zero_grad()
        if use_all_features==False: # only numerical features are used
            logits, loss, acc = model(batch_X[:, :, :factor_size], batch_X_l, batch_Y)
        else: # numerical features and nucleus type are used
            logits, loss, acc = model(batch_X, batch_X_type, batch_X_l, batch_Y, batch_W)

        loss.backward()
        optimizer.step()
        accs.append(acc.cpu().detach())
        loss_r = loss.cpu().detach()
        loss_recorder.append(loss_r)
        tp_rec.append(loss_r)
    if not tp_rec:
        # drop_last=True yields no batch at all when the training set is smaller than batch_size
        raise RuntimeError("no training batch was produced; reduce batch_size")
    tp_epoch_loss = float(sum(tp_rec)/len(tp_rec))
    print("current loss:"+str(tp_epoch_loss)+"  accuracy:"+str(float(sum(accs)/len(accs))))
    if tp_min_loss>tp_epoch_loss: # if the current loss is less than the min_loss record
        tp_min_loss = float(tp_epoch_loss)
        best_model = copy.deepcopy(model)

    scheduler.step()
    # Read the rate the optimizer is really using. Re-deriving it from epoch%step_size does not
    # follow StepLR (which decays after the 2nd, 4th, ... step) and ignores that the scheduler
    # is rebuilt after a roll-back.
    tp_lr = optimizer.param_groups[0]["lr"]
    past_records.append(tp_epoch_loss)
    # examine the losses and see if the model should be rolled-back:
    # the latest loss is not below the average of the three epochs before it
    tp_plateau = epoch>6 and past_records[-1]*(1+1e-4)>sum(past_records[-4:-1])/3
    if tp_plateau and tp_lr<1e-5 and best_model is not None:
        model = copy.deepcopy(best_model)
        tp_lr *= 10
        lr_count = 0
        # the optimizer and scheduler must be rebuilt for the parameters of the restored model
        optimizer = torch.optim.Adam(model.parameters(), lr=tp_lr)
        scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)
    else:
        lr_count += 1

=> start training


# Look at the accuracy on the testing dataset
When the nucleus features are used together with the weights, both a weighted and an unweighted accuracy can be computed.
Pass None in place of batch_W when you want to print the unweighted accuracy.

In [None]:
# Evaluate the model on the whole testing set.
# shuffle=False / drop_last=False: the order does not matter for evaluation, and dropping the
# last partial batch would leave up to batch_size-1 test samples out of the reported numbers.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
loss_test=[]
acc_test = []
tp_batch_n = []  # number of samples in each batch, used to weight the averages

tp_was_training = model.training
model.eval()  # evaluation mode for the duration of this cell
with torch.no_grad():
    for batch_X, batch_X_type, batch_X_l, batch_Y, batch_W in test_loader:
        if use_all_features==False: # only numerical features are used
            logits, loss, acc = model(batch_X[:, :, :factor_size], batch_X_l, batch_Y)
        else: # numerical features and nucleus type are used
            # weighted accuracy; pass None instead of batch_W for the unweighted accuracy
            logits, loss, acc = model(batch_X, batch_X_type, batch_X_l, batch_Y, batch_W)

        loss_test.append(loss.cpu().detach())
        acc_test.append(acc.cpu().detach())
        tp_batch_n.append(batch_X.shape[0])
model.train(tp_was_training)  # restore the mode the model was in before evaluation

if not tp_batch_n:
    raise ValueError("test_dataset is empty")

# The last batch can be smaller than the others, so weight every batch by its size.
# NOTE(review): assumes the model returns per-batch means for loss and acc -- TODO confirm.
tp_n_total = sum(tp_batch_n)
tp_epoch_loss = float(sum(l*n for l, n in zip(loss_test, tp_batch_n))/tp_n_total)
tp_epoch_acc = float(sum(a*n for a, n in zip(acc_test, tp_batch_n))/tp_n_total)

print("loss:"+str(tp_epoch_loss)+"  accuracy:"+str(tp_epoch_acc))