# Training Process of My Model

In [1]:
### import modules
import math
import numpy as np
import random
import pickle
from torch import nn, Tensor
import torch.optim as optim
import torch.utils.data as data
import torch

In [2]:
### read the words dictionary data -- print(Number of the total words, Max length of the words)
words_list = []
max_len = 0
with open("words_250000_train.txt") as f:
    while True:
        l = f.readline() 
        if not l:
            break
        if len(l.strip()) > max_len:
            max_len = len(l.strip())
        words_list.append(l.strip())    
print(len(words_list), max_len)

227300 29


In [3]:
### some statistics
from collections import Counter
a = Counter("".join(words_list))
freq_counter = dict(a)

In [4]:
most = Counter("".join(words_list)).most_common()
most_common_six = [item[0] for item in most[:6]]
most_common_six

['e', 'i', 'a', 'n', 'o', 'r']

In [5]:
## split the words_list into 2 parts -- Otherwise, the created instances of whole dictionaries is too large to put in the memory
np.random.seed(42)
random.shuffle(words_list)
words_list_part1 = words_list[:110000]
words_list_part2 = words_list[110000:]

## Prepare the dataloader

In [6]:
## Create the dictionaties to transform inputs and outputs of the attention model
## source vocabulary: change the word "appl_|____" to a input tensor [1,16,16,0,27,0,0,0,0] 
src_vocab = {chr(ord('a')+i):(i+1) for i in range(26)}
src_vocab["|"] = 27
src_vocab["_"] = 0
src_idx2letter = {src_vocab[key]:key for key in src_vocab.keys()}

## target vocabulary: change the target letters "a" to a output classification tensor [0,1,0,0,0,0,0,0]
tgt_vocab = {chr(ord('a')+i):(i+1) for i in range(26)}
tgt_vocab["end"] = 0
tgt_idx2letter = {tgt_vocab[key]:key for key in tgt_vocab.keys()}

In [7]:
## parameters of the attention model
vocab_size = len(src_vocab)    ##  end  + 26 letters + _
target_size = len(tgt_vocab)   ## 26 letters + end
d_model = 16                   ## embedding dimension
max_len = 32                   ## length of the input tensor
batch_size = 128 
d_k = d_v = 16                 ## dimention of key, query and value in the attention layer
n_heads = 2                    ## number of attention heads
d_ff = 32                      ## dimention of the feedforward layer
n_layers = 3                  ## number of attetion layers
epochs = 10

#### Firstly, generate training samples: mask certain letters in a word to make the inputs, let the masked letter to be the outpus 

Those small sets get all subsets, large sets get random subsets

In [8]:
### create the instances to learn, like 【input:__pple, output: [a]】
def rd_subset(aset):
    '''
    Give a (random) non-empty subsets of a given set 
    
    If len(aset) <= 8, return all the subsets.
    Else, return a maximum of 256 subsets.
    
    Example: rd_subset(['e', 'i', 'a']) --> [['e', 'i', 'a'], ['e', 'a'], ['e', 'i'], ['a'], ['i'], ['e'], ['i', 'a']]
    
    '''
    
    n = len(aset)
    subsets = []
    m = min(2**n-1,256)
    samples = random.sample(range(1,2**n),m)
    for i in samples:
        subs = []
        s = str(bin(i))[2:].zfill(n)
        for j in range(len(s)):
            if s[j] == '1':
                subs.append(aset[j])
        if (len(subs) == 1) and (subs[0] not in most_common_six):
            continue
        subsets.append(subs)
    return subsets

def create_instance(word):
    '''
    Create instances to learn from a given word.
    
    Example: "apple"
    
    Inputs:       Outputs: (multi-label)
    "a____"        ["p","l","e"]
    "app__"        ["l", "e"]
    "_pple"        ["a"]
    "apple"        ["end"]
    ...
    
    '''
    inputs = [word]
    outputs = [["end"]]
    candidates = list(set(word))
    n = len(candidates)

    total_subsets = rd_subset(candidates)
    for subguessed in total_subsets:
        newword = "".join([c if c in subguessed else "_" for c in word])
        inputs.append(newword)
        outputs.append(list(set(candidates) - set(subguessed)))

    ## strenghten on those with one letters not guessed
    if n > 5:
        for drop in candidates:
            newword = "".join([c if c != drop else "_" for c in word])
            inputs.append(newword)
            outputs.append([drop])

    return inputs, outputs

def generate(words_list):
    '''
    Generate instances from the words_list
    '''
    src_data = []
    tgt_data = []
    for i in range(len(words_list)):
        if i % 20000 == 0:
            print("Current process:", i ,"/", len(words_list)-1)
        inp,out = create_instance(words_list[i])
        src_data.extend(inp)
        tgt_data.extend(out)
    return src_data, tgt_data

In [12]:
create_instance("apple")
most_common_six
["_pp__","___l_"]

['e', 'i', 'a', 'n', 'o', 'r']

In [16]:
## 3-gram word
s = "apple"
def split(word,n):
    ngram_lists = [word[i:(i+n)] for i in range(len(word)-n+1)]
    return ngram_lists

total_ngrams = []
for w in words_list:
    total_ngrams.extend(split(w,3))
Counter(total_ngrams).most_common

<bound method Counter.most_common of Counter({'ing': 15154, 'ess': 8423, 'ati': 8072, 'ter': 7912, 'ion': 7691, 'nes': 6804, 'ate': 6552, 'ent': 6454, 'tio': 6191, 'ous': 5647, 'tic': 5247, 'ica': 4959, 'ist': 4903, 'all': 4759, 'ine': 4634, 'ant': 4548, 'ers': 4492, 'cal': 4430, 'ble': 4421, 'per': 4369, 'tin': 4341, 'ver': 4216, 'non': 4188, 'ted': 4118, 'eri': 4050, 'lin': 3770, 'ali': 3714, 'tra': 3625, 'pre': 3608, 'abl': 3607, 'con': 3548, 'nte': 3429, 'est': 3398, 'sti': 3384, 'res': 3357, 'ell': 3347, 'pro': 3338, 'ene': 3262, 'ive': 3243, 'ste': 3207, 'rin': 3190, 'nti': 3163, 'rat': 3145, 'tri': 3120, 'ill': 3112, 'men': 3097, 'red': 3084, 'oni': 3077, 'nde': 3067, 'ies': 3065, 'ere': 3025, 'ari': 3006, 'the': 2988, 'ite': 2985, 'der': 2939, 'ove': 2926, 'man': 2913, 'tor': 2907, 'lat': 2884, 'ran': 2864, 'ian': 2854, 'lly': 2846, 'iti': 2836, 'lit': 2829, 'str': 2791, 'int': 2760, 'era': 2754, 'les': 2703, 'and': 2651, 'ize': 2648, 'her': 2646, 'ast': 2637, 'gra': 2591, 'ons

In [None]:
## Create the dictionaties to transform inputs and outputs of the attention model
## source vocabulary: change the word "appl_|____" to a input tensor [1,16,16,0,27,0,0,0,0] 
src_vocab = {chr(ord('a')+i):(i+1) for i in range(26)}
src_vocab["|"] = 27
src_vocab["_"] = 0
src_idx2letter = {src_vocab[key]:key for key in src_vocab.keys()}

## target vocabulary: change the target letters "a" to a output classification tensor [0,1,0,0,0,0,0,0]
tgt_vocab = {chr(ord('a')+i):(i+1) for i in range(26)}
tgt_vocab["end"] = 0
tgt_idx2letter = {tgt_vocab[key]:key for key in tgt_vocab.keys()}

In [11]:
src_data, tgt_data = generate(words_list_part1)
print(len(src_data))

## change the input word "appl_" to a input tensor [1,16,16,0,27,0,0,0,0,0,...] 
inputs = []
for i in range(len(src_data)):
    x = src_data[i] + "|"
    x = x.ljust(max_len, "_")
    ip = [src_vocab[c] for c in x]
    inputs.append(ip)
del src_data

## save inputs
inputs = torch.LongTensor(inputs)
with open("instances_part1_ip.pkl", "wb") as f:
    pickle.dump(inputs, f)
del inputs
print("successfully save the inputs data!")

#change the target lables ["a","b"] to a output classification tensor [0,1,1,0,0,0,0,0,...]
outputs = []
tgt_size = len(tgt_vocab)
for j in range(len(tgt_data)):
    tgts = tgt_data[j]
    y = [0] * tgt_size
    for t in tgts:
        y[tgt_vocab[t]] = 1
    outputs.append(y)
del tgt_data

## save outputs
outputs = torch.LongTensor(outputs)
with open("instances_part1_op.pkl", "wb") as f:
    pickle.dump(outputs, f)
del outputs
print("successfully save the outputs data!")

Current process: 0 / 109999
Current process: 20000 / 109999
Current process: 40000 / 109999
Current process: 60000 / 109999
Current process: 80000 / 109999
Current process: 100000 / 109999
18070665
successfully save the inputs data!
successfully save the outputs data!


#### Load the training data

In [25]:
with open("instances_part2_ip.pkl", "rb") as f:
    inputs = pickle.load(f)
with open("instances_part2_op.pkl", "rb") as f:
    outputs = pickle.load(f)

### make the dataloader
class TrainData(data.Dataset):
    def __init__(self, model_inputs, model_outputs):
        super(TrainData, self).__init__()
        self.ip = model_inputs
        self.op = torch.Tensor.float(model_outputs)
    
    def __len__(self):
        return self.ip.shape[0]
    
    def __getitem__(self, idx):
        return self.ip[idx], self.op[idx]
    
loader = data.DataLoader(TrainData(inputs, outputs), batch_size, shuffle = True)    

In [26]:
print(inputs.shape)
print(outputs.shape)

torch.Size([19250151, 32])
torch.Size([19250151, 27])


### Prepare the Network


In [15]:
from torch.nn import TransformerEncoder, TransformerEncoderLayer

### add mask -- which is _ in the words
def get_attn_pad_mask(datald):
    batch_size, len_q = datald.size()
    pad_attn_mask = datald.data.eq(0).unsqueeze(1)
    return pad_attn_mask.expand(batch_size, len_q, len_q).cuda()

## add position
class PositionalEncoding(nn.Module):
    def __init__(self, d_model,max_len, dropout=0):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout) 
        
        pos_table = np.array([[pos / np.power(10000, 2 * i / d_model) for i in range(d_model)]
                                if pos != 0 else np.zeros(d_model) for pos in range(max_len)])
        pos_table[1:, 0::2] = np.sin(pos_table[1:, 0::2])                  
        pos_table[1:, 1::2] = np.cos(pos_table[1:, 1::2])                  
        self.pos_table = torch.FloatTensor(pos_table).cuda()            

    def forward(self, x):                                       
        x += self.pos_table[:x.size(1), :]
        return self.dropout(x.cuda())
    
### Attention modules
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()
    
    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1,-2)) / np.sqrt(d_k)
        scores.masked_fill_(attn_mask, -1e9)
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn,V)
        return context

class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model,d_k * n_heads, bias = False)
        self.W_K = nn.Linear(d_model,d_k * n_heads, bias = False)
        self.W_V = nn.Linear(d_model,d_v * n_heads, bias = False)
        self.fc = nn.Linear(n_heads * d_v, d_model, bias = False)
        
    def forward(self, input_Q, input_K, input_V, attn_mask):
        residual, batch_size = input_Q, input_Q.size(0)
        Q = self.W_Q(input_Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        K = self.W_K(input_K).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        V = self.W_V(input_V).view(batch_size, -1, n_heads, d_v).transpose(1,2)
        attn_mask = attn_mask.unsqueeze(1).repeat(1,n_heads,1,1)
        context = ScaledDotProductAttention()(Q,K,V,attn_mask)
        
        context = context.transpose(1,2).reshape(batch_size,-1,n_heads * d_v)
        output = self.fc(context)
        return nn.LayerNorm(d_model).cuda()(output + residual)
    
## feed forward module
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_ff, bias = False),
            nn.ReLU(),
            nn.Linear(d_ff, d_model, bias = False)
        )
    
    def forward(self, inputs):
        residual = inputs 
        output = self.fc(inputs)
        return nn.LayerNorm(d_model).cuda()(output + residual)

### Encoder layer
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()
        
    def forward(self, inputs,self_attn_mask):
        outputs = self.self_attn(inputs,inputs,inputs,self_attn_mask)
        outputs = self.pos_ffn(outputs)
        return outputs
    
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.src_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        
    def forward(self, inputs):
        outputs = self.src_emb(inputs)
        outputs = self.pos_emb(outputs)
        self_attn_mask = get_attn_pad_mask(inputs) ## inputs is from a data loader
        for layer in self.layers:
            outputs = layer(outputs, self_attn_mask)
        return outputs    

In [16]:
### my model
class Mymodel(nn.Module):
    def __init__(self):
        super(Mymodel, self).__init__()
        self.enc_model = Encoder().cuda()
        self.proj = nn.Sequential(
                    ## flattened inputs
                    nn.Linear(d_model * max_len,128),
                    nn.LeakyReLU(),
                    nn.Linear(128,64),
                    nn.LeakyReLU(),
                    ## output layer
                    nn.Linear(64, target_size),
                    nn.LeakyReLU()
        ).cuda()
        #self.softmax = nn.Softmax(dim = 1)
        
    def forward(self, inputs):
        outputs = self.enc_model(inputs)
        outputs = torch.flatten(outputs,start_dim = 1)
        outputs = self.proj(outputs)
        logits = torch.sigmoid(outputs)
        return logits

### Train the model!

In [18]:
mymodel = Mymodel().cuda()
mymodel.load_state_dict(torch.load("mymodel_weights_latest2.pth")) ## load the current model parameters
mymodel.eval()

In [27]:
### Train the model!
criterion = nn.BCELoss()
optimizer = optim.Adam(mymodel.parameters(), lr = 0.001)  #SGD(mymodel.parameters(), lr = 0.002, momentum = 0.99)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

for epoch in range(8):
    i = 0
    for ip, op in loader:
        ip, op = ip.cuda(), op.cuda()
        pred_op = mymodel(ip)
        loss = criterion(pred_op, op)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        i +=1
        if i % 2000 == 1:
            print("Epoch:", '%04d'%(epoch+1),"Batch: %d"%i, "Loss = ", "%.4f"%loss)
        ## save the model
        if i % 50000 == 0:
            torch.save(mymodel.state_dict(), "mymodel_weights_latest2.pth")
    scheduler.step()

Epoch: 0001 Batch: 1 Loss =  0.2952
Epoch: 0001 Batch: 2001 Loss =  0.2701
Epoch: 0001 Batch: 4001 Loss =  0.3003
Epoch: 0001 Batch: 6001 Loss =  0.3134
Epoch: 0001 Batch: 8001 Loss =  0.2825
Epoch: 0001 Batch: 10001 Loss =  0.2821
Epoch: 0001 Batch: 12001 Loss =  0.2874
Epoch: 0001 Batch: 14001 Loss =  0.3010
Epoch: 0001 Batch: 16001 Loss =  0.2732
Epoch: 0001 Batch: 18001 Loss =  0.2647
Epoch: 0001 Batch: 20001 Loss =  0.2904
Epoch: 0001 Batch: 22001 Loss =  0.2766
Epoch: 0001 Batch: 24001 Loss =  0.2905
Epoch: 0001 Batch: 26001 Loss =  0.2838
Epoch: 0001 Batch: 28001 Loss =  0.3145
Epoch: 0001 Batch: 30001 Loss =  0.2734
Epoch: 0001 Batch: 32001 Loss =  0.3068
Epoch: 0001 Batch: 34001 Loss =  0.2878
Epoch: 0001 Batch: 36001 Loss =  0.2639
Epoch: 0001 Batch: 38001 Loss =  0.2739
Epoch: 0001 Batch: 40001 Loss =  0.2753
Epoch: 0001 Batch: 42001 Loss =  0.2719
Epoch: 0001 Batch: 44001 Loss =  0.2655
Epoch: 0001 Batch: 46001 Loss =  0.2593
Epoch: 0001 Batch: 48001 Loss =  0.2768
Epoch: 0

In [None]:
## save the current model
torch.save(mymodel.state_dict(), "mymodel2_weights_train_v2.pth")

## Test the result


In [22]:
def transform_input(x):
    x = x + "|"
    x = x.ljust(max_len, "_")
    ip = [src_vocab[c] for c in x]
    return torch.LongTensor(ip)

def returnAGuess(candidates, guessed, mymodel):
    test_case = transform_input(guessed).cuda()
    test_case.unsqueeze_(dim = 0)
    logits = mymodel(test_case)[0]
    ans = None
    maxx = 0
    for c in candidates:
        logit = logits[tgt_vocab[c]]
        if logit > maxx:
            maxx = logit
            ans = c
    return ans
       
def writeMyGame(theWord, print_ = False):
    n = len(theWord)
    guessed = "_"* n
    toguess = list(set(theWord))
    
    candidates = [chr(ord('a')+i) for i in range(26)]
    count = 0
    
    while guessed != theWord:
        g = returnAGuess(candidates, guessed, mymodel)
        if print_:
            print("/n==== Guess: %s"%g)
        candidates.remove(g)
        if g in toguess:
            toguess.remove(g)
            guessed = "".join(["_" if x in toguess else x for x in theWord])
            if print_:
                print("Right Guess, updated:",guessed)
        else:
            count += 1
            if print_:
                print("Wrong Guess, keep guessing.")
    
    return count  

In [25]:
writeMyGame("restroom", True)

/n==== Guess: e
Right Guess, updated: _e______
/n==== Guess: a
Wrong Guess, keep guessing.
/n==== Guess: i
Wrong Guess, keep guessing.
/n==== Guess: r
Right Guess, updated: re__r___
/n==== Guess: n
Wrong Guess, keep guessing.
/n==== Guess: o
Right Guess, updated: re__roo_
/n==== Guess: t
Right Guess, updated: re_troo_
/n==== Guess: s
Right Guess, updated: restroo_
/n==== Guess: d
Wrong Guess, keep guessing.
/n==== Guess: w
Wrong Guess, keep guessing.
/n==== Guess: p
Wrong Guess, keep guessing.
/n==== Guess: c
Wrong Guess, keep guessing.
/n==== Guess: g
Wrong Guess, keep guessing.
/n==== Guess: v
Wrong Guess, keep guessing.
/n==== Guess: b
Wrong Guess, keep guessing.
/n==== Guess: f
Wrong Guess, keep guessing.
/n==== Guess: u
Wrong Guess, keep guessing.
/n==== Guess: m
Right Guess, updated: restroom


12