### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev cipher text, and unlabeled test cipher text, into lists.

In [109]:
train, dev, test = [], [], []

In [110]:
for x in open('./train_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    train.append(x)
print (len(train))
print (train[:3])

16220
[[0, 'lkêcê yoúc cêêö y#êjl lw mówám Újám j Úêê# ütlk Úol lkêú z#ê ctöé8ú ówl xoóóú éê#xw#öê#c .'], [0, '6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy .'], [0, 'tx lktc kw8t6jú öw2tê tc coééwcê6 lw Úê j ytxl , cwöêÚw6ú oóü#jééê6 tl êj#8ú , lwwm wol j88 lkê yww6 cloxx , jó6 8êxl Úêktó6 lkê á#jé ( 8tlê#j88ú ) .']]


In [111]:
for x in open('./dev_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    dev.append(x)
print (len(dev))
print (dev[:3])

2027
[[1, 'ów8jó Ú#j2ê8ú l#êj6c ükê#ê xêü jöê#tájó xt8öc 6j#ê lw 6ê82ê 77 tólw lkê üw#86 wx jöÚt2j8êóáê jó6 jöÚtyotlú <<<'], [0, 'ê2êó öo#ékú zc ê+éê#l áwötá ltötóy jó6 xjöê6 ákj#tcöj áj ózl #êcáoê lktc êxxw#l .'], [1, 'üt88 jcco#ê68ú #jóm jc wóê wx lkê á8ê2ê#êcl , öwcl 6êáêélt2ê8ú jöoctóy áwöê6têc wx lkê úêj# .']]


#### Different from 'train' and 'dev' that are both list of tuples, 'test' will be just a list.

In [112]:
for x in open('./test_enc_unlabeled.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r')
    test.append(x)
print (len(test))
print (test[:3])

2028
['j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú .', 'ówlktóy cltámc , #êj88ú , ê+áêél j 8tóyê#tóy á#êêétóêcc wóê xêê8c x#wö Úêtóy 6#jyyê6 lk#woyk j cj6 , cw#6t6 oót2ê#cê wx yoóc , 6#oyc , j2j#táê jó6 6jöjyê6 6#êjöc .', 'öo#ékú jó6 üt8cwó jáloj88ú öjmê j é#êllú yww6 lêjö <<< Úol lkê é#wvêál co##woó6tóy lkêö tc 6tcl#êcctóy8ú #wlê .']


#### You can split every sentence into lists of words by white spaces.

In [113]:
train_split = [[x[0], x[1].split(' ')] for x in train]
dev_split = [[x[0], x[1].split(' ')] for x in dev]
test_split = [x.split(' ') for x in test]
print(train_split[1])
print(len(train_split))
print(test_split[0])

[0, ['6êcétlê', 'jolêot8', 'zc', 'éê#xw#öjóáê', ',', 'tl', 'zc', 'j', '#jlkê#', '8tcl8êcc', 'jöÚ8ê', '6wüó', 'lkê', 'öt668ê', 'wx', 'lkê', '#wj6', ',', 'ükê#ê', 'lkê', 'lkêöjltá', 't#wótêc', 'j#ê', 'lww', 'wÚ2twoc', 'jó6', 'lkê', 'cê+oj8', 'éw8tltác', 'lww', 'cöoy', '.']]
16220
['j', '6t6jáltá', 'jó6', '6o88', '6wáoöêólj#ú', 'y8w#txútóy', 'cwxlüj#ê', 'jój#ákú', '.']


### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

In [114]:
# Eventually, results need to be a list of 2028 0 or 1's
results = []

In [115]:
from gensim.models import word2vec
import os

def train_word2vec(x):
    model = word2vec.Word2Vec(x, vector_size=250, window=5, min_count=5, workers=12, epochs=10, sg=1)
    return model

def parse_data_with_label(data_list):
    data = list()
    label = list()
    for raw_data in data_list:
        label.append(raw_data[0])
        data.append(raw_data[1])
    return data,label

train_data,train_label = parse_data_with_label(train_split)

dev_data, dev_label = parse_data_with_label(dev_split)

test_data = test_split

model = train_word2vec(train_data+dev_data+test_split)

model.save("word2vec.model")

print(len(train_data))


16220


In [116]:
from torch import nn
from gensim.models import word2vec
class Preprocess():
    def __init__(self, sentences, sen_len, path="./word2vec.model"):
        self.path = path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = list()
        self.word2idx = dict()
        self.embedding_matrix = list()
        
    def get_model(self):
        self.embedding = word2vec.Word2Vec.load(self.path)
        self.embedding_dim = self.embedding.vector_size
        
    def add_embedding(self, word):
        vector = torch.empty(1,self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix,vector],0)
        
    def make_embedding(self):
        self.get_model()
        for i,word in enumerate(self.embedding.wv.key_to_index):
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding.wv[word])
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        self.add_embedding("")
        return self.embedding_matrix
    
    def pad_sentence(self, sentence):
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for i in range(pad_len):
                sentence.append(self.word2idx[""])
        assert len(sentence) == self.sen_len
        return sentence
    
    def sentence_word2idx(self):
        sen_list = list()
        for i, sen in enumerate(self.sentences):
            sentence_idx = list()
            for word in sen:
                if word in self.word2idx.keys():
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx[""])
            sentence_idx = self.pad_sentence(sentence_idx)
            sen_list.append(sentence_idx)
        return torch.LongTensor(sen_list)
    
    def labels_to_tensor(self, y):
        y = [int(label) for label in y]
        return torch.LongTensor(y)
    

In [117]:
from torch.utils import data

class CiperDataset(data.Dataset):
    
    def __init__(self,X,y):
        self.data = X
        self.label = y
        
    def __getitem__(self, idx):
        if self.label is None:
            return self.data[idx]
        return self.data[idx], self.label[idx]
    
    def __len__(self):
        return len(self.data)

In [118]:
import torch
from torch import nn

# class LSTM_Net(nn.Module):
#     def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout = 0.5, fix_embedding = True):
#         super(LSTM_Net, self).__init__()
#         self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
#         self.embedding_weight = torch.nn.Parameter(embedding)
#         self.embedding.weight.requires_grad = False if fix_embedding else True
#         self.embedding_dim = embedding.size(1)
#         self.hidden_dim = hidden_dim
#         self.num_layers = num_layers
#         self.dropout = dropout
#         self.lstm = nn.LSTM(embedding_dim, 
#                             hidden_dim, 
#                             num_layers = num_layers, 
#                             batch_first = True)
#         self.classifier = nn.Sequential( nn.Dropout(dropout),
#                                          nn.Linear(hidden_dim, 1),
#                                          nn.Sigmoid() )
#     def forward(self, inputs):
#         inputs = self.embedding(inputs)
#         x, _ = self.lstm(inputs, None)
#         x = x[:, -1, :] 
#         x = self.classifier(x)
#         return x

class LSTM_Net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, n_layers,
                 bidirectional, dropout, fix_embedding = True):
        super(LSTM_Net, self).__init__()
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        #self.fc1 = nn.Linear(hidden_dim1 * 2, hidden_dim2)
        self.fc2 = nn.Linear(hidden_dim*2, 1)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        inputs = self.embedding(inputs)

        packed_output, (hidden, cell) = self.lstm(inputs)
        
        cat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        drop = self.dropout(cat)
        #dense1 = self.fc1(drop)
        preds = self.fc2(drop)
        rel = self.sigmoid(preds)
        return rel
    

In [119]:
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
    
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    model.train()
    criterion = nn.BCELoss()
    t_batch = len(train) 
    v_batch = len(valid) 
    optimizer = optim.Adam(model.parameters(), lr=lr)
    total_loss, total_acc, best_acc = 0, 0, 0
    for epoch in range(n_epoch):
        total_loss, total_acc = 0, 0
        for i, (inputs, labels) in enumerate(train):
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            correct = evaluation(outputs, labels)
            total_acc += (correct / batch_size)
            total_loss += loss.item()
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
                epoch+1, i+1, t_batch, loss.item(), correct*100/batch_size), end='\r')
        print('\n Train | Loss:{:.3f} Acc: {:.3f}'.format(total_loss/t_batch, total_acc/t_batch*100))
        
        #validation
        model.eval()
        print("valid"+str(v_batch))
        with torch.no_grad():
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long) 
                labels = labels.to(device, dtype=torch.float)  
                outputs = model(inputs) 
                outputs = outputs.squeeze() 
                loss = criterion(outputs, labels) 
                correct = evaluation(outputs, labels) 
                total_acc += (correct / batch_size)
                total_loss += loss.item()
                
            print("Valid | Loss:{:.3f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
            
            if total_acc > best_acc:
                best_acc = total_acc
                torch.save(model, "{}/ctc.model".format(model_dir))
                
        print('-----------------------------------------')
        model.train()

In [120]:
def testing(batch_size, test_loader, model, device):
    model.eval()
    ret_output = []
    with torch.no_grad():
        for i, inputs in enumerate(test_loader):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs>=0.5] = 1
            outputs[outputs<0.5] = 0
            ret_output += outputs.int().tolist()
    
    return ret_output

In [121]:
import os
import torch
from torch import nn
from gensim.models import word2vec

def evaluation(outputs,labels):
    outputs[outputs>=0.5] = 1
    outputs[outputs<0.5] = 0
    correct = torch.sum(torch.eq(outputs, labels)).item()
    return correct

device = torch.device("cpu")

sen_len = 30
fix_embedding = True # fix embedding during training
batch_size = 64
epoch = 5
lr = 0.001

model_dir = "./"

print(len(train_data))

preprocess = Preprocess(train_data+dev_data, sen_len, path="./word2vec.model")
embedding = preprocess.make_embedding()

data = preprocess.sentence_word2idx()
label = preprocess.labels_to_tensor(train_label+dev_label)

print(len(data))

#model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=250, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=250,n_layers=3, bidirectional=True, dropout=0.5, fix_embedding=fix_embedding)
model = model.to(device)

X_train, X_dev, y_train, y_dev = data[:16220], data[16220:], label[:16220], label[16220:]

print()

train_dataset = CiperDataset(X=X_train, y=y_train)
dev_dataset = CiperDataset(X=X_dev, y=y_dev)

print(len(train_dataset))
print(len(dev_dataset))

train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = batch_size,
                                            shuffle = True,
                                            num_workers = 0,
                                            drop_last = True)

dev_loader = torch.utils.data.DataLoader(dataset = dev_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 0,
                                            drop_last = True)

print(len(train_loader))
print(len(dev_loader))

training(batch_size, epoch, lr, model_dir, train_loader, dev_loader, model, device)


16220
18247

16220
2027
253
31
[ Epoch1: 253/253 ] loss:0.560 acc:65.625 
 Train | Loss:0.606 Acc: 66.360
valid31
Valid | Loss:0.514 Acc: 74.345 
-----------------------------------------
[ Epoch2: 253/253 ] loss:0.378 acc:84.375 
 Train | Loss:0.417 Acc: 81.874
valid31
Valid | Loss:0.434 Acc: 80.393 
-----------------------------------------
[ Epoch3: 253/253 ] loss:0.323 acc:87.500 
 Train | Loss:0.273 Acc: 89.495
valid31
Valid | Loss:0.374 Acc: 84.829 
-----------------------------------------
[ Epoch4: 253/253 ] loss:0.217 acc:92.188  
 Train | Loss:0.180 Acc: 93.423
valid31
Valid | Loss:0.381 Acc: 85.988 
-----------------------------------------
[ Epoch5: 253/253 ] loss:0.158 acc:95.312  
 Train | Loss:0.121 Acc: 95.695
valid31
Valid | Loss:0.425 Acc: 87.399 
-----------------------------------------


In [122]:
preprocess = Preprocess(test_data, sen_len, path="./word2vec.model")
embedding = preprocess.make_embedding()

test_data = preprocess.sentence_word2idx()

test_dataset = CiperDataset(X=test_data, y=None)

test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 0)

model = torch.load("./ctc.model")

results = testing(batch_size, test_loader, model, device)

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [106]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 2028)

In [107]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [108]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')