### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev cipher text, and unlabeled test cipher text, into lists.

In [1]:
train, dev, test = [], [], []

In [2]:
for x in open('./train_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    train.append(x)
# print (len(train))
# print (train[:3])

In [3]:
for x in open('./dev_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    dev.append(x)
# print (len(dev))
# print (dev[:3])

#### Different from 'train' and 'dev' that are both list of tuples, 'test' will be just a list.

In [4]:
for x in open('./test_enc_unlabeled.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r')
    test.append(x)
# print (len(test))
# print (test[:3])

#### You can split every sentence into lists of words by white spaces.

In [5]:
train_split = [[x[0], x[1].split(' ')] for x in train]
dev_split = [[x[0], x[1].split(' ')] for x in dev]
test_split = [[x.split(' ')] for x in test]

### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

#### Word Embedding

In [6]:
from gensim.models import Word2Vec
import gensim.downloader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
sen_list=[]
word_set=set()
word_counter=0
for sentence in train_split:
    sen_list.append(sentence[1])
    for word in sentence[1]:
        word_counter+=1
        word_set.add(word)
# print(word_counter)
# print(len(word_set))

wvmodel = Word2Vec(sen_list, min_count=1,vector_size = 100, epochs = 10)


#### Build Dataset

In [7]:
seq_len = []
label_list = []
train_dataset = []
max_len = 100
for a in train_split:
    word_embedding=[]
    label_list.append(a[0])
    seq_len.append(len(a[1]))
    for word in a[1]:
        word_embedding.append(wvmodel.wv[word])

    train_dataset.append(torch.tensor(word_embedding, dtype=torch.float))

for i in range(len(train_dataset)):
    train_dataset[i] = F.pad(train_dataset[i], pad=(0, 0, 0, max_len - train_dataset[i].shape[0]))

dev_seq_len = []
dev_label_list = []
dev_dataset = []
for a in dev_split:
    word_embedding=[]
    dev_label_list.append(a[0])
    dev_seq_len.append(len(a[1]))
    for word in a[1]:
        if word not in wvmodel.wv:
            word_embedding.append([0]*100)
        else:
            word_embedding.append(wvmodel.wv[word])

    dev_dataset.append(torch.tensor(word_embedding, dtype=torch.float))

for i in range(len(dev_dataset)):
    dev_dataset[i] = F.pad(dev_dataset[i], pad=(0, 0, 0, max_len - dev_dataset[i].shape[0]))

test_dataset = []
for a in test_split:
    word_embedding=[]
    for word in a[0]:
        if word not in wvmodel.wv:
            word_embedding.append([0]*100)
        else:
            word_embedding.append(wvmodel.wv[word])

    test_dataset.append(torch.tensor(word_embedding, dtype=torch.float))

for i in range(len(test_dataset)):
    test_dataset[i] = F.pad(test_dataset[i], pad=(0, 0, 0, max_len - test_dataset[i].shape[0]))









  train_dataset.append(torch.tensor(word_embedding, dtype=torch.float))


#### LSTM

In [8]:
# EMBEDDING_DIM = 100
# HIDDEN_DIM = 5
# class cipher_LSTM(nn.Module):

#     def __init__(self, embedding_dim, hidden_dim,max_len):
#         super(cipher_LSTM, self).__init__()
#         # self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim)
#         # self.multihead_attn = nn.MultiheadAttention(embedding_dim, hidden_dim)
#         self.linear1 = nn.Linear(hidden_dim, 1)
#         self.linear2 = nn.Linear(max_len, 1)

#     def forward(self, sentence):
#         # embeds = self.word_embeddings(sentence)
#         lstm_out, _ = self.lstm(sentence.view(len(sentence), 1, -1))
#         # lstm_out, _ = self.multihead_attn(sentence.view(len(sentence), 1, -1))
#         hidden_space = self.linear1(lstm_out.view(len(sentence), -1))
#         output = self.linear2(hidden_space.view(-1,len(sentence)))
#         output = F.sigmoid(output.squeeze())
#         return output

In [9]:
# lstm_model = cipher_LSTM(EMBEDDING_DIM, HIDDEN_DIM, max_len)
# loss_function = nn.BCELoss()
# optimizer = optim.SGD(lstm_model.parameters(), lr=0.001)

# for i in range(len(train_dataset)):
    
#     optimizer.zero_grad()

#     predict = lstm_model(train_dataset[i])
#     # print(predict.view(-1,1))
#     label = label_list[i]
#     label = torch.tensor(label,dtype=torch.float)
#     # print(label.view(-1,1))
#     loss = loss_function(predict.view(-1), label.view(-1))
#     # print(loss.data)
#     loss.backward()
#     optimizer.step()
#     # if i == 1000:
#     #     break
# correct = 0
# for i in range(100):
#     predict = lstm_model(train_dataset[i])
    
#     if int(torch.round(predict).data)==label_list[i]:
#         correct+=1
# print(correct)
# correct = 0
# # for i in range(100):
# for i in range(len(dev_dataset)):
#     predict = lstm_model(dev_dataset[i])
#     # print(predict.data,dev_label_list[i])
#     if int(torch.round(predict).data)==dev_label_list[i]:
#         correct+=1
# print(correct,len(dev_dataset))

#### Self-Attention

In [10]:
class selfattention(nn.Module):

    def __init__(self, embedding_dim,max_len):
        super(selfattention, self).__init__()

        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, dim_feedforward=1024, nhead=2)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)
        self.linear1 = nn.Linear(embedding_dim, 1)
        self.linear2 = nn.Linear(max_len, 1)

    def forward(self, sentence):

        encoder_out = self.encoder(sentence.view(len(sentence), 1, -1))

        hidden_space = self.linear1(encoder_out.view(len(sentence), -1))
        output = self.linear2(hidden_space.view(-1,len(sentence)))
        output = F.sigmoid(output.squeeze())
        return output

In [11]:
max_len = 100
EMBEDDING_DIM = 100
selfatten_model = selfattention(EMBEDDING_DIM, max_len)
loss_function = nn.BCELoss()
optimizer = optim.SGD(selfatten_model.parameters(), lr=0.01)
loss_list = []
for i in range(len(train_dataset)):
    optimizer.zero_grad()
    predict = selfatten_model(train_dataset[i])
    label = label_list[i]
    label = torch.tensor(label,dtype=torch.float)
    loss = loss_function(predict.view(-1), label.view(-1))
    loss.backward()
    optimizer.step()
    # if i == 100:
    #     break




#### Predict Test Data

In [12]:
results = []
for sentence in test_dataset:
    predict = selfatten_model(sentence)
    results.append(int(torch.round(predict.data)))


#### Evaluation

In [17]:
# correct = 0
# for i in range(1000):
# # for i in range(len(train_dataset)):
#     predict = selfatten_model(train_dataset[i])
    
#     if int(torch.round(predict).data)==label_list[i]:
#         correct+=1
# print(correct,len(train_dataset))
correct = 0
for i in range(len(dev_dataset)):
    predict = selfatten_model(dev_dataset[i])
    
    if int(torch.round(predict).data)==dev_label_list[i]:
        correct+=1
# print(correct/len(dev_dataset))

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [14]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 2028)

In [15]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [16]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')