### Coursework coding instructions (please also see full coursework spec)

Please choose if you want to do either Task 1 or Task 2. You should write your report about one task only.

For the task you choose you will need to do two approaches:
  - Approach 1, which can use pre-trained embeddings / models
  - Approach 2, which should not use any pre-trained embeddings or models
We should be able to run both approaches from the same colab file

#### Running your code:
  - Your models should run automatically when running your colab file without further intervention
  - For each task you should automatically output the performance of both models
  - Your code should automatically download any libraries required

#### Structure of your code:
  - You are expected to use the 'train', 'eval' and 'model_performance' functions, although you may edit these as required
  - Otherwise there are no restrictions on what you can do in your code

#### Documentation:
  - You are expected to produce a .README file summarising how you have approached both tasks

#### Reproducibility:
  - Your .README file should explain how to replicate the different experiments mentioned in your report

Good luck! We are really looking forward to seeing your reports and your model code!

In [None]:
# Data files are saved in the repository
!git clone https://github.com/wang007123/NLP-CW1.git

Cloning into 'NLP-CW1'...
remote: Enumerating objects: 109, done.
remote: Counting objects: 100% (109/109), done.
remote: Compressing objects: 100% (105/105), done.
remote: Total 109 (delta 29), reused 0 (delta 0), pack-reused 0
Receiving objects: 100% (109/109), 652.08 KiB | 6.15 MiB/s, done.
Resolving deltas: 100% (29/29), done.


In [None]:
# Imports
import torch
import sys
import spacy
import math
import copy

import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import pyplot as plt

from torch import nn
from torch.autograd import Variable
from copy import deepcopy
from sklearn.metrics import accuracy_score
from torchtext import data
from torchtext.vocab import Vectors


# CW original import
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, random_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import codecs

In [None]:
# Setting random seed and device
import random

SEED = 1

# Seed every global RNG the notebook can touch, not only PyTorch's: Python's
# and NumPy's generators are seeded too so that any library drawing from them
# (e.g. for shuffling) behaves the same from run to run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN for deterministic kernels so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

# Use the first GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [None]:
# The paths of the data files in the repository
# The clone cell above checks the repository out into a directory named
# 'NLP-CW1' (hyphen), so the paths must use that spelling.
# NOTE(review): the 'Matrixposer/data' sub-path is kept as-is — confirm it
# matches the repository layout.

train_file = '/content/NLP-CW1/Matrixposer/data/train.csv'
test_file = '/content/NLP-CW1/Matrixposer/data/test.csv'
val_file = '/content/NLP-CW1/Matrixposer/data/dev.csv'

In [None]:
# Number of epochs
# Passed to train() as `number_epoch` by both Matposer approach cells below.
epochs = 40

# Proportion of training data for train compared to dev
# Only used by the TF-IDF baseline cell, whose train_test_split call holds out
# (1 - train_proportion) of train.csv as its dev split.
train_proportion = 0.8

In [None]:
# Other parameters
class Config:
    """
    Hyper-parameters shared by the data pipeline and the Matposer model.

    The notebook uses the class object itself (``config = Config``) and
    overwrites attributes on it between experiments, so these are defaults.
    """

    # --- architecture ---
    N = 2              # number of stacked encoder layers
    d_model = 100      # embedding / feature width
    d_ff = 512         # hidden width of the feed-forward and interactor networks
    d_row = 60         # `out_row` of the Interactor
    output_size = 1    # width of the final linear layer (single regression score)
    dropout = 0.1      # dropout probability used throughout the model

    # --- optimisation ---
    lr = 0.013         # learning rate handed to Adam
    batch_size = 64    # batch size of the train / val / test iterators

    # --- data ---
    max_sen_len = 60   # `fix_length` of the text field
    pre_trained = True # whether pre-trained word vectors are loaded

In [None]:
# We define our training loop
def train(train_iter, dev_iter, model, number_epoch):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch

    Relies on the notebook-level globals `optimizer`, `criterion`, `device`
    and `dataset`; the test iterator of `dataset` is evaluated after every
    epoch alongside `dev_iter`.

    Inputs:
        train_iter: iterable of batches exposing `.text` and `.label` tensors
        dev_iter: validation batches, forwarded to eval()
        model: network to optimise, called as `model(batch.text)`
        number_epoch (int): total number of passes over `train_iter`

    Returns:
        The test-set predictions produced after the final epoch, or None when
        no epoch was run.
    """

    print("Training model.")
    test_pre = None

    # Epochs (1-based) at which the learning rate is halved: one third and
    # two thirds of the way through the run. The comparison has to be made
    # against the loop variable, and the optimiser is the notebook-level one.
    lr_decay_epochs = {int(number_epoch / 3), int(2 * number_epoch / 3)}

    for epoch in range(1, number_epoch+1):

        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0

        # Reduce learning rate as number of epochs increase
        if epoch in lr_decay_epochs:
            print("Reducing LR")
            for group in optimizer.param_groups:
                group['lr'] = group['lr'] / 2

        for batch in train_iter:

            feature, target = batch.text.to(device), (batch.label).to(device)

            no_observations = no_observations + batch.label.shape[0]

            predictions = model(feature)

            optimizer.zero_grad()
            loss = criterion(predictions.view(-1), target)
            sse, __ = model_performance(predictions.view(-1).detach().cpu().numpy(), target.detach().cpu().numpy())

            loss.backward()

            optimizer.step()

            # criterion returns a per-batch mean, so weight it by the batch
            # size to obtain an exact per-observation average at epoch end.
            epoch_loss += loss.item()*target.shape[0]
            epoch_sse += sse

        valid_loss, valid_mse, pre, tar = eval(dev_iter, model)

        test_loss, test_mse, test_pre, test_tar = eval(dataset.test_iterator, model)

        epoch_loss, epoch_mse = epoch_loss/ no_observations, epoch_sse / no_observations


        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} |Train MSE: {epoch_mse:.2f} | Train RMSE: {(epoch_mse)**0.5:.2f} | \
         Val. Loss: {valid_loss:.2f} |Val. MSE: {valid_mse:.2f} |Val. RMSE: {valid_mse **0.5:.2f} | \
         Test. Loss: {test_loss:.2f} |Test. MSE: {test_mse:.2f} |Test. RMSE: {test_mse **0.5:.2f} |')

    # For hyper-parameter tuning, replace the return below with:
    # return number_epoch, lr, (epoch_mse)**0.5,epoch_loss, (valid_mse)**0.5,valid_loss, (test_mse)**0.5, test_loss,config.dropout
    return test_pre

In [None]:
def eval(data_iter, model):
    """
    Evaluating model performance on the batches of `data_iter`

    Uses the notebook-level `device` and `criterion`. Gradients are disabled
    and the model is switched to evaluation mode.

    Returns:
        (mean loss per observation, mean squared error per observation,
         array of all predictions, array of all targets)
    """
    model.eval()

    total_loss, total_sse, seen = 0, 0, 0
    collected_preds, collected_targets = [], []

    with torch.no_grad():
        for batch in data_iter:
            inputs = batch.text.to(device)
            labels = batch.label.to(device)

            batch_size = labels.shape[0]
            seen = seen + batch_size

            flat_output = model(inputs).view(-1)
            batch_loss = criterion(flat_output, labels)

            pred_np = flat_output.detach().cpu().numpy()
            label_np = labels.detach().cpu().numpy()
            batch_sse, _ = model_performance(pred_np, label_np)

            # Weight the per-batch mean loss by the batch size so the final
            # division yields a per-observation average.
            total_loss += batch_loss.item() * batch_size
            total_sse += batch_sse
            collected_preds.extend(pred_np)
            collected_targets.extend(label_np)

    mean_loss = total_loss / seen
    mean_sq_error = total_sse / seen
    return mean_loss, mean_sq_error, np.array(collected_preds), np.array(collected_targets)

In [None]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Returns SSE and MSE per batch (optionally printing the MSE and the RMSE)

    Inputs:
        output: predictions; anything np.asarray accepts (array, list,
            pandas Series, CPU tensor)
        target: ground-truth values of the same length
        print_output (bool): when True, print the MSE and RMSE

    Returns:
        (sse, mse): sum and mean of the element-wise squared errors
    """
    # Coerce both sides to plain arrays so the comparison is always positional
    # (no pandas index alignment, no reliance on cross-library operator
    # fallbacks) and so that plain Python sequences are accepted as well.
    output = np.asarray(output)
    target = np.asarray(target)

    sq_error = (output - target)**2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)

    if print_output:
        rmse = np.sqrt(mse)
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

In [None]:
class Matposer(nn.Module):
    """
    Sentence-level regressor: embeddings + positional encoding, a stack of
    N Interactor/feed-forward encoder layers, sum pooling and a linear head.

    Inputs to __init__:
        config: object exposing d_row, N, dropout, d_model, d_ff, output_size
        src_vocab: forwarded to Embeddings (a vocabulary, or a matrix of
            pre-trained vectors when `pre_trained` is set)
        pre_trained (bool): forwarded to Embeddings
    """

    def __init__(self, config, src_vocab, pre_trained=False):
        super(Matposer, self).__init__()
        self.config = config
        # Optional optimiser handle for reduce_lr(); nothing in this class
        # sets it, callers may assign it or pass an optimiser explicitly.
        self.optimizer = None

        d_row, N, dropout = self.config.d_row, self.config.N, self.config.dropout
        d_model, d_ff = self.config.d_model, self.config.d_ff

        inter = Interactor(d_model, d_ff, out_row=d_row, dropout=dropout)
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)

        self.encoder = Encoder(EncoderLayer(d_model, deepcopy(inter), deepcopy(ff), dropout), N)
        self.src_embed = nn.Sequential(
            Embeddings(d_model, src_vocab, pre_trained), deepcopy(position)
        )
        self.fc = nn.Linear(
            d_model,
            self.config.output_size
        )


    def forward(self, x):
        """
        Map a batch of token indices to one output vector per example.

        The two axes of `x` are swapped before embedding — presumably
        (seq_len, batch) -> (batch, seq_len) as produced by the torchtext
        iterators; confirm against the caller.
        """
        embedded_sents = self.src_embed(x.permute(1, 0))
        encoded_sents = self.encoder(embedded_sents)
        # Pool the encoder output by summing over axis 1.
        final_feature_map = torch.sum(encoded_sents,1)
        final_out = self.fc(final_feature_map)
        return final_out

    def reduce_lr(self, optimizer=None):
        """
        Halve the learning rate of every parameter group.

        Inputs:
            optimizer: optimiser to adjust; defaults to `self.optimizer`

        Raises:
            AttributeError: if no optimiser is passed and none is attached.
        """
        target = optimizer if optimizer is not None else getattr(self, "optimizer", None)
        if target is None:
            raise AttributeError(
                "reduce_lr() needs an optimizer: pass one or set model.optimizer")
        print("Reducing LR")
        for g in target.param_groups:
            g['lr'] = g['lr'] / 2

In [None]:
class Encoder(nn.Module):
    '''
    Matposer Encoder
    N deep copies of `layer` applied in sequence, followed by a final
    layer normalisation sized by `layer.size`.
    '''
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x):
        "Run the input through every layer in order, then normalise."
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        normalised = self.norm(hidden)
        return normalised


class EncoderLayer(nn.Module):
    '''
    An encoder layer
    Applies the Interactor directly, then the feed forward network wrapped
    in SublayerOutput (which adds the residual connection, dropout and
    layer norm around the feed forward network only).
    '''
    def __init__(self, size, interactor, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.interactor = interactor
        self.feed_forward = feed_forward
        self.sublayer = SublayerOutput(size, dropout)

    def forward(self, x):
        "Matposer Encoder"
        mixed = self.interactor(x)
        return self.sublayer(mixed, self.feed_forward)


class PositionwiseFeedForward(nn.Module):
    "Two-layer feed-forward network applied along the last axis."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        "Computes w_2(dropout(relu(w_1(x))))."
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

class Column_wise_nn(nn.Module):
    def __init__(self, d_row, d_column, d_ff, dropout=None):
        '''
        initialize column-wise neural network
        :param d_row: input row number (size of axis 1 of the input)
        :param d_column: output row number (size of axis 1 of the output)
        :param d_ff: middle size row number
        :param dropout: dropout probability, default None (no dropout)
        '''
        super(Column_wise_nn, self).__init__()
        self.w_1 = nn.Linear(d_row, d_ff)
        self.w_2 = nn.Linear(d_ff, d_column)
        if dropout is not None:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None

    def forward(self, x):
        '''
        Apply the two-layer network along axis 1 of a 3-D input.

        Axes 1 and 2 are swapped so the linear layers act on the row axis,
        the result is scaled by 1/sqrt(d_row), and the axes are swapped back.
        '''
        x = x.permute(0,2,1)
        d_k = x.size(-1)
        hidden = F.relu(self.w_1(x))
        # Dropout is optional: skip it entirely when none was configured.
        if self.dropout is not None:
            hidden = self.dropout(hidden)
        output = self.w_2(hidden) / math.sqrt(d_k)
        if self.dropout is not None:
            output = self.dropout(output)
        return output.permute(0,2,1)

class Row_wise_nn(nn.Module):
    def __init__(self, d_column, d_ff, out_row, dropout=None):
        '''
        initialize row-wise neural network
        :param d_column: size of the last axis of the input
        :param d_ff: middle size
        :param out_row: size of the last axis of the output
        :param dropout: dropout probability, default None (no dropout)
        '''
        super(Row_wise_nn, self).__init__()
        self.w_1 = nn.Linear(d_column, d_ff)
        self.w_2 = nn.Linear(d_ff, out_row)
        if dropout is not None:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None

    def forward(self, x):
        '''
        Apply the two-layer network along the last axis, scale by
        1/sqrt(d_column) and take a softmax over the `out_row` outputs.
        '''
        d_k = x.size(-1)
        hidden = F.relu(self.w_1(x))
        # Dropout is optional: skip it entirely when none was configured.
        if self.dropout is not None:
            hidden = self.dropout(hidden)
        output = self.w_2(hidden) / math.sqrt(d_k)
        output = F.softmax(output, dim=-1)
        if self.dropout is not None:
            output = self.dropout(output)

        return output


class Interactor(nn.Module):
    def __init__(self, d_column, d_ff, out_row=30, dropout=0.1):
        '''
        :param d_column: dimension of input column number
        :param d_ff: dimension of middle neural
        :param out_row: dimension of output row number, default 30
        :param dropout: default 0.1
        '''
        super().__init__()
        self.column_wise_nn = Column_wise_nn(out_row, d_column, d_ff, dropout)
        self.row_wise_nn = Row_wise_nn(d_column, d_ff, out_row, dropout)

    def forward(self, x):
        '''
        Weight the rows of `x` with the row-wise network's output, then pass
        the resulting `out_row` rows through the column-wise network.
        '''
        row_weights = self.row_wise_nn(x)
        weights_t = row_weights.permute(0,2,1)
        mixed_rows = torch.matmul(weights_t, x)
        return self.column_wise_nn(mixed_rows)


class LayerNorm(nn.Module):
    "Layer normalisation over the last axis with a learnable gain and bias."

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias
        self.eps = eps

    def forward(self, x):
        "Centre by the mean and divide by (std + eps), both along the last axis."
        centred = x - x.mean(-1, keepdim=True)
        scale = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centred / scale + self.b_2
        
        
class SublayerOutput(nn.Module):
    '''
    Residual wrapper around a sublayer: x + dropout(sublayer(norm(x))).
    The layer norm is applied to the sublayer's input.
    '''

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

In [None]:
class Embeddings(nn.Module):
    '''
    Embedding lookup whose output is multiplied by sqrt(d_model).

    When `pre_trained` is False a new table with len(vocab) rows is created;
    otherwise `vocab` is handed to nn.Embedding.from_pretrained as the
    weight matrix.
    '''

    def __init__(self, d_model, vocab, pre_trained=False):
        super().__init__()
        self.d_model = d_model
        if pre_trained is not False:
            lookup = nn.Embedding.from_pretrained(vocab)
        else:
            lookup = nn.Embedding(len(vocab), d_model)
        self.lut = lookup

    def forward(self, x):
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale


class PositionalEncoding(nn.Module):
    """
    Implement the PE function

    Adds fixed sinusoidal position codes to the input and applies dropout.
    Even feature columns hold sines and odd columns hold cosines of
    position * 10000^(-2i / d_model).

    Inputs to __init__:
        d_model (int): feature width of the input (odd values are supported)
        dropout (float): dropout probability applied after the addition
        max_len (int): number of positions pre-computed
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        # The angles are formed in float64 and only rounded to float32 when
        # written into the table.
        position = torch.arange(0, max_len).unsqueeze(1).double()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model)).double()
        angles = position * div_term.unsqueeze(0)

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency before filling the cosine slots.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        # A buffer moves with the module across devices but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        "Add the codes for the first x.size(1) positions, then apply dropout."
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


def get_embedding_matrix(vocab_chars):
    "Return a one-hot embedding: the square identity matrix of size len(vocab_chars)."
    return np.identity(len(vocab_chars))

def clones(module, N):
    "Return an nn.ModuleList holding N independent deep copies of `module`."
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

In [None]:
class Dataset(object):
    """
    Builds the torchtext iterators (train / val / test) and the vocabulary
    from the coursework CSV files.
    """

    def __init__(self, config):
        self.config = config
        self.vocab = []
        self.word_embeddings = {}
        self.train_iterator = self.test_iterator = self.val_iterator = None

    @staticmethod
    def _read_split(csv_path, datafields):
        "Read one CSV and wrap its (original, meanGrade) rows as a torchtext dataset."
        frame = pd.read_csv(csv_path)[['original', 'meanGrade']]
        examples = [
            data.Example.fromlist(row, datafields) for row in frame.values.tolist()]
        return data.Dataset(examples, datafields)

    def load_data(self, train_file, test_file, val_file=None, pre_trained=False):
        '''
        Loads the data from files
        Sets up iterators for training, validation and test data
        Also create vocabulary and word embeddings based on the data
        Inputs:
            train_file (String): absolute path to training file
            test_file (String): absolute path to test file
            val_file (String): absolute path to validation file; when omitted,
                20% of the training data is split off instead
            pre_trained (bool): build the vocabulary with 'glove.6B.100d'
                vectors and store the vector matrix in self.vocab; otherwise
                self.vocab holds the vocabulary object itself
        '''
        # Loading Tokenizer
        NLP = spacy.load('en')

        def tokenizer(sent):
            # Keep every spaCy token except single-space tokens.
            return [tok.text for tok in NLP.tokenizer(sent) if tok.text != " "]

        # Creating Field for data
        TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=self.config.max_sen_len)
        LABEL = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load data from csv into torchtext.data.Dataset
        train_data = self._read_split(train_file, datafields)
        test_data = self._read_split(test_file, datafields)

        # If validation file exists, load it. Otherwise get validation data
        # from training data
        if val_file:
            val_data = self._read_split(val_file, datafields)
        else:
            train_data, val_data = train_data.split(split_ratio=0.8)

        # The vocabulary is built from the training split only.
        if pre_trained:
            TEXT.build_vocab(train_data, vectors='glove.6B.100d')
            self.vocab = TEXT.vocab.vectors
        else:
            TEXT.build_vocab(train_data)
            self.vocab = TEXT.vocab

        def make_iterator(split, shuffle):
            return data.BucketIterator(
                (split),
                batch_size=self.config.batch_size,
                sort_key=lambda x: len(x.text),
                repeat=False,
                shuffle=shuffle
            )

        # Only the training batches are shuffled.
        self.train_iterator = make_iterator(train_data, True)
        self.val_iterator = make_iterator(val_data, False)
        self.test_iterator = make_iterator(test_data, False)

#### Approach ONE

In [None]:
# Approach ONE
# Matposer regressor with pre-trained word vectors (config.pre_trained = True).
torch.cuda.empty_cache()
# NOTE: `config` is the Config class itself, not an instance, so attribute
# assignments here and in later cells mutate shared class-level state.
config = Config
config.pre_trained = True

dataset = Dataset(config)
dataset.load_data(train_file, test_file, val_file, config.pre_trained)

    
model = Matposer(config, dataset.vocab, config.pre_trained)
model.to(device)

# `optimizer`, `criterion` and `dataset` are read as globals inside train() / eval().
optimizer = optim.Adam(model.parameters(), lr=config.lr)

criterion = nn.MSELoss()
criterion = criterion.to(device)
# train() returns the test-set predictions produced after the last epoch.
predicted = train(dataset.train_iterator, dataset.val_iterator,model, epochs)

Training model.
| Epoch: 01 | Train Loss: 1109.61 |Train MSE: 1109.61 | Train RMSE: 33.31 |          Val. Loss: 0.50 |Val. MSE: 0.50 |Val. RMSE: 0.71 |          Test. Loss: 0.50 |Test. MSE: 0.50 |Test. RMSE: 0.71 |
| Epoch: 02 | Train Loss: 2.54 |Train MSE: 2.54 | Train RMSE: 1.59 |          Val. Loss: 0.38 |Val. MSE: 0.38 |Val. RMSE: 0.61 |          Test. Loss: 0.38 |Test. MSE: 0.38 |Test. RMSE: 0.61 |
| Epoch: 03 | Train Loss: 2.27 |Train MSE: 2.27 | Train RMSE: 1.51 |          Val. Loss: 0.36 |Val. MSE: 0.36 |Val. RMSE: 0.60 |          Test. Loss: 0.36 |Test. MSE: 0.36 |Test. RMSE: 0.60 |
| Epoch: 04 | Train Loss: 2.06 |Train MSE: 2.06 | Train RMSE: 1.43 |          Val. Loss: 0.38 |Val. MSE: 0.38 |Val. RMSE: 0.62 |          Test. Loss: 0.38 |Test. MSE: 0.38 |Test. RMSE: 0.62 |
| Epoch: 05 | Train Loss: 1.93 |Train MSE: 1.93 | Train RMSE: 1.39 |          Val. Loss: 1.03 |Val. MSE: 1.03 |Val. RMSE: 1.01 |          Test. Loss: 1.04 |Test. MSE: 1.04 |Test. RMSE: 1.02 |
| Epoch: 06 | Tra

#### Approach 2: No pre-trained representations

In [None]:
# Load data
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
# Full 'edit' column, i.e. the rows of both the train and the dev split below.
train_and_dev = train_df['edit']

# Hold out (1 - train_proportion) of train.csv as a dev split (fixed seed).
training_data, dev_data, training_y, dev_y = train_test_split(train_df['edit'], train_df['meanGrade'],
                                                                        test_size=(1-train_proportion),
                                                                        random_state=42)

# We train a Tf-idf model
# Vocabulary and IDF weights are fitted on the training split only.
count_vect = CountVectorizer(stop_words='english')
train_counts = count_vect.fit_transform(training_data)
transformer = TfidfTransformer().fit(train_counts)
train_counts = transformer.transform(train_counts)
regression_model = LinearRegression().fit(train_counts, training_y)

# Train predictions
predicted_train = regression_model.predict(train_counts)

# Calculate Tf-idf using train and dev, and validate model on dev:
# NOTE(review): `transformer` is re-fitted here on train+dev counts, so the dev
# features use different IDF weights from the ones the regression was trained
# on, and dev data influences its own features — confirm this is intended.
test_and_test_counts = count_vect.transform(train_and_dev)
transformer = TfidfTransformer().fit(test_and_test_counts)

test_counts = count_vect.transform(dev_data)

test_counts = transformer.transform(test_counts)

# Dev predictions
# NOTE: this overwrites the notebook-level `predicted` set by the Approach ONE cell.
predicted = regression_model.predict(test_counts)

# We run the evaluation:
print("\nTrain performance:")
sse, mse = model_performance(predicted_train, training_y, True)

print("\nDev performance:")
sse, mse = model_performance(predicted, dev_y, True)


Train performance:
| MSE: 0.13 | RMSE: 0.37 |

Dev performance:
| MSE: 0.36 | RMSE: 0.60 |


#### Baseline for task 2

In [None]:
# Baseline for the task
# Constant predictor: the mean training grade for every dev example. A NumPy
# array is used so the subtraction against the pandas target inside
# model_performance does not mix torch tensors with pandas objects.
pred_baseline = np.full(len(dev_y), np.mean(training_y))
print("\nBaseline performance:")
sse, mse = model_performance(pred_baseline, dev_y, True)


Baseline performance:
| MSE: 0.34 | RMSE: 0.58 |


#### Approach TWO

In [None]:
# Approach TWO: same Matposer architecture, but the embedding table is a
# newly created nn.Embedding instead of pre-trained vectors.
config.pre_trained = False
config.lr = 0.0001
config.dropout = 0.1
# Reuses `dataset` (and its iterators) from the Approach ONE cell. There
# `dataset.vocab` was set to the pre-trained vector matrix; with
# pre_trained=False Embeddings only reads len(vocab) from it — presumably the
# vocabulary size; confirm.
model = Matposer(config, dataset.vocab, config.pre_trained)
model.to(device)

# Rebinds the globals that train() / eval() read.
optimizer = optim.Adam(model.parameters(), lr=config.lr)

criterion = nn.MSELoss()
criterion = criterion.to(device)
# Overwrites `predicted` with this model's final-epoch test predictions.
predicted = train(dataset.train_iterator, dataset.val_iterator,model, epochs)

Training model.
| Epoch: 01 | Train Loss: 36.38 |Train MSE: 36.38 | Train RMSE: 6.03 |          Val. Loss: 10.82 |Val. MSE: 10.82 |Val. RMSE: 3.29 |          Test. Loss: 10.75 |Test. MSE: 10.75 |Test. RMSE: 3.28 |
| Epoch: 02 | Train Loss: 7.55 |Train MSE: 7.55 | Train RMSE: 2.75 |          Val. Loss: 1.50 |Val. MSE: 1.50 |Val. RMSE: 1.22 |          Test. Loss: 1.48 |Test. MSE: 1.48 |Test. RMSE: 1.22 |
| Epoch: 03 | Train Loss: 5.14 |Train MSE: 5.14 | Train RMSE: 2.27 |          Val. Loss: 0.80 |Val. MSE: 0.80 |Val. RMSE: 0.89 |          Test. Loss: 0.78 |Test. MSE: 0.78 |Test. RMSE: 0.88 |
| Epoch: 04 | Train Loss: 3.59 |Train MSE: 3.59 | Train RMSE: 1.89 |          Val. Loss: 1.38 |Val. MSE: 1.38 |Val. RMSE: 1.17 |          Test. Loss: 1.34 |Test. MSE: 1.34 |Test. RMSE: 1.16 |
| Epoch: 05 | Train Loss: 2.73 |Train MSE: 2.73 | Train RMSE: 1.65 |          Val. Loss: 1.18 |Val. MSE: 1.18 |Val. RMSE: 1.09 |          Test. Loss: 1.14 |Test. MSE: 1.14 |Test. RMSE: 1.07 |
| Epoch: 06 | Trai

#### Generate CSV output for competition

In [None]:
# This part of the code generates the competition output file.
# It writes one row per test example: its id and the matching entry of the
# notebook-level `predicted` array.
import csv
test_df_id = pd.read_csv(test_file)
test_df_id = test_df_id[['id']]
test_df_id = test_df_id.values.tolist()

# `predicted` is rebound by several cells above (including dev-set predictions
# of the TF-IDF model), so make sure it really holds one value per test row
# before pairing it with the ids.
if predicted is None or len(predicted) != len(test_df_id):
    raise ValueError(
        "`predicted` does not contain exactly one prediction per test example; "
        "re-run the cell of the model whose test predictions should be exported")

with open('task-1-output.csv', 'w', newline='') as outcsv:
    writer = csv.DictWriter(outcsv, fieldnames = ['id', 'pred'])
    writer.writeheader()
    for (example_id,), prediction in zip(test_df_id, predicted):
        writer.writerow({'id': str(example_id), 'pred': str(prediction)})


Hyper-parameter Tuning

In [None]:
'''
# Example code for our hyper-parameter tuning.
# To run this, you also need to modify the return statement of train()
# Approach 1
torch.cuda.empty_cache()
config = Config
train_file = '/content/NLP_CW1/Matrixposer/data/train.csv'
test_file = '/content/NLP_CW1/Matrixposer/data/test.csv'
val_file = '/content/NLP_CW1/Matrixposer/data/dev.csv'

pre_output = []
no_pre_output = []

# Number of epochs
num_epochs = [40]
#num_epochs = np.arange(10,80,10)
num_lr = [0.013]
#num_lr = np.arange(0.0001,0.1,0.001)
dropouts = np.arange(0,1,0.02)

config.pre_trained = True
for epochs in num_epochs:
    for lr in num_lr:
        for dropout_ in dropouts:
            config.dropout = dropout_ 
            config.lr = lr
            dataset = Dataset(config)
            dataset.load_data(train_file, test_file, val_file, config.pre_trained)
            model = Matposer(config, dataset.vocab, config.pre_trained)
            if torch.cuda.is_available():
                model.cuda()
            optimizer = optim.Adam(model.parameters(), lr=config.lr)
            criterion = nn.MSELoss()
            criterion = criterion.cuda()
            hype_epochs, hype_lr, hype_train, hype_train_loss, hype_valid, hype_valid_loss, hype_test,hype_test_loss,hype_drop = train(dataset.train_iterator, dataset.val_iterator,model, epochs)
            pre_output.append([hype_epochs, hype_lr, hype_train, hype_train_loss, hype_valid, hype_valid_loss, hype_test,hype_test_loss,hype_drop])

# Approach 2
config.pre_trained = False
num_epochs = [40]
#num_epochs = np.arange(10,80,10)
num_lr = [0.0001]
#num_lr = np.arange(0.0001,0.1,0.001)
dropouts = np.arange(0,1,0.02)
for epochs in num_epochs:
    for lr in num_lr:
        for dropout_ in dropouts:
            config.dropout = dropout_ 
            config.lr = lr
            dataset = Dataset(config)
            dataset.load_data(train_file, test_file, val_file, config.pre_trained)
            model = Matposer(config, dataset.vocab, config.pre_trained)
            if torch.cuda.is_available():
                model.cuda()
            optimizer = optim.Adam(model.parameters(), lr=config.lr)
            criterion = nn.MSELoss()
            criterion = criterion.cuda()
            hype_epochs, hype_lr, hype_train, hype_train_loss, hype_valid, hype_valid_loss, hype_test,hype_test_loss,hype_drop = train(dataset.train_iterator, dataset.val_iterator,model, epochs)
            no_pre_output.append([hype_epochs, hype_lr, hype_train, hype_train_loss, hype_valid, hype_valid_loss, hype_test,hype_test_loss,hype_drop])
'''

In [None]:
'''
# Example code for generating the hyper-parameter graphs.
def plot_test(input_data, pre):
    plt.figure()
    for epoch_ in num_epochs:
        drop_list,epoch_list, lr_list, train_RMSE, valid_RMSE, test_RMSE, tran_loss, valid_loss, test_loss = [],[],[],[],[],[],[],[],[]
        for item in input_data:
            if epoch_ == item[0]: 
                lr_list.append(item[1])
                valid_RMSE.append(item[4])
                drop_list.append(item[8]) 
        plt.plot(drop_list, valid_RMSE, label = 'Valid RMSE epoch = {}'.format(epoch_))
    plt.xlabel('Dropout')
    plt.ylabel('RMSE')
    if pre:
        plt.title('Dropout V.S RMSE with pre-trained')
    else:
        plt.title('Dropout V.S RMSE with no pre-trained')
        plt.ylim((0.4,1.5))
    plt.legend(loc='upper left')
    if pre:
        plt.savefig('/content/hype_drop_testRmse_pre_.jpg')
    else:
        plt.savefig('/content/hype_drop_testRmse_nopre.jpg')
    plt.show()

    plt.figure()
    for epoch_ in num_epochs:
        drop_list, epoch_list, lr_list, train_RMSE, valid_RMSE, test_RMSE, tran_loss, valid_loss, test_loss = [],[],[],[],[],[],[],[],[]
        for item in input_data:
            if epoch_ == item[0]:
                lr_list.append(item[1])
                valid_loss.append(item[5])
                drop_list.append(item[8])
        plt.plot(drop_list, valid_loss, label = 'valid Loss epoch = {}'.format(epoch_))
    plt.xlabel('Dropout')
    plt.ylabel('Loss')

    if pre:
        plt.title('Dropout V.S Loss with pre-trained')
    else:
        plt.ylim((0.4,1.5))
        plt.title('Dropout V.S Loss with no pre-trained')

    plt.legend(loc='upper left')
    if pre:
        plt.savefig('/content/hype_drop_testLoss_pre.jpg')
    else:
        plt.savefig('/content/hype_drop_testLoss_nopre.jpg')
    plt.show()

plot_test(pre_output,True)
plot_test(no_pre_output,False)
'''