In [1]:
# common packages
import random
import time

import pandas as pd

# DL framework
import torch
from torchtext import data

from attractivedata import AttractiveData
from trainer import AttractiveTrainer

In [2]:
# Seed every random number generator this notebook can reach, so that a fresh
# "Restart Kernel -> Run All" reproduces the same run.
seed_value = 42
# Python's stdlib RNG is independent of torch's and is not touched by
# torch.manual_seed; seed it too in case any shuffling/sampling goes through it.
random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value) # gpu vars
# Ask cuDNN for deterministic kernels and turn off its auto-tuner, which could
# otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True  #needed
torch.backends.cudnn.benchmark = False

## Load and prepare data

In [3]:
# Locations of the three dataset splits and the name of the pretrained vectors.
train_file, val_file, test_file = 'data/train.csv', 'example/val.csv', 'data/test.csv'
pretrained_file = 'glove.840B.300d'

# Data-preparation settings handed to AttractiveData; model/training settings
# are added to this same dict further down.
config = dict(
    max_seq=40,
    min_freq=0,
    batch_size=51,
    pretrained_file=pretrained_file,
)


In [4]:
# Build the data object from the three CSV paths, the pretrained-vector name and config.
# NOTE(review): this rebinds the imported name `AttractiveData` to the object returned
# by calling it, so re-running this cell would call that object instead of the original
# import. Re-run the import cell (or restart the kernel) before executing it again.
AttractiveData = AttractiveData(train_file, val_file, test_file, pretrained_file, config)

## Train the model

In [5]:
# Extend the shared config with run metadata, model dimensions and optimisation
# settings. Vocabulary sizes and split lengths are read from the prepared data object.
config.update({
    'timestr': time.strftime("%Y%m%d-%H%M%S"),
    'save_name': 'CNN_LSTM',
    'input_dim': len(AttractiveData.TEXT.vocab),
    'embedding_dim': 300,
    'category_dim': len(AttractiveData.CATEGORIES_LABEL.vocab),
    'category_embedding_dim': 10,
    'hidden_dim': 30,
    'output_dim': 1,
    'log_steps': 10,
    'epochs': 150,
    # presumably one learning rate per parameter group of the model -- confirm in trainer
    'lr': {
        'encoder': 1e-5,
        'embedding': 6e-6,
        'linear': 1e-5
    },
    'num_layers': 1,
    'kernel_size': 3,
    'dropout': 0.5,
    'train_len': AttractiveData.train_len,
    'val_len': AttractiveData.val_len,
    'test_len': AttractiveData.test_len,
})

# Vectors attached to the text vocabulary; passed to the trainer in a later cell.
pretrained_embeddings = AttractiveData.TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([12699, 300])


In [6]:
# Build the trainer from the config, the device, the train/validation loaders and the
# pretrained embedding matrix.
# NOTE(review): this rebinds the imported name `AttractiveTrainer` to the object returned
# by calling it, so re-running this cell would call that object instead of the original
# import. Re-run the import cell (or restart the kernel) before executing it again.
AttractiveTrainer = AttractiveTrainer(config, AttractiveData.device, AttractiveData.trainloader, AttractiveData.valloader, pretrained_embeddings)

In [7]:
# Display the model's module tree together with the two parameter counts
# stored in the trainer's config ('total_params', 'total_learned_params').
AttractiveTrainer.model, AttractiveTrainer.config['total_params'], AttractiveTrainer.config['total_learned_params']

(AttractiveNet(
   (embedding): AttractiveEmbedding(
     (token): TokenEmbedding(12699, 300, padding_idx=1)
   )
   (unigramcnn): Sequential(
     (0): Conv1d(300, 210, kernel_size=(1,), stride=(1,))
     (1): ReLU()
     (2): Conv1d(210, 100, kernel_size=(1,), stride=(1,))
     (3): ReLU()
     (4): Dropout(p=0.5, inplace=False)
   )
   (bigramcnn): Sequential(
     (0): Conv1d(300, 210, kernel_size=(2,), stride=(1,), padding=(1,))
     (1): ReLU()
     (2): Conv1d(210, 100, kernel_size=(2,), stride=(1,), padding=(1,))
     (3): ReLU()
     (4): Dropout(p=0.5, inplace=False)
   )
   (trigramcnn): Sequential(
     (0): Conv1d(300, 210, kernel_size=(3,), stride=(1,), padding=(1,))
     (1): ReLU()
     (2): Conv1d(210, 100, kernel_size=(3,), stride=(1,), padding=(1,))
     (3): ReLU()
     (4): Dropout(p=0.5, inplace=False)
   )
   (encoder_unigram): LSTM(100, 30, batch_first=True, dropout=0.5, bidirectional=True)
   (encoder_bigram): LSTM(100, 30, batch_first=True, dropout=0.5, bidire

In [8]:
# Start training via the trainer's train() method (implementation not shown in this
# notebook); the captured output below lists a train and validation loss per epoch.
AttractiveTrainer.train()

Epoch:   1%|          | 1/150 [00:00<01:39,  1.50it/s]
EP_0 | train loss: 3.075334851882037 | val loss: 3.3241898031795727 |
Epoch:   1%|▏         | 2/150 [00:01<01:36,  1.53it/s]
EP_1 | train loss: 0.6115363794214586 | val loss: 0.6627444098977482 |
Epoch:   2%|▏         | 3/150 [00:01<01:35,  1.54it/s]
EP_2 | train loss: 0.5654434232150807 | val loss: 0.6663021947823319 |
Epoch:   3%|▎         | 4/150 [00:02<01:33,  1.56it/s]
EP_3 | train loss: 0.5545006172329772 | val loss: 0.6798208835078221 |
Epoch:   3%|▎         | 5/150 [00:03<01:32,  1.56it/s]
EP_4 | train loss: 0.5539331651201436 | val loss: 0.6613368427052218 |
Epoch:   4%|▍         | 6/150 [00:03<01:31,  1.57it/s]
EP_5 | train loss: 0.5480331785538617 | val loss: 0.6457584325005027 |
Epoch:   5%|▍         | 7/150 [00:04<01:30,  1.58it/s]
EP_6 | train loss: 0.5460569952048507 | val loss: 0.6417629765529259 |
Epoch:   5%|▌         | 8/150 [00:05<01:32,  1.54it/s]
EP_7 | train loss: 0.5489611176883473 | val loss: 0.630022862378

## Testing: load a saved checkpoint and predict on the test set

In [6]:
from attractivenet import AttractiveNet

# Checkpoint to evaluate; hard-coded to the output of one specific training run.
PATH = './model/CNN_LSTM_20201110-204941/0.376489.68'

# NOTE(review): `config` must already hold the model hyper-parameters added in the
# training-configuration cell (input_dim, embedding_dim, ...) before this cell runs.
load_model = AttractiveNet(config).to(AttractiveData.device)
# map_location remaps the stored tensors onto the device in use, so a checkpoint
# written on a GPU can still be loaded on a CPU-only (or different-GPU) machine.
load_model.load_state_dict(torch.load(PATH, map_location=AttractiveData.device))
# Switch to evaluation mode (e.g. Dropout becomes a no-op). eval() returns the
# module, so as the last expression it also displays the architecture.
load_model.eval()

AttractiveNet(
  (embedding): AttractiveEmbedding(
    (token): TokenEmbedding(12699, 300, padding_idx=1)
  )
  (unigramcnn): Sequential(
    (0): Conv1d(300, 210, kernel_size=(1,), stride=(1,))
    (1): ReLU()
    (2): Conv1d(210, 100, kernel_size=(1,), stride=(1,))
    (3): ReLU()
    (4): Dropout(p=0.5, inplace=False)
  )
  (bigramcnn): Sequential(
    (0): Conv1d(300, 210, kernel_size=(2,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): Conv1d(210, 100, kernel_size=(2,), stride=(1,), padding=(1,))
    (3): ReLU()
    (4): Dropout(p=0.5, inplace=False)
  )
  (trigramcnn): Sequential(
    (0): Conv1d(300, 210, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): Conv1d(210, 100, kernel_size=(3,), stride=(1,), padding=(1,))
    (3): ReLU()
    (4): Dropout(p=0.5, inplace=False)
  )
  (encoder_unigram): LSTM(100, 30, batch_first=True, dropout=0.5, bidirectional=True)
  (encoder_bigram): LSTM(100, 30, batch_first=True, dropout=0.5, bidirectional=True)
  (encoder_tr

In [7]:
def predict_attractive(sentence, category, phase):
    """Run the loaded model on a single headline and return its raw output.

    Relies on the notebook-level globals ``AttractiveData`` (vocabularies and
    device) and ``load_model`` (the network to evaluate).

    Args:
        sentence: iterable of tokens; each one is looked up in the text
            vocabulary (assumes the headline is already tokenized -- TODO confirm).
        category: category label, looked up in the category vocabulary.
        phase: forwarded unchanged to the model as its ``phase`` keyword.

    Returns:
        Whatever ``load_model`` returns for a batch containing this one example.
    """
    device = AttractiveData.device
    token_to_index = AttractiveData.TEXT.vocab.stoi
    category_to_index = AttractiveData.CATEGORIES_LABEL.vocab.stoi

    token_ids = [token_to_index[token] for token in sentence]
    category_ids = [category_to_index[category]]

    # Add a leading batch dimension of size 1 to the token ids only; the
    # category tensor keeps its single-element 1-D shape.
    sentence_batch = torch.LongTensor(token_ids).to(device).unsqueeze(0)
    category_batch = torch.LongTensor(category_ids).to(device)

    return load_model(sentence_batch, category_batch, phase=phase)

In [8]:
# Score every test example with autograd disabled, collecting plain Python floats.
with torch.no_grad():
    predict_list = [
        predict_attractive(example.Headline, example.Category, 'test').item()
        for example in AttractiveData.test_data
    ]

# Attach the scores to the test frame and write the ID/Label pairs to CSV.
# NOTE(review): assumes the rows of df_test are in the same order as test_data -- confirm.
AttractiveData.df_test['Label'] = predict_list
submission_path = config['save_name'] + '.csv'
AttractiveData.df_test[['ID', 'Label']].to_csv(submission_path, index=False)

## Prediction statistics and comparison with the baseline

In [9]:
import statistics
from sklearn.metrics import mean_squared_error

In [10]:
# # train mean = 3.15, test mean = 2.8
# train_list = []
# for i, sentence in enumerate(AttractiveData.train_data):
#     prediction = predict_attractive(sentence.Headline, sentence.Category, 'train')
#     train_list.append(prediction.item())
#     # train_list.append(prediction.item())
# # print(train_list)
# mean_squared_error(pd.read_csv('data/train.csv').sort_values(['ID']).Label.to_list(), train_list), statistics.mean(train_list), statistics.stdev(train_list)

In [11]:
# train_list[0:5], pd.read_csv('data/train.csv').sort_values(['ID']).Label.to_list()[0:5]

In [12]:
# a = AttractiveData.df_train['Label'].to_list()
# statistics.mean(a), statistics.stdev(a)

In [13]:
# Mean and sample standard deviation of the test-set predictions.
statistics.mean(predict_list), statistics.stdev(predict_list)

(2.8394153664290642, 0.3328727130066473)

In [14]:
# Baseline labels, ordered by ID.
baseline_sorted = pd.read_csv('baseline.csv').sort_values(['ID'])
baseline_list = baseline_sorted.Label.to_list()

# MSE between baseline and current predictions, then the baseline's mean / sample stdev.
# NOTE(review): predict_list is in test_data order; this comparison is only
# meaningful if that order is also ascending ID -- confirm.
(
    mean_squared_error(baseline_list, predict_list),
    statistics.mean(baseline_list),
    statistics.stdev(baseline_list),
)

(0.009655923729345614, 2.8142020345259344, 0.36798823904910916)

In [17]:
# Compare the baseline file with an earlier saved submission, both ordered by ID.
# The captured result of 0.0 below means the two files hold identical labels.
baseline_labels = pd.read_csv('baseline.csv').sort_values(['ID']).Label.to_list()
previous_labels = pd.read_csv('../309551062/predict/CNN_LSTM_20201109-125007_0.374958.75.csv').sort_values(['ID']).Label.to_list()
mean_squared_error(baseline_labels, previous_labels)

0.0