In [1]:
# common packages
import pandas as pd
import time

# # DL framework
import torch
from torchtext import data

from attractivedata import AttractiveData
from trainer import AttractiveTrainer

## Load and prepare data

In [2]:
# Input files and the pretrained-embedding identifier handed to AttractiveData.
train_file = 'data/train.csv'
test_file = 'data/test.csv'
pretrained_file = 'glove.840B.300d'

# Data-side settings; model and training hyperparameters are added to this
# same dict further down, before the trainer is built.
config = dict(
    max_seq=40,
    min_freq=0,
    batch_size=64,
    pretrained_file=pretrained_file,
)


In [3]:
# This cell rebinds the name `AttractiveData` from the imported class to an
# instance (every later cell relies on that binding). Recover the class first so
# the cell can be re-run without "object is not callable" once the name already
# points at an instance.
_attractive_data_cls = AttractiveData if isinstance(AttractiveData, type) else type(AttractiveData)
AttractiveData = _attractive_data_cls(train_file, test_file, pretrained_file, config)

In [4]:
# for i, sentence in enumerate(AttractiveData.test_data):
#     if i == 3:
#         print(vars(AttractiveData.train_data[i]), vars(sentence))

In [5]:
# Number of entries in the CATEGORIES_LABEL vocabulary's frequency table.
len(AttractiveData.CATEGORIES_LABEL.vocab.freqs)

18

In [6]:
# Length of the longest Headline field in the training data (0 when it is empty).
train_examples = AttractiveData.train_data
max_len = max(
    (len(train_examples[idx].Headline) for idx in range(len(train_examples))),
    default=0,
)
max_len

38

## Train the model

In [7]:
# Extend the shared config with everything the model and trainer need.
# Vocabulary-dependent sizes are read from the fields built by AttractiveData.
config.update({
    # Run identity; 'save_name' is also the stem of the prediction CSV written later.
    'timestr': time.strftime("%Y%m%d-%H%M%S"),
    'save_name': 'CNN_LSTM',
    # Embedding sizes.
    'input_dim': len(AttractiveData.TEXT.vocab),
    'embedding_dim': 300,
    'category_dim': len(AttractiveData.CATEGORIES_LABEL.vocab),
    'category_embedding_dim': 5,
    # Network shape.
    'hidden_dim': 30,
    'output_dim': 1,
    # Training schedule.
    'log_steps': 10,
    'epochs': 150,
    # Learning rates keyed by the names 'encoder', 'embedding' and 'linear'.
    'lr': {
        'encoder': 1e-5,
        'embedding': 1e-5,
        'linear': 1e-5,
    },
    'num_layers': 1,
    'kernel_size': 3,
    'dropout': 0.1,
    # Dataset sizes as reported by AttractiveData.
    'train_len': AttractiveData.train_len,
    'val_len': AttractiveData.val_len,
    'test_len': AttractiveData.test_len,
})

# Vectors attached to the TEXT vocabulary; passed to the trainer in the next cell.
pretrained_embeddings = AttractiveData.TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([12699, 300])


In [8]:
# This cell rebinds the name `AttractiveTrainer` from the imported class to an
# instance (later cells call methods on that instance). Recover the class first
# so re-running the cell does not fail with "object is not callable".
_attractive_trainer_cls = AttractiveTrainer if isinstance(AttractiveTrainer, type) else type(AttractiveTrainer)
AttractiveTrainer = _attractive_trainer_cls(
    config,
    AttractiveData.device,
    AttractiveData.trainloader,
    AttractiveData.valloader,
    pretrained_embeddings,
)

In [9]:
# Display the model held by the trainer together with the two parameter counts
# stored in the trainer's config ('total_params', 'total_learned_params').
AttractiveTrainer.model, AttractiveTrainer.config['total_params'], AttractiveTrainer.config['total_learned_params']

(AttractiveNet(
   (embedding): AttractiveEmbedding(
     (token): TokenEmbedding(12699, 300, padding_idx=1)
   )
   (cnn1): Sequential(
     (0): Conv1d(300, 220, kernel_size=(3,), stride=(1,), padding=(1,))
     (1): ReLU()
   )
   (cnn2): Sequential(
     (0): Conv1d(220, 150, kernel_size=(3,), stride=(1,), padding=(1,))
     (1): ReLU()
   )
   (cnn3): Sequential(
     (0): Conv1d(150, 100, kernel_size=(3,), stride=(1,), padding=(1,))
     (1): ReLU()
   )
   (encoder): LSTM(100, 30, batch_first=True, dropout=0.1, bidirectional=True)
   (linear): Sequential(
     (0): Linear(in_features=120, out_features=30, bias=True)
     (1): ReLU()
     (2): Linear(in_features=30, out_features=1, bias=True)
   )
 ),
 4187511,
 377811)

In [10]:
# Run training; the captured output below shows a progress bar with the
# train/val loss logged per epoch.
AttractiveTrainer.train()

00<00:33,  4.39it/s]
EP_train | train loss: 0.5319905552209593 | val loss: 0.544463643840715 |
====
Epoch:   3%|▎         | 5/150 [00:01<00:33,  4.33it/s]
EP_train | train loss: 0.5318525258232566 | val loss: 0.5441018646838618 |
====
Epoch:   4%|▍         | 6/150 [00:01<00:33,  4.28it/s]
EP_train | train loss: 0.5317386000764136 | val loss: 0.5435458164589078 |
====
Epoch:   5%|▍         | 7/150 [00:01<00:32,  4.35it/s]
EP_train | train loss: 0.5317638518763523 | val loss: 0.5437382716758579 |
====
Epoch:   5%|▌         | 8/150 [00:01<00:33,  4.29it/s]
EP_train | train loss: 0.5317811891144397 | val loss: 0.5435022840312883 |
====
Epoch:   6%|▌         | 9/150 [00:02<00:32,  4.35it/s]
EP_train | train loss: 0.5317945751489378 | val loss: 0.5434601353664025 |
====
Epoch:   7%|▋         | 10/150 [00:02<00:31,  4.43it/s]
EP_train | train loss: 0.5317988143247716 | val loss: 0.5436081605799058 |
====
Epoch:   7%|▋         | 11/150 [00:02<00:30,  4.50it/s]
EP_train | train loss: 0.53164129

## Note: treating the task as classification did not give better results

## Testing: load a saved checkpoint and predict on the test set

In [11]:
from attractivenet import AttractiveNet

# Checkpoint file to evaluate (loaded below as a state_dict).
PATH = './model/CNN_LSTM_20201101-174320_0.3543.150'
# load_model = TransformerModel(config).to(AttractiveData.device)
load_model = AttractiveNet(config).to(AttractiveData.device)
# map_location remaps the stored tensors onto the device used in this session,
# so a checkpoint written on a GPU machine still loads on a CPU-only one.
load_model.load_state_dict(torch.load(PATH, map_location=AttractiveData.device))
# Switch to evaluation mode; as the last expression this also displays the model.
load_model.eval()

AttractiveNet(
  (embedding): AttractiveEmbedding(
    (token): TokenEmbedding(12699, 300, padding_idx=1)
  )
  (category_embedding): CategoryEmbedding(18, 10, padding_idx=0)
  (cnn1): Sequential(
    (0): Conv1d(300, 220, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
  )
  (cnn2): Sequential(
    (0): Conv1d(220, 150, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
  )
  (cnn3): Sequential(
    (0): Conv1d(150, 100, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
  )
  (encoder): LSTM(100, 30, batch_first=True, dropout=0.1, bidirectional=True)
  (linear): Sequential(
    (0): Linear(in_features=130, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=1, bias=True)
  )
)

In [12]:
def predict_attractive(sentence, category, phase):
    """Run the loaded model on a single headline.

    Args:
        sentence: iterable of tokens; each one is mapped to an index through
            the TEXT vocabulary.
        category: category value, mapped to an index through the
            CATEGORIES_LABEL vocabulary.
        phase: forwarded unchanged to the model as the ``phase`` keyword.

    Returns:
        Whatever ``load_model`` returns for a batch containing this one example.

    Notes:
        Earlier experiments (padding/truncating the tokens to
        ``config['max_seq']`` and snapping the fractional part of the
        prediction to one of 0, 1/3, 1/2, 2/3, 1) are intentionally not applied.
    """
    device = AttractiveData.device
    text_stoi = AttractiveData.TEXT.vocab.stoi
    category_stoi = AttractiveData.CATEGORIES_LABEL.vocab.stoi

    token_ids = torch.LongTensor([text_stoi[token] for token in sentence])
    category_ids = torch.LongTensor([category_stoi[category]])

    # Add a leading batch dimension of size 1 to the token indices.
    batch_tokens = token_ids.to(device).unsqueeze(0)
    batch_category = category_ids.to(device)

    return load_model(batch_tokens, batch_category, phase=phase)

In [13]:
# train mean = 3.15, test mean = 2.8
# Score every test example with gradient tracking disabled, then attach the
# scores to the test frame and write the ID/Label columns to '<save_name>.csv'.
with torch.no_grad():
    predict_list = [
        predict_attractive(example.Headline, example.Category, 'test').item()
        for example in AttractiveData.test_data
    ]
AttractiveData.df_test['Label'] = predict_list
submission_path = config['save_name'] + '.csv'
AttractiveData.df_test[['ID', 'Label']].to_csv(submission_path, index=False)

## Exploratory sanity checks (just for fun)

In [14]:
import statistics
from sklearn.metrics import mean_squared_error

In [15]:
# train mean = 3.15, test mean = 2.8
train_list = []
for i, sentence in enumerate(AttractiveData.train_data):
    prediction = predict_attractive(sentence.Headline, sentence.Category, 'train')
    train_list.append(prediction.item())
    # train_list.append(prediction.item())
# print(train_list)
mean_squared_error(pd.read_csv('data/train.csv').sort_values(['ID']).Label.to_list(), train_list), statistics.mean(train_list), statistics.stdev(train_list)

(0.4556302518346933, 3.1623973927077125, 0.2513995496475463)

In [16]:
# First five train predictions next to the first five labels of train.csv sorted by ID.
train_list[0:5], pd.read_csv('data/train.csv').sort_values(['ID']).Label.to_list()[0:5]

([3.209657669067383,
  3.208057403564453,
  3.2572128772735596,
  3.157924175262451,
  2.805985450744629],
 [4.0, 2.333333333333333, 4.5, 3.333333333333333, 4.0])

In [17]:
# Mean and sample standard deviation of the training labels.
train_labels = AttractiveData.df_train['Label'].to_list()
statistics.mean(train_labels), statistics.stdev(train_labels)

(3.150408496732026, 0.729501519321601)

In [18]:
# Mean and sample standard deviation of the test-set predictions.
statistics.mean(predict_list), statistics.stdev(predict_list)

(2.810208190380214, 0.23730375521165847)

In [19]:
# Labels from baseline.csv, ordered by ID, compared against this run's predictions.
# NOTE(review): predict_list follows test_data order; the MSE is only meaningful
# if that order matches ascending ID — confirm.
baseline_list = pd.read_csv('baseline.csv').sort_values(['ID']).Label.to_list()
mean_squared_error(baseline_list, predict_list), statistics.mean(baseline_list), statistics.stdev(baseline_list)

(0.07887563206102266, 2.7156126782757597, 0.29355123275379763)

In [14]:
# LSTM my best
# statistics.mean(predict_list), statistics.stdev(predict_list)

(2.8167915543795683, 0.14611407210842048)

In [21]:
# Compare the labels stored in LSTM_base.csv with the baseline labels.
# NOTE(review): baseline_list is sorted by ID while this file is read in its
# stored row order — confirm both are in the same order before trusting the MSE.
lstm_base_labels = pd.read_csv('LSTM_base.csv').Label.to_list()
mean_squared_error(baseline_list, lstm_base_labels), statistics.mean(lstm_base_labels), statistics.stdev(lstm_base_labels)

(0.1347375515605904, 2.8379913731293533, 0.1903582104725371)

In [14]:
# Token stored at index 2 of the TEXT vocabulary's index-to-string list.
AttractiveData.TEXT.vocab.itos[2]

'the'

In [26]:
# Frequency table of the LABEL vocabulary: how often each label string occurs.
AttractiveData.LABEL.vocab.freqs

Counter({'4.0': 226,
         '2.333333333333333': 194,
         '4.5': 43,
         '3.333333333333333': 313,
         '3.6666666666666665': 260,
         '2.6666666666666665': 281,
         '2.0': 135,
         '2.5': 36,
         '1.6666666666666667': 28,
         '3.0': 354,
         '4.333333333333333': 82,
         '4.666666666666667': 29,
         '1.5': 16,
         '3.5': 22,
         '1.3333333333333333': 4,
         '1.0': 5,
         '5.0': 12})

In [29]:
# Print 1/1, 1/2 and 1/3, one value per line.
# 0, 0.33, 0.5, 0.66, 1
for denominator in (1, 2, 3):
    reciprocal = 1 / denominator
    print(reciprocal)

1.0
0.5
0.3333333333333333


In [31]:
# Sanity check: `% 1` on a non-negative float below 1 returns the value unchanged.
0.3333333333333333 % 1

0.3333333333333333