In [1]:
# common packages
import pandas as pd
import time

# # DL framework
import torch
from torchtext import data

from attractivedata import AttractiveData
from trainer import AttractiveTrainer

## Load and prepare data

In [2]:
# Name of the pretrained word vectors and the locations of the two input CSV files.
pretrained_file = 'glove.6B.200d'
train_file, test_file = 'data/train.csv', 'data/test.csv'

# Settings dictionary passed to AttractiveData together with the values above;
# later cells add the model and training hyper-parameters to this same dict.
config = dict(
    max_size=64,
    min_freq=5,
    batch_size=64,
    pretrained_file=pretrained_file,
)


In [3]:
# Build the data object from the two CSV paths, the pretrained-vector name and the config dict.
# NOTE(review): this rebinds the name `AttractiveData` from the imported class to the new
# instance, so running this cell a second time calls the instance rather than the class.
# Re-run the import cell first if the data object has to be rebuilt.
AttractiveData = AttractiveData(train_file, test_file, pretrained_file, config)

In [4]:
# Print the fields of the example at index 3 from both the training set and the test set.
# NOTE(review): the loop walks the whole of test_data only to act on i == 3, and it
# leaves `i` and `sentence` bound in the notebook namespace once it finishes.
for i, sentence in enumerate(AttractiveData.test_data):
    if i == 3:
        print(vars(AttractiveData.train_data[i]), vars(sentence))

{'Headline': ['Sorry', ',', 'i', 'spent', 'it', 'on', 'myself', '!', 'Harvey', 'Nichols', "'", 'hilarious', 'Christmas', 'advert', 'sees', 'people', 'treating', 'themselves', 'instead', 'of', 'others'], 'Category': 'femail', 'Label': '3.333333333333333'} {'Headline': ['Three', 'police', 'officers', 'accused', 'of', 'stealing', '?', '?', '30k', 'during', 'raid', 'on', 'criminal'], 'Category': 'news'}


In [5]:
# Number of entries in the frequency table of the category field's vocabulary
# (presumably one entry per distinct category seen while building the vocab — confirm).
len(AttractiveData.CATEGORIES_LABEL.vocab.freqs)

18

In [6]:
# Length, in tokens, of the longest headline in the test set.
# The dataset is iterated directly instead of using `range(i)` with an `i` left over
# from a previous cell: that form depended on hidden notebook state and stopped one
# short, so the final example was never measured. `default=0` keeps the result at 0
# when the test set is empty.
max_len = max((len(example.Headline) for example in AttractiveData.test_data), default=0)
max_len

31

## Train the model

In [7]:
num_workers = 10

# Pretrained vectors attached to the text vocabulary. Fetched first so that the
# embedding width below can be read from them.
pretrained_embeddings = AttractiveData.TEXT.vocab.vectors

# Run identification: timestamp of this configuration and the name used for saved outputs.
config['timestr'] = time.strftime("%Y%m%d-%H%M%S")
config['save_name'] = 'LSTM'

# Vocabulary-derived sizes.
config['input_dim'] = len(AttractiveData.TEXT.vocab)
# Read the embedding width from the loaded vectors rather than hard-coding it: the
# previous literal 300 disagreed with the 200-dimensional vectors named by
# `pretrained_file`, and a mismatch here surfaces later as state_dict size errors.
config['embedding_dim'] = pretrained_embeddings.shape[1]
config['category_dim'] = len(AttractiveData.CATEGORIES_LABEL.vocab)
config['category_embedding_dim'] = 16

# Network shape and training-loop settings.
config['hidden_dim'] = 64
config['output_dim'] = 1
config['log_steps'] = 10
config['epochs'] = 100
# Separate learning rates keyed by model part.
config['lr'] = {
    'encoder': 1e-4,
    'embedding': 1e-5,
    'linear': 1e-4
}
config['num_layers'] = 2
config['nhead'] = 4
config['dropout'] = 0.1

print(pretrained_embeddings.shape)

torch.Size([1518, 200])


In [8]:
# max(AttractiveData.df_train.Headline.str.len()), max(AttractiveData.df_test.Headline.str.len())

In [9]:
# Build the trainer from the config, the data object's device and train loader, and the
# pretrained embedding matrix.
# NOTE(review): this rebinds the name `AttractiveTrainer` from the imported class to the
# new instance, so running this cell a second time calls the instance rather than the class.
AttractiveTrainer = AttractiveTrainer(config, AttractiveData.device, AttractiveData.trainloader, pretrained_embeddings)

In [10]:
# Display the trainer's model together with the two parameter counts stored in its config.
AttractiveTrainer.model, AttractiveTrainer.config['total_params'], AttractiveTrainer.config['total_learned_params']

(AttractiveNet(
   (embedding): AttractiveEmbedding(
     (token): TokenEmbedding(1518, 200, padding_idx=1)
     (position): PositionalEmbedding()
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (category_embedding): CategoryEmbedding(18, 16, padding_idx=0)
   (encoder): LSTM(200, 64, num_layers=2, dropout=0.1, bidirectional=True)
   (linear_output): Linear(in_features=144, out_features=1, bias=True)
 ),
 539553,
 235953)

In [11]:
# Start training through the trainer's train() method; progress is logged per epoch.
AttractiveTrainer.train()

Epoch:   1%|          | 1/100 [00:00<00:48,  2.02it/s]
EP_train | avg_loss: 6.642789602279663 |
Epoch:   2%|▏         | 2/100 [00:00<00:47,  2.06it/s]
EP_train | avg_loss: 1.399132400751114 |
Epoch:   3%|▎         | 3/100 [00:01<00:46,  2.09it/s]
EP_train | avg_loss: 0.5566003285348415 |
Epoch:   4%|▍         | 4/100 [00:01<00:45,  2.11it/s]
EP_train | avg_loss: 0.5459084836766124 |
Epoch:   5%|▌         | 5/100 [00:02<00:44,  2.11it/s]
EP_train | avg_loss: 0.54395058657974 |
Epoch:   6%|▌         | 6/100 [00:02<00:44,  2.09it/s]
EP_train | avg_loss: 0.5424360195174813 |
Epoch:   7%|▋         | 7/100 [00:03<00:45,  2.06it/s]
EP_train | avg_loss: 0.5435706898570061 |
Epoch:   8%|▊         | 8/100 [00:03<00:45,  2.03it/s]
EP_train | avg_loss: 0.5438382206484675 |
Epoch:   9%|▉         | 9/100 [00:04<00:44,  2.06it/s]
EP_train | avg_loss: 0.5446662735193968 |
Epoch:  10%|█         | 10/100 [00:04<00:43,  2.09it/s]
EP_train | avg_loss: 0.5431200014427304 |
Epoch:  11%|█         | 11/100 [0

## Classification variant (did not give better results)

In [14]:
# from sklearn.metrics import mean_squared_error
# a = AttractiveTrainer.train_predict
# AttractiveData.LABEL.vocab.itos[int(a[0])], AttractiveTrainer.train_true[0]
# correct = 0
# pred_list = []
# true_list = []
# for i in range(len(a)):
#     pred = AttractiveData.LABEL.vocab.itos[int(a[i])]
#     pred_list.append(float(pred))
#     true = AttractiveData.LABEL.vocab.itos[int(AttractiveTrainer.train_true[i])]
#     true_list.append(float(true))
# mean_squared_error(true_list, pred_list)
# # true_list

0.5601443355119825

## Inference on the test set

In [9]:
from transformermodel import TransformerModel
from attractivenet import AttractiveNet

# Checkpoint to evaluate. Its tensor shapes must match the model built from `config`
# (for example the same embedding width); otherwise load_state_dict raises a
# size-mismatch RuntimeError.
PATH = './model/LSTM_20201031-114350_0.5422.100'
# load_model = TransformerModel(config).to(AttractiveData.device)
load_model = AttractiveNet(config).to(AttractiveData.device)
# map_location remaps the stored tensors onto the device in use, so a checkpoint that
# was written on a GPU machine can still be loaded when only a CPU is available.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
load_model.load_state_dict(torch.load(PATH, map_location=AttractiveData.device))
# Switch to evaluation mode before running inference.
load_model.eval()

RuntimeError: Error(s) in loading state_dict for AttractiveNet:
	size mismatch for embedding.token.weight: copying a param with shape torch.Size([1518, 300]) from checkpoint, the shape in current model is torch.Size([1518, 200]).
	size mismatch for embedding.position.pe: copying a param with shape torch.Size([1, 128, 300]) from checkpoint, the shape in current model is torch.Size([1, 128, 200]).
	size mismatch for encoder.weight_ih_l0: copying a param with shape torch.Size([256, 300]) from checkpoint, the shape in current model is torch.Size([256, 200]).
	size mismatch for encoder.weight_ih_l0_reverse: copying a param with shape torch.Size([256, 300]) from checkpoint, the shape in current model is torch.Size([256, 200]).

In [21]:
def predict_attractive(sentence, category):
    """Score a single headline with the globally loaded model.

    Relies on the notebook globals ``AttractiveData`` (vocabularies and device)
    and ``load_model`` (the network to call).

    Args:
        sentence: iterable of tokens; each one is looked up in the text vocabulary.
        category: category key, looked up in the category vocabulary.

    Returns:
        Whatever ``load_model`` returns for this one example. The call is made
        with gradient tracking disabled, so the result carries no autograd graph.
    """
    device = AttractiveData.device
    token_ids = [AttractiveData.TEXT.vocab.stoi[token] for token in sentence]
    category_id = AttractiveData.CATEGORIES_LABEL.vocab.stoi[category]

    # Token ids become a column of shape (len(sentence), 1)
    # (presumably sequence-first with a batch of one — confirm against the model).
    tensor_sentence = torch.LongTensor(token_ids).to(device).unsqueeze(1)
    tensor_category = torch.LongTensor([category_id]).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = load_model(tensor_sentence, tensor_category)

    return prediction

In [14]:
# train mean = 3.2, test mean = 2.8
predict_list = []
for i, sentence in enumerate(AttractiveData.test_data):
    prediction = predict_attractive(sentence.Headline, sentence.Category)
    # predict_list.append(prediction.item() - 3.2 + 2.8)
    predict_list.append(prediction.item())
AttractiveData.df_test['Label'] = predict_list
AttractiveData.df_test[['ID', 'Label']].to_csv(config['save_name'] + '.csv', index=False)

In [15]:
# train_category = list(AttractiveData.CATEGORIES_LABEL.vocab.freqs)
# test_category = list(AttractiveData.df_test['Category'].value_counts().keys())
# for each_test in test_category:
#     if each_test not in train_category:
#         print(each_test)
# print()
# for each_train in train_category:
#     if each_train not in test_category:
#         print(each_train)

## Just for fun: sanity-checking the predictions against a constant guess

In [16]:
import statistics
from sklearn.metrics import mean_squared_error

In [17]:
# Mean and sample standard deviation of the training labels.
a = AttractiveData.df_train['Label'].to_list()
statistics.mean(a), statistics.stdev(a)

(3.150408496732026, 0.729501519321601)

In [18]:
# Mean and sample standard deviation of the model's test-set predictions.
statistics.mean(predict_list), statistics.stdev(predict_list)

(2.7234197466383945, 0.15732692202737075)

In [19]:
# Mean squared error between the predictions and a constant 2.8 for every test example.
all_28 = [2.8 for _ in predict_list]
mean_squared_error(y_true=all_28, y_pred=predict_list)

0.030507257007487312

In [14]:
# LSTM my best
# statistics.mean(predict_list), statistics.stdev(predict_list)

(2.8167915543795683, 0.14611407210842048)