In [1]:
# common packages
import pandas as pd
import time

# # DL framework
import torch
from torchtext import data

from attractivedata import AttractiveData
from trainer import AttractiveTrainer

In [2]:
# import spacy
# nlp = spacy.load('en_core_web_lg')
# def tokenizer(corpus):
#     return [str(token) for token in nlp(corpus)]
# a = '"River walk that led me to my secret family: after being adopted as a child, Katharine Norbury reveals the emotional journey to reconnect with her biological mother"'.replace('"', '')
# tokenizer(a)

## Load and prepare data

In [3]:
pretrained_file = './pretrained_embedding/glove.840B.300d.txt'
train_file = 'data/train.csv'
test_file = 'data/test.csv'
max_size = 128
min_freq = 10
batch_size = 64

In [4]:
AttractiveData = AttractiveData(train_file, test_file, pretrained_file, max_size, min_freq, batch_size)

In [5]:
# AttractiveData.LABEL.vocab.freqs

In [6]:
for i, sentence in enumerate(AttractiveData.test_data):
    if i == 3:
        print(vars(AttractiveData.train_data[i]), vars(sentence))

{'Headline': ['Sorry', ',', 'i', 'spent', 'it', 'on', 'myself', '!', 'Harvey', 'Nichols', "'", 'hilarious', 'Christmas', 'advert', 'sees', 'people', 'treating', 'themselves', 'instead', 'of', 'others'], 'Label': '3.3333333333333335'} {'Headline': ['Three', 'police', 'officers', 'accused', 'of', 'stealing', '?', '?', '30k', 'during', 'raid', 'on', 'criminal'], 'Label': ''}


## Start to train

In [7]:
timestr = time.strftime("%Y%m%d-%H%M%S")
save_name = './model/AttractiveNet'
num_workers = 10
input_dim = len(AttractiveData.TEXT.vocab)
embedding_dim = 300
hidden_dim = 256
output_dim = 1
log_steps = 10
epochs = 30
lr = 1e-3
num_layers = 2
nhead = 4
dropout = 0.1
pretrained_embeddings = AttractiveData.TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([582, 300])


In [8]:
# max(AttractiveData.df_train.Headline.str.len()), max(AttractiveData.df_test.Headline.str.len())

In [9]:
AttractiveTrainer = AttractiveTrainer(save_name, log_steps, epochs, lr, timestr, AttractiveData.device, AttractiveData.trainloader, AttractiveData.testloader, input_dim, embedding_dim, hidden_dim, output_dim, pretrained_embeddings, dropout, num_layers, nhead)

In [10]:
AttractiveTrainer.model

TransformerModel(
  (embedding): AttractiveEmbedding(
    (token): TokenEmbedding(582, 300, padding_idx=0)
    (position): PositionalEmbedding()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
        )
        (linear1): Linear(in_features=300, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=300, bias=True)
        (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_fea

In [11]:
AttractiveTrainer.train()

EP:0 | lr: 0.001: 100%|| 32/32 [00:00<00:00, 45.20it/s]
EP: train | lr: 0.001: 100%|| 32/32 [00:00<00:00, 116.74it/s]
EP:1 | lr: 0.001:  16%|| 5/32 [00:00<00:00, 46.87it/s]
EP_train | avg_loss: 9.886443376541138 |
EP:1 | lr: 0.001: 100%|| 32/32 [00:00<00:00, 47.61it/s]
EP: train | lr: 0.001: 100%|| 32/32 [00:00<00:00, 118.23it/s]
EP:2 | lr: 0.001:  16%|| 5/32 [00:00<00:00, 47.63it/s]
EP_train | avg_loss: 9.814950540661812 |
EP:2 | lr: 0.001: 100%|| 32/32 [00:00<00:00, 47.22it/s]
EP: train | lr: 0.001: 100%|| 32/32 [00:00<00:00, 118.67it/s]
EP:3 | lr: 0.001:  16%|| 5/32 [00:00<00:00, 47.54it/s]
EP_train | avg_loss: 9.696655988693237 |
EP:3 | lr: 0.001: 100%|| 32/32 [00:00<00:00, 47.75it/s]
EP: train | lr: 0.001: 100%|| 32/32 [00:00<00:00, 115.81it/s]
EP:4 | lr: 0.001:  16%|| 5/32 [00:00<00:00, 47.03it/s]
EP_train | avg_loss: 9.788543984293938 |
EP:4 | lr: 0.001: 100%|| 32/32 [00:00<00:00, 47.62it/s]
EP: train | lr: 0.001: 100%|| 32/32 [00:00<00:00, 119.90it/s]
EP:5 | lr: 0.001:  16%|| 5

In [11]:
AttractiveTrainer.train()

EP:0 | lr: 0.001:   0%|| 0/32 [00:00<?, ?it/s]tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  0,   0, 288,  ..., 526,   0,   0],
        [849,  15, 458,  ...,   0,   8, 300],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]], device='cuda:0')
tensor([ 0.,  2., 13.,  7.,  4.,  3.,  2.,  1.,  1.,  2.,  5., 11.,  0.,  1.,
         6.,  0.,  1.,  6.,  4.,  3.,  1.,  0.,  3.,  3.,  2.,  6.,  0.,  2.,
         9.,  4., 12.,  7.,  6.,  1.,  1.,  2.,  3.,  1.,  3.,  5., 11.,  5.,
         2.,  9., 11.,  0.,  1.,  6.,  0.,  0.,  8.,  1.,  5.,  0.,  9.,  4.,
        13.,  3.,  2.,  2.,  1.,  3.,  0.,  6.], device='cuda:0')
tensor([[[ 6.0121e-01],
         [ 3.5993e-01],
         [ 2.6935e-01],
         ...,
         [ 9.3061e-01],
         [ 6.7218e-01],
         [ 5.9746e-01]],

        [[ 5.3440e-01],
         [ 5.7120e-01],
         [ 9.3283e-01],
         ...,
         [ 2.4973e-01],

ZeroDivisionError: division by zero

In [11]:
AttractiveTrainer.train()

EP:0 | lr: 0.001:   0%|| 0/32 [00:00<?, ?it/s]tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [ 48, 241,   0,  ...,   0, 805,   0],
        [211,   3,   0,  ...,   0,  13, 303],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]], device='cuda:0')
tensor([ 4.,  1.,  2.,  2.,  0., 13.,  3.,  9.,  1.,  1.,  3.,  3.,  2.,  1.,
         2.,  2.,  0.,  8.,  1.,  9.,  0.,  1.,  0.,  3.,  6., 10.,  2.,  7.,
         4.,  1.,  2.,  2.,  6.,  6.,  2., 14.,  2.,  3.,  7.,  0.,  4.,  3.,
         1., 13.,  0.,  0.,  1.,  1.,  1.,  0.,  4.,  2.,  5.,  1.,  3.,  5.,
         0.,  2.,  0.,  8.,  1.,  0.,  9.,  1.], device='cuda:0')
tensor([[-0.2539],
        [-0.8596],
        [-1.1908],
        [-1.5457],
        [-0.2728],
        [-0.9879],
        [-0.8742],
        [-1.1552],
        [-1.1962],
        [-1.4770],
        [-0.5963],
        [-1.6508],
        [-0.5904],
        [-1.2092],
   

ZeroDivisionError: division by zero

In [12]:
def predict_attractive(sentence):
    tokens = AttractiveData.tokenizer(sentence)
    indexed = [AttractiveData.TEXT.vocab.stoi[t] for t in tokens]
    print(indexed)
    tensor = torch.LongTensor(indexed).to(AttractiveData.device)

    tensor = tensor.unsqueeze(1)
    print(tensor.shape)
    prediction = AttractiveTrainer.model(tensor)

    print(prediction.shape)
    
    return prediction

In [14]:
a = "Body of man, 34, is found at recycling plant after being delivered with rubbish"
prediction = predict_attractive(a)
prediction

[0, 7, 97, 12, 0, 12, 20, 139, 22, 0, 1085, 21, 84, 0, 16, 0]
torch.Size([16, 1])
torch.Size([1, 1])


tensor([[2.6075]], device='cuda:0', grad_fn=<AddmmBackward>)

In [12]:
pred, true = AttractiveTrainer.evaluate(AttractiveTrainer.test_loader, 'test')

EP: test | lr: 0.01:   0%|| 0/4 [00:00<?, ?it/s]


AttributeError: 'Example' object has no attribute 'Text'

## Below is testing

In [17]:
a = AttractiveData.df_test['ID'].to_list()

In [17]:
from sklearn.metrics import mean_squared_error

In [9]:
mean_squared_error(a, b)
# Training all 3.0 got mse = 0.5545

0.5545343137254902

In [18]:
# only for fun
import random
guess_list = [2.3333333333333335, 3.3333333333333335, 3.6666666666666665, 2.6666666666666665]
b = []
for i in range(len(a)):
    b.append(random.choice(guess_list))

In [19]:
AttractiveData.df_test['Label'] = b

In [20]:
AttractiveData.df_test[['ID', 'Label']].to_csv('all_3.csv', index=False)