In [1]:
# common packages
import pandas as pd
import time

# # DL framework
import torch
from torchtext import data

from attractivedata import AttractiveData
from trainer import AttractiveTrainer

In [2]:
# import spacy
# nlp = spacy.load('en_core_web_lg')
# def tokenizer(corpus):
#     return [str(token) for token in nlp(corpus)]
# a = '"River walk that led me to my secret family: after being adopted as a child, Katharine Norbury reveals the emotional journey to reconnect with her biological mother"'.replace('"', '')
# tokenizer(a)

## Load and prepare data

In [3]:
# Data-loading configuration; all six values are passed to AttractiveData(...) in the next cell.
pretrained_file = './pretrained_embedding/glove.840B.300d.txt'  # pretrained embedding file (GloVe 840B / 300-d, per the filename)
train_file = 'data/train.csv'
test_file = 'data/test.csv'
max_size = 256  # NOTE(review): the sampled training batch is 256x64, so this looks like the padded sequence length -- confirm in attractivedata
min_freq = 2  # presumably the minimum token frequency for vocabulary inclusion -- confirm in attractivedata
batch_size = 64  # consistent with the batch of size 64 shown when sampling the training loader

In [4]:
# Instantiate the project's data wrapper with the paths and settings defined above.
# NOTE(review): this rebinds the imported class name `AttractiveData` to an
# instance, so re-running this cell calls the instance instead of the class
# (presumably a TypeError) until the import cell is re-run. Later cells read
# attributes through this name, so renaming it (e.g. `attractive_data`) has to
# be done notebook-wide.
AttractiveData = AttractiveData(train_file, test_file, pretrained_file, max_size, min_freq, batch_size)

In [5]:
# Display the `freqs` counter of the LABEL vocabulary, i.e. how often each label value occurs.
AttractiveData.LABEL.vocab.freqs

Counter({'4.0': 226,
         '2.3333333333333335': 194,
         '4.5': 43,
         '3.3333333333333335': 313,
         '3.6666666666666665': 260,
         '2.6666666666666665': 281,
         '2.0': 135,
         '2.5': 36,
         '1.6666666666666667': 28,
         '3.0': 354,
         '4.333333333333333': 82,
         '4.666666666666667': 29,
         '1.5': 16,
         '3.5': 22,
         '1.3333333333333333': 4,
         '1.0': 5,
         '5.0': 12})

In [6]:
# Pull the first batch from the training loader and display it as a sanity
# check of field names, tensor sizes and device placement.
batch = next(iter(AttractiveData.trainloader))
batch


[torchtext.data.batch.Batch of size 64]
	[.Headline]:[torch.cuda.LongTensor of size 256x64 (GPU 0)]
	[.Label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]

## Train the model

In [7]:
# Training configuration consumed by the AttractiveTrainer(...) call below.
timestr = time.strftime("%Y%m%d-%H%M%S")  # timestamp string of the current local time, passed to the trainer
save_name = './model/AttractiveNet'
num_workers = 10  # NOTE(review): not passed to AttractiveTrainer anywhere in this notebook -- appears unused
input_dim = len(AttractiveData.TEXT.vocab)  # size of the TEXT vocabulary
embedding_dim = 300  # matches the second dimension of the pretrained vectors printed below
hidden_dim = 374
output_dim = 1
log_steps = 10
epochs = 100
lr = 1e-2
pretrained_embeddings = AttractiveData.TEXT.vocab.vectors
# Show (vocab_size, embedding_dim) of the pretrained embedding matrix.
print(pretrained_embeddings.shape)

torch.Size([4317, 300])


In [8]:
# max(AttractiveData.df_train.Headline.str.len()), max(AttractiveData.df_test.Headline.str.len())

In [9]:
# Instantiate the project's trainer with the run configuration, the device and
# data loaders taken from the data object, and the model dimensions/embeddings.
# NOTE(review): this rebinds the imported class name `AttractiveTrainer` to an
# instance, so the cell cannot be re-run without first re-running the import
# cell. Later cells use this name, so a rename must be applied notebook-wide.
AttractiveTrainer = AttractiveTrainer(save_name, log_steps, epochs, lr, timestr, AttractiveData.device, AttractiveData.trainloader, AttractiveData.testloader, input_dim, embedding_dim, hidden_dim, output_dim, pretrained_embeddings)

In [10]:
# Run the trainer's training loop.
# NOTE(review): in the recorded output avg_loss stays at roughly 9.7 for every
# logged epoch before the cell was interrupted manually -- no visible learning;
# worth checking the learning rate / loss setup in the trainer.
AttractiveTrainer.train()

EP:0 | lr: 0.01: 100%|| 32/32 [00:00<00:00, 58.97it/s]
EP: train | lr: 0.01: 100%|| 32/32 [00:00<00:00, 127.09it/s]
EP:1 | lr: 0.01:  22%|| 7/32 [00:00<00:00, 66.89it/s]
EP_train | avg_loss: 9.676699832081795 |
EP:1 | lr: 0.01: 100%|| 32/32 [00:00<00:00, 68.40it/s]
EP: train | lr: 0.01: 100%|| 32/32 [00:00<00:00, 136.99it/s]
EP:2 | lr: 0.01:  22%|| 7/32 [00:00<00:00, 63.87it/s]
EP_train | avg_loss: 9.72986763715744 |
EP:2 | lr: 0.01: 100%|| 32/32 [00:00<00:00, 66.28it/s]
EP: train | lr: 0.01: 100%|| 32/32 [00:00<00:00, 139.10it/s]
EP:3 | lr: 0.01:  22%|| 7/32 [00:00<00:00, 64.69it/s]
EP_train | avg_loss: 9.688860148191452 |
EP:3 | lr: 0.01: 100%|| 32/32 [00:00<00:00, 65.94it/s]
EP: train | lr: 0.01: 100%|| 32/32 [00:00<00:00, 138.92it/s]
EP:4 | lr: 0.01:  22%|| 7/32 [00:00<00:00, 67.05it/s]
EP_train | avg_loss: 9.678667455911636 |
EP:4 | lr: 0.01: 100%|| 32/32 [00:00<00:00, 67.80it/s]
EP: train | lr: 0.01: 100%|| 32/32 [00:00<00:00, 137.02it/s]
EP:5 | lr: 0.01:  22%|| 7/32 [00:00<00:00

KeyboardInterrupt: 

In [25]:
def predict_attractive(sentence):
    """Score a single headline with the trained model.

    Args:
        sentence: Raw headline text to score.

    Returns:
        float: The scalar read from ``prediction[0][0][0]`` of the model output.

    Notes:
        The forward pass runs with the model in eval mode and with gradient
        tracking disabled, so train-only layers (e.g. dropout) do not perturb
        the score and no autograd graph is built. The model's previous
        train/eval mode is restored afterwards, so calling this between
        training runs does not change training behaviour.
    """
    tokens = AttractiveData.tokenizer(sentence)
    # Map each token to its index in the TEXT vocabulary.
    indexed = [AttractiveData.TEXT.vocab.stoi[t] for t in tokens]
    tensor = torch.LongTensor(indexed).to(AttractiveData.device)

    # Add a batch dimension of size 1: (seq_len,) -> (seq_len, 1), matching the
    # sequence-first layout of the training batches.
    tensor = tensor.unsqueeze(1)

    model = AttractiveTrainer.model
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = model(tensor)
    finally:
        # Put the model back into whatever mode it was in before the call.
        model.train(was_training)

    return prediction[0][0][0].item()

In [26]:
# Score one example headline with predict_attractive and display the result.
# NOTE(review): the recorded score (~0.82) lies below every label value listed
# in the label-frequency output (1.0 to 5.0), which suggests the model had not
# learned a useful mapping at the time of this run.
a = "Traditional Bombay-style Cafe beats Heston Blumenthal's Michelin-starred restaurant to be crowned the best in the Uk"
prediction = predict_attractive(a)
prediction

0.8195406794548035

In [12]:
# Run the trainer's evaluate() on its test loader; the result is unpacked as (pred, true).
# NOTE(review): the recorded run fails with "'Example' object has no attribute
# 'Text'", while the training batch shown earlier exposes `.Headline` and
# `.Label`. The field name used for the test set probably does not match --
# check attractivedata / trainer.
pred, true = AttractiveTrainer.evaluate(AttractiveTrainer.test_loader, 'test')

EP: test | lr: 0.01:   0%|| 0/4 [00:00<?, ?it/s]


AttributeError: 'Example' object has no attribute 'Text'

## Testing (scratch cells)

In [10]:
# Collect the 'ID' column of the test frame as a plain list.
# NOTE(review): `df_test` is never defined in the visible notebook (only
# `AttractiveData.df_test` is mentioned, in a commented-out cell), so this cell
# depends on hidden kernel state and will fail on Restart & Run All.
a = df_test['ID'].to_list()

In [17]:
from sklearn.metrics import mean_squared_error

In [9]:
# NOTE(review): `b` is not defined anywhere in the visible notebook, and the
# only visible assignment of `a` near here holds test IDs rather than labels.
# This cell relies on hidden kernel state (execution counts are out of order).
mean_squared_error(a, b)
# Author's baseline note: "all 3.0" gives MSE = 0.5545 (presumably a constant prediction of 3.0 -- confirm).

0.5545343137254902

In [20]:
# Store the values in `b` as the 'Label' column of the test frame (mutates df_test in place).
# NOTE(review): neither `df_test` nor `b` is defined in the visible notebook -- hidden kernel state.
df_test['Label'] = b

In [21]:
# Write only the ID and Label columns to 'all_3.csv', omitting the DataFrame index.
df_test[['ID', 'Label']].to_csv('all_3.csv', index=False)