In [1]:
# common packages
import pandas as pd
import time

# DL framework
import torch
from torchtext import data

from attractivedata import AttractiveData
from trainer import AttractiveTrainer

In [2]:
# Fix every torch random source to one seed so repeated runs are comparable.
SEED = 123
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)
# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

## Load and prepare data

In [3]:
# Input locations and the name of the pretrained embedding set.
train_file, test_file = 'data/train.csv', 'data/test.csv'
pretrained_file = 'glove.840B.300d'

# Data-preparation settings; the training cell adds model hyper-parameters to this dict later.
config = dict(
    max_seq=48,
    min_freq=0,
    batch_size=32,
    pretrained_file=pretrained_file,
)


In [4]:
# The rest of the notebook refers to the dataset *instance* as `AttractiveData`, which
# rebinds the imported name. Remember the original factory so that re-running this cell
# builds a fresh dataset instead of calling the previous instance.
# NOTE(review): giving the instance its own name (e.g. `attractive_data`) throughout the
# notebook would be cleaner; that requires touching every cell that uses it.
if isinstance(AttractiveData, type) or '_attractive_data_factory' not in globals():
    _attractive_data_factory = AttractiveData
AttractiveData = _attractive_data_factory(train_file, test_file, pretrained_file, config)

In [5]:
# Print the fields of one training example next to the test example at the same position.
SAMPLE_POSITION = 3
for position, test_example in enumerate(AttractiveData.test_data):
    if position != SAMPLE_POSITION:
        continue
    print(vars(AttractiveData.train_data[position]), vars(test_example))

{'Headline': ['sorry,', 'i', 'spent', 'it', 'on', 'myself!', 'harvey', "nichols'", 'hilarious', 'christmas', 'advert', 'sees', 'people', 'treating', 'themselves', 'instead', 'of', 'others'], 'Category': 'femail', 'Label': '0.18333333333333313'} {'Headline': ['three', 'police', 'officers', 'accused', 'of', 'stealing', '??', '30k', 'during', 'raid', 'on', 'criminal'], 'Category': 'news'}


In [6]:
# Number of distinct entries in the category vocabulary's frequency table.
category_freqs = AttractiveData.CATEGORIES_LABEL.vocab.freqs
len(category_freqs)

18

In [7]:
# Longest tokenised headline in the training set (0 when the set is empty);
# useful for sanity-checking config['max_seq'].
train_examples = AttractiveData.train_data
max_len = max((len(example.Headline) for example in train_examples), default=0)
max_len

38

## Training

In [8]:
# Extend the shared config with run metadata and model/optimiser hyper-parameters.
# Keys are added in the same order as before so the printed config reads identically.
config.update({
    'timestr': time.strftime("%Y%m%d-%H%M%S"),
    'save_name': 'LSTM',
    # Vocabulary sizes come from the fields built by the data loader.
    'input_dim': len(AttractiveData.TEXT.vocab),
    'embedding_dim': 300,
    'category_dim': len(AttractiveData.CATEGORIES_LABEL.vocab),
    'category_embedding_dim': 16,
    'hidden_dim': 30,
    'output_dim': 1,
    'log_steps': 10,
    'epochs': 200,
    # One learning rate per parameter group.
    'lr': {
        'encoder': 1e-5,
        'embedding': 1e-5,
        'linear': 1e-5,
    },
    'num_layers': 1,
    'kernel_size': 3,
    'dropout': 0.0,
})

# Pretrained vectors attached to the text vocabulary; handed to the trainer below.
pretrained_embeddings = AttractiveData.TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([12699, 300])


In [9]:
# Later cells refer to the trainer *instance* as `AttractiveTrainer`, which rebinds the
# imported name. Remember the original factory so that re-running this cell builds a
# fresh trainer instead of calling the previous instance.
# NOTE(review): a separate instance name (e.g. `trainer`) used throughout the notebook
# would be cleaner; that requires touching every cell that uses it.
if isinstance(AttractiveTrainer, type) or '_attractive_trainer_factory' not in globals():
    _attractive_trainer_factory = AttractiveTrainer
AttractiveTrainer = _attractive_trainer_factory(
    config, AttractiveData.device, AttractiveData.trainloader, pretrained_embeddings
)

In [10]:
# Display the model architecture with its total and learnable parameter counts.
trainer_model = AttractiveTrainer.model
trainer_config = AttractiveTrainer.config
(trainer_model, trainer_config['total_params'], trainer_config['total_learned_params'])

(AttractiveNet(
   (embedding): AttractiveEmbedding(
     (token): TokenEmbedding(12699, 300, padding_idx=1)
   )
   (encoder): LSTM(300, 30, bidirectional=True)
   (linear): Sequential(
     (0): Linear(in_features=120, out_features=30, bias=True)
     (1): ReLU()
     (2): Linear(in_features=30, out_features=1, bias=True)
   )
 ),
 3893041,
 83341)

In [11]:
# Echo the complete configuration for the record; in the recorded output it also
# carries the 'total_params' and 'total_learned_params' entries shown in the previous cell.
config

{'max_seq': 48,
 'min_freq': 0,
 'batch_size': 32,
 'pretrained_file': 'glove.840B.300d',
 'timestr': '20201101-010910',
 'save_name': 'LSTM',
 'input_dim': 12699,
 'embedding_dim': 300,
 'category_dim': 18,
 'category_embedding_dim': 16,
 'hidden_dim': 30,
 'output_dim': 1,
 'log_steps': 10,
 'epochs': 200,
 'lr': {'encoder': 1e-05, 'embedding': 1e-05, 'linear': 1e-05},
 'num_layers': 1,
 'kernel_size': 3,
 'dropout': 0.0,
 'total_params': 3893041,
 'total_learned_params': 83341}

In [12]:
# Run training; the recorded output shows a progress bar over config['epochs'] epochs
# with the average training loss printed after each one.
AttractiveTrainer.train()

Epoch:   0%|          | 1/200 [00:00<01:23,  2.39it/s]
EP_train | avg_loss: 0.5605409261770546 |
Epoch:   1%|          | 2/200 [00:00<01:19,  2.48it/s]
EP_train | avg_loss: 0.560181456618011 |
Epoch:   2%|▏         | 3/200 [00:01<01:17,  2.56it/s]
EP_train | avg_loss: 0.5603159982711077 |
Epoch:   2%|▏         | 4/200 [00:01<01:14,  2.63it/s]
EP_train | avg_loss: 0.5602295058779418 |
Epoch:   2%|▎         | 5/200 [00:01<01:12,  2.69it/s]
EP_train | avg_loss: 0.560076741501689 |
Epoch:   3%|▎         | 6/200 [00:02<01:11,  2.73it/s]
EP_train | avg_loss: 0.5597224552184343 |
Epoch:   4%|▎         | 7/200 [00:02<01:11,  2.69it/s]
EP_train | avg_loss: 0.5600840882398188 |
Epoch:   4%|▍         | 8/200 [00:02<01:10,  2.74it/s]
EP_train | avg_loss: 0.5590838897041976 |
Epoch:   4%|▍         | 9/200 [00:03<01:08,  2.78it/s]
EP_train | avg_loss: 0.5585224544629455 |
Epoch:   5%|▌         | 10/200 [00:03<01:11,  2.66it/s]
EP_train | avg_loss: 0.5588882807642221 |
Epoch:   6%|▌         | 11/200 

## Classification variant (no improvement)

In [14]:
# from sklearn.metrics import mean_squared_error
# a = AttractiveTrainer.train_predict
# AttractiveData.LABEL.vocab.itos[int(a[0])], AttractiveTrainer.train_true[0]
# correct = 0
# pred_list = []
# true_list = []
# for i in range(len(a)):
#     pred = AttractiveData.LABEL.vocab.itos[int(a[i])]
#     pred_list.append(float(pred))
#     true = AttractiveData.LABEL.vocab.itos[int(AttractiveTrainer.train_true[i])]
#     true_list.append(float(true))
# mean_squared_error(true_list, pred_list)
# # true_list

0.5601443355119825

## Testing

In [13]:
from attractivenet import AttractiveNet

# Checkpoint written by a previous training run.
PATH = './model/LSTM_20201101-010910_0.5378.200'
# load_model = TransformerModel(config).to(AttractiveData.device)
load_model = AttractiveNet(config).to(AttractiveData.device)
# map_location remaps the stored tensors onto the current device, so a checkpoint saved
# on a GPU can still be restored on a CPU-only machine (and vice versa).
# NOTE(review): the recorded output of this cell shows an LSTM with num_layers=2 while the
# config cell sets 'num_layers' to 1 - confirm the checkpoint matches the current config.
load_model.load_state_dict(torch.load(PATH, map_location=AttractiveData.device))
# Switch to inference mode; eval() returns the model, so the cell displays it.
load_model.eval()

AttractiveNet(
  (embedding): AttractiveEmbedding(
    (token): TokenEmbedding(12699, 300, padding_idx=1)
  )
  (encoder): LSTM(300, 30, num_layers=2, bidirectional=True)
  (linear): Sequential(
    (0): Linear(in_features=120, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=1, bias=True)
  )
)

In [14]:
def predict_attractive(sentence, category, pad_token='0'):
    """Score a single tokenised headline with the loaded model.

    Args:
        sentence: list of tokens (e.g. an example's ``Headline``). It is NOT modified.
        category: category string, looked up in the category vocabulary.
        pad_token: token used to right-pad headlines shorter than ``config['max_seq']``.
            Defaults to ``'0'``, the value this notebook has always used.
            NOTE(review): the embedding layer reports ``padding_idx=1``; confirm that
            ``'0'`` is the token the training pipeline pads with, otherwise pass the
            field's real pad token here.

    Returns:
        Whatever ``load_model`` returns for the (tokens, category) pair.
    """
    max_seq = config['max_seq']

    # Work on a copy: the previous `sentence += [...]` extended the caller's list in place,
    # which permanently padded the dataset examples passed in.
    tokens = list(sentence[:max_seq])
    tokens += [pad_token] * (max_seq - len(tokens))

    indexed_sentence = [AttractiveData.TEXT.vocab.stoi[t] for t in tokens]
    indexed_category = [AttractiveData.CATEGORIES_LABEL.vocab.stoi[category]]
    tensor_sentence = torch.LongTensor(indexed_sentence).to(AttractiveData.device)
    tensor_category = torch.LongTensor(indexed_category).to(AttractiveData.device)

    # (max_seq,) -> (max_seq, 1); the added axis is presumably the batch dimension.
    tensor_sentence = tensor_sentence.unsqueeze(1)

    return load_model(tensor_sentence, tensor_category)

In [15]:
# train mean = 3.15, test mean = 2.8
predict_list = []
with torch.no_grad():
    for i, sentence in enumerate(AttractiveData.test_data):
        # print(i)
        # print(sentence.Headline)
        prediction = predict_attractive(sentence.Headline, sentence.Category)
        predict_list.append(prediction.item() + 2.8)
        # predict_list.append(prediction.item())
AttractiveData.df_test['Label'] = predict_list
AttractiveData.df_test[['ID', 'Label']].to_csv(config['save_name'] + '.csv', index=False)

In [16]:
# train_category = list(AttractiveData.CATEGORIES_LABEL.vocab.freqs)
# test_category = list(AttractiveData.df_test['Category'].value_counts().keys())
# for each_test in test_category:
#     if each_test not in train_category:
#         print(each_test)
# print()
# for each_train in train_category:
#     if each_train not in test_category:
#         print(each_train)

## Exploratory sanity checks (just for fun)

In [17]:
import statistics
from sklearn.metrics import mean_squared_error

In [18]:
# train mean = 3.15, test mean = 2.8
train_list = []
for i, sentence in enumerate(AttractiveData.train_data):
    prediction = predict_attractive(sentence.Headline, sentence.Category)
    train_list.append(prediction.item() + 3.15)
    # train_list.append(prediction.item())
mean_squared_error(pd.read_csv('data/train.csv').sort_values(['ID']).Label.to_list(), train_list), statistics.mean(train_list), statistics.stdev(train_list)

(0.5347357640381534, 3.212537875745044, 0.009437110101923997)

In [19]:
# Mean and sample standard deviation of the labels held in the prepared training frame.
train_frame_labels = AttractiveData.df_train['Label'].to_list()
(statistics.mean(train_frame_labels), statistics.stdev(train_frame_labels))

(0.0004084967320261136, 0.7295015193216009)

In [20]:
# Mean and sample standard deviation of the test-set predictions.
test_pred_mean = statistics.mean(predict_list)
test_pred_std = statistics.stdev(predict_list)
test_pred_mean, test_pred_std

(2.8603686607225347, 0.0100302224724178)

In [21]:
# Compare the current predictions with the baseline submission (rows ordered by ID),
# then show the baseline's own mean and sample standard deviation.
baseline_frame = pd.read_csv('baseline.csv').sort_values(['ID'])
baseline_list = baseline_frame.Label.to_list()
(
    mean_squared_error(baseline_list, predict_list),
    statistics.mean(baseline_list),
    statistics.stdev(baseline_list),
)

(0.10607111434061829, 2.7156126782757597, 0.29355123275379763)

In [14]:
# LSTM my best
# statistics.mean(predict_list), statistics.stdev(predict_list)

(2.8167915543795683, 0.14611407210842048)

In [21]:
# Compare an earlier LSTM submission with the baseline and summarise its labels.
# NOTE(review): unlike baseline_list, these labels are taken in file order (not sorted
# by ID) - confirm both files list the IDs in the same order.
lstm_base_labels = pd.read_csv('LSTM_base.csv').Label.to_list()
(
    mean_squared_error(baseline_list, lstm_base_labels),
    statistics.mean(lstm_base_labels),
    statistics.stdev(lstm_base_labels),
)

(0.1347375515605904, 2.8379913731293533, 0.1903582104725371)