<a href="https://colab.research.google.com/github/wei-enwang/space-ham/blob/main/main_driver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import nltk
from nltk.corpus import words
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils import data
from preprocess import WholeData

import models
import utils

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/weinwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
nltk.download('words')

device = "cuda" if torch.cuda.is_available() else "cpu"
assert device == "cuda"   # use gpu whenever you can!

seed = 32
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

[nltk_data] Downloading package words to /home/weinwang/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [9]:
plot_yes = True

# use one dataset for now
train_data_dir = "./data/enron1/"
test_data_dir = "./data/enron2/"
output_dir = "./output/"

# hyperparameters
batch_size = 64
dropout = 0.5
learning_rate = 1e-4
epochs = 30
max_len = 50

In [4]:
vocab = set([str.lower() for str in words.words()])

train_dataset = WholeData(train_data_dir, src_vocab=vocab, use_max_len=True, max_len=max_len)
test_dataset = WholeData(test_data_dir, src_vocab=vocab, use_max_len=True, max_len=max_len)
w2idx = train_dataset.src_v2id

embed = utils.load_pretrained_vectors(w2idx, "fastText/crawl-300d-2M.vec")
embed = torch.tensor(embed)

Loading pretrained vectors...
234378


0it [00:00, ?it/s]

There are 75835 / 234379 pretrained vectors found.


In [10]:
train_dataloader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                                   num_workers=8, pin_memory=True)
test_dataloader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, 
                                  num_workers=8, pin_memory=True, drop_last=True)


In [11]:
model = models.spam_lstm(pretrained_embedding=embed, dropout=dropout).to(device)
loss_fn = nn.BCEWithLogitsLoss().to(device)
opt = Adam(model.parameters(), lr=learning_rate)

Using pretrained vectors...


In [None]:
utils.train_test_scheme(train_dataloader, test_dataloader, model, loss_fn, opt, 
                        task_name="w2v+lstm50_batch64", epochs=epochs, 
                        vis=plot_yes, print_every=1, img_dir=output_dir)



  3%|▎         | 1/30 [00:05<02:48,  5.80s/it]

Epoch 0
-------------------------------
Training loss: 0.628042, avg accuracy: 0.464313
Testing loss: 0.533749, avg accuracy: 0.745192


  7%|▋         | 2/30 [00:11<02:42,  5.82s/it]

Epoch 1
-------------------------------
Training loss: 0.435865, avg accuracy: 0.815275
Testing loss: 0.347613, avg accuracy: 0.855941


 10%|█         | 3/30 [00:17<02:36,  5.81s/it]

Epoch 2
-------------------------------
Training loss: 0.265758, avg accuracy: 0.928300
Testing loss: 0.330222, avg accuracy: 0.856284


 13%|█▎        | 4/30 [00:23<02:31,  5.81s/it]

Epoch 3
-------------------------------
Training loss: 0.184144, avg accuracy: 0.943153
Testing loss: 0.381219, avg accuracy: 0.827095


 17%|█▋        | 5/30 [00:29<02:25,  5.82s/it]

Epoch 4
-------------------------------
Training loss: 0.127503, avg accuracy: 0.957858
Testing loss: 0.329132, avg accuracy: 0.883757


 20%|██        | 6/30 [00:34<02:19,  5.83s/it]

Epoch 5
-------------------------------
Training loss: 0.101287, avg accuracy: 0.968750
Testing loss: 0.320446, avg accuracy: 0.883413


 23%|██▎       | 7/30 [00:40<02:13,  5.82s/it]

Epoch 6
-------------------------------
Training loss: 0.080479, avg accuracy: 0.976570
Testing loss: 0.326819, avg accuracy: 0.884615


 27%|██▋       | 8/30 [00:46<02:08,  5.83s/it]

Epoch 7
-------------------------------
Training loss: 0.064966, avg accuracy: 0.982401
Testing loss: 0.310819, avg accuracy: 0.904190


 30%|███       | 9/30 [00:52<02:02,  5.83s/it]

Epoch 8
-------------------------------
Training loss: 0.060497, avg accuracy: 0.982980
Testing loss: 0.327141, avg accuracy: 0.903159


 33%|███▎      | 10/30 [00:58<01:56,  5.83s/it]

Epoch 9
-------------------------------
Training loss: 0.052656, avg accuracy: 0.986452
Testing loss: 0.307395, avg accuracy: 0.906078


In [8]:
torch.save(model.state_dict(), output_dir+"w2v_lstmlen50_batch64.pt")