In [86]:
import pandas as pd
import os

In [87]:
# Directory holding the fake-news CSV files. Set the FAKE_NEWS_DATA_DIR
# environment variable to point at another checkout; when it is unset the
# original author's local path is used, so existing runs behave as before.
ROOT = os.path.abspath(
    os.environ.get(
        'FAKE_NEWS_DATA_DIR',
        '/home/mod/Workspace/github/facebook-hackathon/data/',
    )
)

In [88]:
# Load the cleaned fake-news CSV from ROOT into a DataFrame.
# NOTE(review): the recorded preview of this frame shows an 'Unnamed: 0'
# column, which suggests the CSV was saved together with its index; passing
# index_col=0 would drop it — TODO confirm before relying on column positions.
fake_data = pd.read_csv(os.path.join(ROOT, 'fake_cleaned.csv'))

In [89]:
# Preview the first rows to sanity-check the loaded columns.
fake_data.head()

Unnamed: 0,text,label
0,Print They should pay all the back all the mon...,fake
1,Why Did Attorney General Loretta Lynch Plead T...,fake
2,Red State : \nFox News Sunday reported this mo...,fake
3,Email Kayla Mueller was a prisoner and torture...,fake
4,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,fake


In [27]:
from torchtext import data
import spacy

In [43]:
# Load the small English spaCy pipeline once; only its tokenizer is used here.
NLP = spacy.load('en_core_web_sm')


def tokenizer(sent):
    """Split ``sent`` into a list of token strings using the spaCy tokenizer.

    Whitespace-only tokens are dropped. The previous filter compared against
    a single space only, so newlines and longer whitespace runs ended up in
    the token list (and therefore in the vocabulary).

    Args:
        sent: the raw text to tokenize.

    Returns:
        A list of token strings with every whitespace-only token removed.
    """
    return [tok.text for tok in NLP.tokenizer(sent) if not tok.is_space]

In [44]:
# Field definitions used to convert DataFrame rows into a torchtext Dataset.

# Text column: tokenized with `tokenizer`, lower-cased, fixed length of 60.
text = data.Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
    fix_length=60,
)

# Label column: a single non-sequential value, used without a vocabulary.
# NOTE(review): use_vocab=False presumably needs labels that are already
# numeric, while the recorded preview shows the string 'fake' — confirm.
label = data.Field(
    sequential=False,
    use_vocab=False,
)

# (name, field) pairs, in the order the row values are expected.
data_fields = list(zip(("text", "label"), (text, label)))

In [45]:
# Build one torchtext Example per DataFrame row, pairing row values with
# data_fields.
# NOTE(review): train_df is not defined in any cell visible in this notebook;
# it must exist in the kernel before this cell runs — confirm where it comes
# from and that its columns line up with data_fields.
train_examples = []
for row in train_df.values.tolist():
    train_examples.append(data.Example.fromlist(row, data_fields))

In [47]:
# Wrap the examples and their field definitions in a torchtext Dataset.
train_data = data.Dataset(train_examples, data_fields)

In [53]:
# Build the vocabulary of the `text` field from the training dataset.
# NOTE(review): the output recorded under this cell is a dict of fields, which
# does not look like the result of this call — it may be stale; re-run to confirm.
text.build_vocab(train_data)

{'text': <torchtext.data.field.Field at 0x7f3073d872d0>,
 'label': <torchtext.data.field.Field at 0x7f3073d87310>}

In [2]:
import torch
import torchtext
from torchtext.datasets import text_classification

# Value passed as `ngrams` to the AG_NEWS dataset loader below.
NGRAMS = 2

# Make sure the download directory exists. exist_ok=True replaces the separate
# isdir() check, so a directory created in between cannot make this fail.
os.makedirs('./data', exist_ok=True)

# Download (if needed) and load the AG_NEWS train/test splits into ./data.
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./data', ngrams=NGRAMS, vocab=None)

# Mini-batch size used by the DataLoaders in the training/evaluation helpers.
BATCH_SIZE = 16

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ag_news_csv.tar.gz: 11.8MB [00:03, 3.65MB/s]
120000lines [00:04, 24528.18lines/s]
120000lines [00:09, 12555.01lines/s]
7600lines [00:00, 12718.15lines/s]


In [20]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    """Text classifier made of an ``nn.EmbeddingBag`` and one linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector.
        num_class: number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices uniformly from [-0.5, 0.5) and zero the bias."""
        bound = 0.5
        # Embedding first, then the linear layer, so random draws keep their order.
        for weight in (self.embedding.weight, self.fc.weight):
            nn.init.uniform_(weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return logits for each bag described by ``text`` and ``offsets``.

        ``text`` and ``offsets`` are handed unchanged to the EmbeddingBag; its
        output is fed to the linear layer.
        """
        return self.fc(self.embedding(text, offsets))

In [21]:
# Size the model from the dataset: the vocabulary length sets the number of
# embedding rows and the number of labels sets the number of output logits.
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUM_CLASS = len(train_dataset.get_labels())
# Misspelled original name, kept as an alias so existing references still work.
NUN_CLASS = NUM_CLASS
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

In [22]:
def generate_batch(batch):
    """Collate ``(label, token_tensor)`` entries into flat batch tensors.

    Args:
        batch: a sequence of entries where ``entry[0]`` is the label and
            ``entry[1]`` is a 1-D tensor of token ids.

    Returns:
        (text, offsets, label): all token tensors concatenated into one
        tensor, the start index of each entry inside that tensor, and the
        labels as a tensor.
    """
    labels = []
    sequences = []
    for entry in batch:
        labels.append(entry[0])
        sequences.append(entry[1])
    label = torch.tensor(labels)

    # Each entry starts where the previous ones end: prepend 0, drop the
    # last length, then take the running sum, e.g. lengths [3, 1, 2] ->
    # [0, 3, 1] -> offsets [0, 3, 4].
    shifted_lengths = ([0] + [len(seq) for seq in sequences])[:-1]
    offsets = torch.tensor(shifted_lengths).cumsum(dim=0)

    return torch.cat(sequences), offsets, label

In [23]:
from torch.utils.data import DataLoader

def train_func(sub_train_):
    """Run one training epoch over ``sub_train_`` and step the LR scheduler.

    Relies on the module-level ``model``, ``criterion``, ``optimizer``,
    ``scheduler``, ``device``, ``BATCH_SIZE`` and ``generate_batch``.

    Args:
        sub_train_: the dataset to iterate over, shuffled, in mini-batches.

    Returns:
        (loss, accuracy): the sum of the per-batch loss values divided by
        the number of samples, and the fraction of samples whose arg-max
        prediction equals the label.
    """
    loader = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                        collate_fn=generate_batch)
    running_loss = 0
    n_correct = 0

    for tokens, starts, targets in loader:
        tokens = tokens.to(device)
        starts = starts.to(device)
        targets = targets.to(device)

        # One optimisation step on this mini-batch.
        optimizer.zero_grad()
        logits = model(tokens, starts)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        n_correct += (logits.argmax(1) == targets).sum().item()

    # Adjust the learning rate once per epoch
    scheduler.step()

    n_samples = len(sub_train_)
    return running_loss / n_samples, n_correct / n_samples

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

In [24]:
import time
from torch.utils.data.dataset import random_split
# Training configuration.
N_EPOCHS = 5
min_valid_loss = float('inf')

# Loss, optimizer and a scheduler that multiplies the LR by 0.9 every epoch.
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

# Hold out 5% of the training set for validation.
train_len = int(len(train_dataset) * 0.95)
valid_len = len(train_dataset) - train_len
sub_train_, sub_valid_ = random_split(train_dataset, [train_len, valid_len])

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    # Whole elapsed seconds, split into minutes and leftover seconds.
    mins, secs = divmod(int(time.time() - start_time), 60)

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 5 seconds
	Loss: 0.0261(train)	|	Acc: 84.8%(train)
	Loss: 0.0001(valid)	|	Acc: 90.2%(valid)
Epoch: 2  | time in 0 minutes, 5 seconds
	Loss: 0.0119(train)	|	Acc: 93.6%(train)
	Loss: 0.0000(valid)	|	Acc: 90.1%(valid)
Epoch: 3  | time in 0 minutes, 5 seconds
	Loss: 0.0069(train)	|	Acc: 96.3%(train)
	Loss: 0.0001(valid)	|	Acc: 91.0%(valid)
Epoch: 4  | time in 0 minutes, 6 seconds
	Loss: 0.0039(train)	|	Acc: 98.1%(train)
	Loss: 0.0001(valid)	|	Acc: 90.7%(valid)
Epoch: 5  | time in 0 minutes, 5 seconds
	Loss: 0.0022(train)	|	Acc: 99.0%(train)
	Loss: 0.0002(valid)	|	Acc: 91.1%(valid)
