# Explore the dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import os, sys
import argparse

import torch
import torch.nn as nn
import torch.optim as optim

from model.fasttext import Fasttext
from rcnn_dataset import News20Dataset
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
#from transformers import DistilBertForSequenceClassification, Trainer, TrainingArgument
from transformers import BertConfig
from transformers import BertForSequenceClassification

from transformers import AdamW
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [2]:
# the size of training dataset
raw_data = fetch_20newsgroups(
    data_home='data/news20',
    subset='train',
    categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
    shuffle=False,
    remove=('headers', 'footers', 'quotes'))

In [3]:
X, y = raw_data['data'], raw_data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42) 

train_dataset = News20Dataset(X_train, y_train, "data/glove/glove.6B.300d.txt", 200)
val_dataset = News20Dataset(X_test, y_test, "data/glove/glove.6B.300d.txt", 200) 


In [4]:
dev = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
class WrappedDataLoader:
    def __init__(self, dl, func):
        self.dl = dl
        self.func = func

    def __len__(self):
        return len(self.dl)

    def __iter__(self):
        batches = iter(self.dl)
        for b in batches:
            yield (self.func(*b))
            


def get_preprocess(dev):
    def preprocess(x, y):
        return x.to(torch.int64).to(dev), y.to(torch.int64).to(dev)
    
    return preprocess
preprocess = get_preprocess(dev)
val_dl = DataLoader(val_dataset, batch_size=32)
train_dl = DataLoader(train_dataset, batch_size=32)

val_dl = WrappedDataLoader(val_dl, preprocess)
train_dl = WrappedDataLoader(train_dl, preprocess)


In [7]:
import numpy as np
def model_eval(model, val_dl):
    model.eval()
    corr, total = 0, 0
    pre = []
    label = []
    for i, (xb, yb) in enumerate(val_dl):
            outputs = model(xb)
            pre += torch.argmax(outputs,dim=1).tolist()
            label += yb.tolist()
    pre = np.array(pre)
    label = np.array(label)
    print('precision', np.sum(pre == label)/len(pre))


In [13]:
class Fasttext(nn.Module):
    def __init__(self, output_size: int, vocab_size: int, 
                  embedding_length: int, dropout: float = 0.8):
        super(Fasttext, self).__init__()
        """
        Arguments
        ---------
        output_size : 2 = (pos, neg)
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embedding dimension of GloVe word embeddings
        dropout : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 
        """
        self.output_size = output_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table.
        #self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the look-up table to the pre-trained GloVe word embedding.
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(embedding_length,100)
        self.label = nn.Linear(100, output_size)
    def forward(self, x):
        out = self.word_embeddings(x)
        out = torch.mean(out, dim=1)
        out = self.dropout(out)
        out = self.linear(out)        
        out = F.relu(out)
        out = self.label(out)
        return out

In [22]:
#self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
model = Fasttext(output_size= train_dataset.num_classes,
            vocab_size=train_dataset.vocab_size,
            embedding_length=300,
            dropout=0.2#self, batch_size, output_size, hidden_size, vocab_size, 
)
from utils import get_pretrained_weights
model.word_embeddings.weights = get_pretrained_weights("data/glove", train_dataset.vocab, 300, dev)
model.train()
model.to(dev)

Fasttext(
  (word_embeddings): Embedding(50002, 300)
  (dropout): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_features=300, out_features=100, bias=True)
  (label): Linear(in_features=100, out_features=4, bias=True)
)

In [23]:
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [24]:
for epoch in range(15):
    print(epoch,'--------------------------------------------')
    for i, (xb, yb) in enumerate(train_dl):
        model.train()
        outputs = model(xb)
        loss = F.cross_entropy(outputs, yb)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if (i + len(train_dl) * epoch) % 10==0:
            print('iterate %d loss %.2f' %(i + len(train_dl) * epoch, loss))
    if epoch % 1 == 0:
        model_eval(model, val_dl)

0 --------------------------------------------
iterate 0 loss 1.36
iterate 10 loss 1.39
iterate 20 loss 1.34
iterate 30 loss 1.37
iterate 40 loss 1.38
iterate 50 loss 1.36
precision 0.3553370786516854
1 --------------------------------------------
iterate 60 loss 1.28
iterate 70 loss 1.35
iterate 80 loss 1.00
iterate 90 loss 0.99
iterate 100 loss 0.98
precision 0.5196629213483146
2 --------------------------------------------
iterate 110 loss 0.91
iterate 120 loss 0.89
iterate 130 loss 1.03
iterate 140 loss 0.88
iterate 150 loss 0.48
precision 0.6390449438202247
3 --------------------------------------------
iterate 160 loss 0.70
iterate 170 loss 0.45
iterate 180 loss 0.72
iterate 190 loss 0.42
iterate 200 loss 0.50
precision 0.7064606741573034
4 --------------------------------------------
iterate 210 loss 0.69
iterate 220 loss 0.34
iterate 230 loss 0.38
iterate 240 loss 0.31
iterate 250 loss 0.39
precision 0.6671348314606742
5 --------------------------------------------
iterate 260 

KeyboardInterrupt: 

In [26]:
model.eval()
print()




In [27]:

# the size of training dataset
raw_data = fetch_20newsgroups(
    data_home='data/news20',
    subset='test',
    categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
    shuffle=False,
    remove=('headers', 'footers', 'quotes'))
X, y = raw_data['data'], raw_data['target']
test_dataset = News20Dataset(X_test, y_test, "data/glove/glove.6B.300d.txt", 200) 

test_dl = DataLoader(test_dataset, batch_size=32)
test_dl = WrappedDataLoader(test_dl, preprocess)

model_eval(model, test_dl)


precision 0.7542134831460674
