# Fine-tune BERT on four 20 Newsgroups science categories

In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import os, sys
import argparse

import torch
import torch.nn as nn
import torch.optim as optim

from bert_dataset import News20Dataset
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
#from transformers import DistilBertForSequenceClassification, Trainer, TrainingArgument
from transformers import BertConfig
from transformers import BertForSequenceClassification

from transformers import AdamW
from torch.utils.data import DataLoader

In [3]:
# Fetch the 'train' subset of 20 Newsgroups, limited to the four sci.* categories
# listed below. shuffle=False keeps the order returned by scikit-learn, and
# remove=... asks scikit-learn to strip headers, footers and quoted replies.
# data_home is a relative path, so it resolves against the kernel's working directory.
raw_data = fetch_20newsgroups(
    data_home='data/news20',
    subset='train',
    categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
    shuffle=False,
    remove=('headers', 'footers', 'quotes'))

In [7]:
# Texts and integer targets as returned by fetch_20newsgroups.
X = raw_data['data']
y = raw_data['target']

# Hold out 10% of the examples for validation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Both splits are wrapped with the same extra arguments. The file path and the
# value 200 are passed straight through to News20Dataset; what it does with them
# is defined in bert_dataset — TODO confirm they are still needed for the BERT pipeline.
train_dataset, val_dataset = (
    News20Dataset(texts, labels, "../data/glove/glove.6B.300d.txt", 200)
    for texts, labels in ((X_train, y_train), (X_test, y_test))
)


In [8]:
# Load the pretrained 'bert-base-uncased' weights into a sequence classifier
# configured for 4 labels (one per selected newsgroup category).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
)
# Put the model in training mode before fine-tuning.
model.train()
# Trailing print() keeps the cell's last expression from being the model object.
print()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
# Parameters whose name contains any of these fragments are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']

# Partition the parameters in a single pass, preserving named_parameters() order
# inside each group.
decayed_params, undecayed_params = [], []
for param_name, param in model.named_parameters():
    if any(fragment in param_name for fragment in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

In [11]:
# NOTE(review): this import sits outside the notebook's top import cell; consider moving it there.
from transformers import BertTokenizer
# Tokenizer loaded from the same 'bert-base-uncased' checkpoint name the model is loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [12]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
class WrappedDataLoader:
    """Iterable adapter that applies ``func`` to every batch of ``dl``.

    Each batch produced by ``dl`` is unpacked positionally into ``func`` and
    the result is what the consumer sees. ``len()`` is delegated to ``dl``.
    """

    def __init__(self, dl, func):
        self.dl, self.func = dl, func

    def __len__(self):
        # Same number of batches as the wrapped loader.
        return len(self.dl)

    def __iter__(self):
        # A fresh pass over the wrapped loader is started on every iteration.
        for batch in self.dl:
            yield self.func(*batch)
            


def get_preprocess(tokenizer, dev):
    """Return a ``preprocess(x, y)`` closure bound to ``tokenizer`` and ``dev``.

    The returned function calls ``tokenizer`` on ``x`` with padding and
    truncation enabled and ``return_tensors='pt'``, then moves both the
    encoding and ``y`` to ``dev`` via their ``.to`` methods, returning the
    pair ``(encoding, y)``.
    """
    def preprocess(x, y):
        tokens = tokenizer(
            x,
            truncation=True,
            padding=True,
            return_tensors='pt',
        )
        moved_tokens = tokens.to(dev)
        moved_labels = y.to(dev)
        return moved_tokens, moved_labels

    return preprocess
# Batch transform shared by both loaders: tokenizes the texts and moves the
# encoding and the labels to `dev`.
preprocess = get_preprocess(tokenizer, dev)

# Validation batches are scored one example at a time; order is irrelevant here,
# so the loader is left unshuffled.
val_dl = WrappedDataLoader(
    DataLoader(val_dataset, batch_size=1),
    preprocess,
)

# shuffle=True reshuffles the training examples at the start of every epoch.
# Without it each epoch replays the exact same batches in the exact same order.
# NOTE(review): shuffling requires a map-style dataset (one with __len__ and
# __getitem__) — confirm News20Dataset is one.
train_dl = WrappedDataLoader(
    DataLoader(train_dataset, batch_size=5, shuffle=True),
    preprocess,
)


In [14]:
# Move the model to the device selected in `dev`.
model.to(dev)
# print() is the cell's last expression — presumably there so the notebook does
# not echo the model's repr as cell output.
print()




In [16]:
import numpy as np
def model_eval(model, val_dl):
    """Print the fraction of validation examples the model classifies correctly.

    Args:
        model: Object callable as ``model(**xb)`` whose result exposes a
            ``logits`` attribute, and which provides ``eval()`` and ``train()``.
        val_dl: Iterable yielding ``(xb, yb)`` batches, where ``xb`` is a
            mapping of keyword arguments for ``model`` and ``yb`` holds the
            true labels (anything with ``tolist()``).

    Returns:
        None. Prints ``precision <value>``, where ``<value>`` is the share of
        examples whose arg-max logit equals the label. The "precision" label is
        kept as-is so the printed lines stay comparable with earlier runs.
    """
    # Remember the caller's mode so an eval-mode model is not silently switched
    # back into training mode; objects without a `training` flag keep the
    # previous behaviour (train mode afterwards).
    was_training = getattr(model, 'training', True)
    model.eval()

    pre, label = [], []
    try:
        # Inference only: disabling autograd avoids building a graph for every
        # forward pass, which saves memory and time.
        with torch.no_grad():
            for xb, yb in val_dl:
                outputs = model(**xb)
                pre.extend(torch.argmax(outputs.logits, dim=1).tolist())
                label.extend(yb.tolist())
    finally:
        # Restore training mode even if a batch raised.
        if was_training:
            model.train()

    pre = np.array(pre)
    label = np.array(label)
    print('precision', np.sum(pre == label) / len(pre))


In [17]:
# Fine-tune for 5 passes over the training loader, scoring the validation
# loader after each pass.
for epoch in range(5):
    for step, (inputs, targets) in enumerate(train_dl):
        # Passing labels makes the model return the loss alongside the logits.
        loss = model(**inputs, labels=targets).loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

        # Log the current batch loss every 100 batches.
        if not step % 100:
            print('iterate %d loss %.2f' % (step, loss))
    model_eval(model, val_dl)

iterate 0 loss 0.11
iterate 100 loss 0.36
iterate 200 loss 0.34
iterate 300 loss 0.03
iterate 400 loss 0.02
precision 0.9369747899159664
iterate 0 loss 0.03
iterate 100 loss 0.51
iterate 200 loss 0.33
iterate 300 loss 0.01
iterate 400 loss 0.01
precision 0.9159663865546218
iterate 0 loss 0.02
iterate 100 loss 0.43
iterate 200 loss 0.32
iterate 300 loss 0.03
iterate 400 loss 0.00
precision 0.9285714285714286
iterate 0 loss 0.01
iterate 100 loss 0.34
iterate 200 loss 0.29
iterate 300 loss 0.00
iterate 400 loss 0.00
precision 0.9159663865546218
iterate 0 loss 0.01
iterate 100 loss 0.31
iterate 200 loss 0.25
iterate 300 loss 0.00
iterate 400 loss 0.00
precision 0.9369747899159664


In [21]:
# Run the validation pass once more; model_eval prints the resulting score.
model_eval(model, val_dl)

In [17]:
# Switch the model to evaluation mode.
# NOTE(review): this cell's execution count (17) repeats the training cell's and
# is lower than the preceding cell's (21), so the saved notebook was run out of
# order — restart the kernel and run all cells before trusting the outputs.
model.eval()
# print() is the cell's last expression — presumably there so the notebook does
# not echo the model's repr as cell output.
print()


