Using kernel `conda_pytorch_latest_p36`

In [None]:
# !pip install fastai

In [None]:
import torch

In [None]:
import os
import random
from pathlib import Path

from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)

In [None]:
from fastai.text.all import *

# First example

In [None]:
# Download the IMDB reviews and build text DataLoaders from the folder layout,
# passing the 'test' folder as the validation split.
imdb_path = untar_data(URLs.IMDB)
dls = TextDataLoaders.from_folder(imdb_path, valid='test')

# AWD-LSTM text classifier that reports accuracy, fine-tuned for one epoch.
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(1, base_lr=1e-2)

In [None]:
# Sanity check: run the fine-tuned classifier on a single hand-written review.
learn.predict("I really liked that movie!")

In [None]:
# Display a sample batch from the IMDB DataLoaders for visual inspection.
dls.show_batch()


In [None]:
# Directory holding the prepared English sentence-level CSV files
# (relative to the notebook's working directory).
data = Path('data_prep/final_data/en/')

In [None]:
# Raw entries table.
# NOTE(review): `raw` is not referenced by any later cell visible in this notebook.
raw = pd.read_csv('data_prep/data/entries_raw.csv')

In [None]:
# Load the sentence-level train and test splits from the prepared data directory.
train, test = (
    pd.read_csv(data / csv_name)
    for csv_name in ('sentences_en_train.csv', 'sentences_en_test.csv')
)

# Get balanced data for sector 4

In [None]:
# Experiment configuration.
sector = 4          # sector id treated as the positive class
train_size = 1000   # number of sentences drawn per class for the training set (and again for the test set)

In [None]:
# Keep only the sentences flagged as relevant. The explicit .copy() makes this an
# independent frame, so the column assignment below does not write into a
# filtered slice of `train` (which triggers pandas' SettingWithCopyWarning and
# may silently fail to update).
relevant_train = train[train.is_relevant == 1].copy()

# Parse each sector_ids cell (presumably a stringified list coming from the CSV)
# into a Python object so membership tests work on it.
# NOTE(review): eval executes arbitrary expressions; this is only acceptable for
# trusted data files. ast.literal_eval is the safer choice if the cells are plain literals.
relevant_train['sector_ids'] = relevant_train.sector_ids.apply(eval)

In [None]:
# Seed the RNG so the shuffles (and therefore the train/test membership derived
# from them in later cells) are reproducible across kernel restarts.
SEED = 42
random.seed(SEED)

# Compute the sector-membership mask once and reuse it for both classes.
in_sector = relevant_train.sector_ids.apply(lambda ids: sector in ids)

# Sentence texts that are tagged with the target sector vs. those that are not.
positive_train = list(relevant_train[in_sector].sentence_text)
negative_train = list(relevant_train[~in_sector].sentence_text)

# Shuffle each pool so the slices taken later are random samples.
random.shuffle(positive_train)
random.shuffle(negative_train)

In [None]:
# Take up to train_size sentences from each (already shuffled) pool.
positive_sentences = positive_train[:train_size]
negative_sentences = negative_train[:train_size]

sentences = positive_sentences + negative_sentences
# Size the labels from the actual slice lengths: if a pool holds fewer than
# train_size sentences, using train_size here would misalign labels and
# sentences (zip would silently pair negatives with label 1).
labels = [1] * len(positive_sentences) + [0] * len(negative_sentences)

# Shuffle sentences and labels together so the pairs stay aligned.
all_ = list(zip(sentences, labels))
random.shuffle(all_)

sentences = [text for text, _ in all_]
labels = [label for _, label in all_]

In [None]:
# Held-out set: the next train_size sentences of each pool, disjoint from the
# slices used for training.
positive_test = positive_train[train_size:2*train_size]
negative_test = negative_train[train_size:2*train_size]

# NOTE(review): this rebinds `test`, which earlier held the DataFrame read from
# sentences_en_test.csv; from here on it is a plain list of sentence strings.
test = positive_test + negative_test
# Label counts follow the actual slice lengths so they stay aligned with `test`
# even when a pool has fewer than 2 * train_size sentences.
test_labels = [1] * len(positive_test) + [0] * len(negative_test)

# Train

In [None]:
# tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased")
# model = BertForSequenceClassification.from_pretrained("bert-large-uncased")

In [None]:
# Single source of truth for the pretrained checkpoint, so the tokenizer and
# the model can never be loaded from different checkpoints by accident.
MODEL_CHECKPOINT = "distilbert-base-uncased"

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Tokenize the training sentences with truncation and padding enabled.
train_encodings = tokenizer(sentences, truncation=True, padding=True)
# Alias of the shuffled training label list built alongside `sentences`.
train_labels = labels
# Tokenize the held-out sentences the same way.
# NOTE(review): `test` must be the list of sentence strings here, not the
# DataFrame read from sentences_en_test.csv under the same name — run cells in order.
test_encodings = tokenizer(test, truncation=True, padding=True)

In [None]:
# NOTE(review): leftover scratch cell — it only echoes the literal 1 and can be deleted.
1

In [None]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Each item is a dict holding one tensor per encoding field plus a
    'labels' tensor for the same index.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the idx-th example as a dict of tensors."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits and their labels for use with the Trainer.
train_dataset = Dataset(encodings=train_encodings, labels=train_labels)
test_dataset = Dataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Training hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=6,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=300,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    # Evaluate on the held-out split: evaluating on the training data itself
    # would report in-sample metrics and hide overfitting.
    eval_dataset=test_dataset            # evaluation dataset
)

In [None]:
# Number of training examples (sanity check before training).
len(train_dataset)

In [None]:
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
# Run fine-tuning with the arguments configured on the trainer.
trainer.train()

In [None]:
# Predict on the training set (in-sample check, not a generalization estimate).
outputs = trainer.predict(train_dataset)

In [None]:
# Accuracy of the predictions currently held in `outputs`: take the highest-scoring
# class per row and compare it with the reference label ids.
# NOTE(review): this rebinds `labels`, which earlier held the shuffled training label list.
preds, labels = outputs.predictions, outputs.label_ids
preds_max = np.argmax(preds, axis=1)
np.mean(np.equal(preds_max, labels))

In [None]:
# Predict on the held-out test set; this rebinds `outputs` from the training-set run.
outputs = trainer.predict(test_dataset)

In [None]:
# Accuracy of the predictions currently held in `outputs`: take the highest-scoring
# class per row and compare it with the reference label ids.
# NOTE(review): this rebinds `labels`, which earlier held the shuffled training label list.
preds, labels = outputs.predictions, outputs.label_ids
preds_max = np.argmax(preds, axis=1)
np.mean(np.equal(preds_max, labels))