In [28]:
import ktrain
from ktrain import text
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#### Globals

In [80]:
# --- Filesystem layout (relative to the notebook's directory) ---
PATH_PIPE = '../Pipeline/'
PATH_RAW = PATH_PIPE + 'raw_data/'      # raw input files
PATH_DATA = PATH_PIPE + 'proc_data/'    # processed CSVs read by this notebook
PATH_MODELS = '../assets/models/'       # where the predictor is saved
PATH_IMAGES = '../assets/images/'

# --- Label sets ---
MULTICLASS = [
    'Not constructive',
    'Rather not',
    'Rather yes',
    'Constructive',
]
BINARY = ['Not constructive', 'Constructive']

# --- Model / preprocessing hyper-parameters ---
MODEL_NAME = 'distilbert-base-uncased'  # model name handed to text.Transformer
MAXLEN = 250                            # passed as `maxlen` to text.Transformer
MAXFEATURES = 5000                      # not referenced in the visible cells
BATCH_SIZE = 6                          # batch size for training (the learner cell uses the same value)

## Load data

Either with separate train and test files or...

In [10]:
# Alternative (disabled): load pre-split train and test CSVs instead of
# splitting a single file. The single-file path further down is the active one.
#train = pd.read_csv(PATH_DATA + "2829_train.csv")
#test = pd.read_csv(PATH_DATA + "2829_test.csv")

... with only one file.

In [70]:
# Read the full labelled dataset from the processed-data folder.
data = pd.read_csv(PATH_DATA + "2840_full_data.csv")

# Drop rows missing either the label or the review text: a NaN text would
# otherwise reach the tokenizer as a float instead of a string.
data = data.dropna(subset=['tag', 'text_review'])

# Review text, the `tag` labels used for training below, and the `bin_tag`
# labels (not used in the visible cells).
X, y, y_bin = data['text_review'], data['tag'], data['bin_tag']

# Hold out 25% of the rows for validation; the fixed seed keeps the split
# reproducible. Everything is converted to NumPy arrays for ktrain.
X_train, X_test, y_train, y_test = [
    np.asarray(part)
    for part in train_test_split(X, y, test_size=0.25, random_state=42)
]

## Load model and preprocess

In [76]:
# Setup the text transformer from ktrain
t = text.Transformer(MODEL_NAME, maxlen=MAXLEN, class_names=MULTICLASS)

# Special preprocessing step
trn = t.preprocess_train(X_train, y_train)
val = t.preprocess_test(X_test, y_test)

# Model generation
model = t.get_classifier()

preprocessing train...
language: en
train sequence lengths:
	mean : 65
	95percentile : 230
	99percentile : 518


preprocessing test...
language: en
test sequence lengths:
	mean : 64
	95percentile : 257
	99percentile : 522


We get a Learner object that wraps training, tuning and validating

In [77]:
# Wrap the model and both datasets in a ktrain Learner. The batch size comes
# from the BATCH_SIZE constant so it is configured in a single place.
learner = ktrain.get_learner(
    model,
    train_data=trn,
    val_data=val,
    batch_size=BATCH_SIZE,
)

## Training and tuning

Try a range of learning rates to help choose a good one before training.

In [78]:
# Learning-rate range test, capped at 2 epochs, with the loss plot shown.
# NOTE(review): the recorded run of this cell ended with KeyError: 'loss';
# possibly a ktrain / TensorFlow version mismatch - confirm before relying on it.
learner.lr_find(max_epochs=2, show_plot=True)

simulating training for different learning rates... this may take a few moments...
Train for 353 steps
Epoch 1/2
 10/353 [..............................] - ETA: 28:34 - loss: 1.3925 - accuracy: 0.2000

KeyError: 'loss'

#### Training

`fit_onecycle` gradually increases the learning rate for half of the training and then decreases it until the end.

`fit` can be used with an SGDR (stochastic gradient descent with warm restarts) learning rate schedule.

`autofit` uses a triangular learning rate policy

In [None]:
# Active choice: one-cycle policy with learning rate 5e-5 for 2 epochs.
learner.fit_onecycle(lr=5e-5, epochs=2)

# Alternative schedules (disabled):
#learner.fit(0.001, 3, cycle_len=1, cycle_mult=2)
#learner.autofit(0.001, 3)

Checking which _n_ samples induced the strongest loss.

In [None]:
# Ask the learner for its highest-loss samples; the transformer `t` is passed
# as the preprocessor.
learner.top_losses(preproc=t)

Validate on validation data. Outputs an sklearn classification report.

In [None]:
# Use the class names known to the transformer to label the report.
class_labels = t.get_classes()
learner.validate(class_names=class_labels)

Loading a trained model

In [None]:
# Alternative (disabled): reload a previously saved predictor instead of training.
# NOTE(review): `model_name` is not defined in this notebook, and the save cell
# writes to PATH_MODELS itself - confirm the intended path before enabling.
#predictor = ktrain.load_predictor(PATH_MODELS + model_name)

#### Prediction

In [None]:
# Pair the learner's model with the transformer so that raw text can be
# passed straight to the predictor.
predictor = ktrain.get_predictor(
    learner.model,
    preproc=t,
)

In [None]:
# Sample review text for trying out the predictor
new_data = "Nice shoes but too small."

In [None]:
# Score the sample review defined above. The original passed `data` (the whole
# training DataFrame) rather than the `new_data` string.
predictor.predict(new_data, return_proba=True)

#### Saving the model

In [None]:
# Persist the predictor to the models folder so it can be reloaded later
# without retraining.
predictor.save(PATH_MODELS)

Visualize the weight of each word in the classification (not working on Colab).

In [81]:
#!pip3 install git+https://github.com/amaiya/eli5@tfkeras_0_10_1

In [None]:
# Explain the prediction for the sample review. The original passed `data`
# (the whole training DataFrame) rather than the `new_data` string.
predictor.explain(new_data)