In [1]:
#!pip3 install ktrain

In [2]:
import ktrain
from ktrain import text

ktrain.__version__

'0.12.3'

In [3]:
categories = ['alt.atheism', 'soc.religion.christian',
             'comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
df = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

print('size of data: %s' % (len(df['data'])))

size of data: 3759


In [4]:
X = df.data
y = df.target

from sklearn.model_selection import train_test_split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size = 0.16)

len(X_train) # 60, 20, 20 split

2525

In [5]:
df.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [6]:
import pickle

preprocessed_data = (X_train, y_train, X_val, y_val, X_test, y_test)

pickle_out = open("preprocessed_data","wb")
pickle.dump(preprocessed_data, pickle_out)
pickle_out.close()
print('done')

done


In [7]:
# if you have preprocessed_data file, START HERE
import pickle
pickle_in = open("preprocessed_data","rb")
preprocessed_data = pickle.load(pickle_in)
X_train, y_train, X_val, y_val, X_test, y_test = preprocessed_data

In [8]:
# step 1 create a transformer instance
MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, classes=df.target_names)

# step 2 preprocess data
trn = t.preprocess_train(X_train, y_train)
val = t.preprocess_test(X_val, y_val)
test = t.preprocess_test(X_test, y_test)

# step 3 create a model and wrap in learner
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)



preprocessing train...
language: en
train sequence lengths:
	mean : 314
	95percentile : 884
	99percentile : 1992


preprocessing test...
language: en
test sequence lengths:
	mean : 361
	95percentile : 1048
	99percentile : 3373


preprocessing test...
language: en
test sequence lengths:
	mean : 324
	95percentile : 827
	99percentile : 2552


In [None]:
# step 4 [OPTIONAL] estimate the learning rate
learner.lr_find(show_plot=True, max_epochs=2)

simulating training for different learning rates... this may take a few moments...
Train for 420 steps
Epoch 1/2
 16/420 [>.............................] - ETA: 39:14 - loss: 1.3936 - accuracy: 0.2083

In [0]:
learner.fit_onecycle(2e-5, 4)



begin training using onecycle policy with max lr of 2e-05...
Train for 421 steps, validate for 16 steps
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f313e017588>

In [0]:
# step 6 [OPTIONAL] inspect the model
learner.view_top_losses(n=1, preproc=t)
print(X_test[0])

----------
id:260 | loss:6.27 | true:sci.med | pred:comp.graphics)

From: whitsebd@nextwork.rose-hulman.edu (Bryan Whitsell)
Subject: Re: "Accepting Jesus in your heart..."
Reply-To: whitsebd@nextwork.rose-hulman.edu
Organization: Computer Science Department at Rose-Hulman
Lines: 7

I have been told that I seem to be very smug in my post.  I appoligize
if anyone felt this way. I did not at all desire to come across in
that way. I was trying to express that I didn't understand his logic
and that I wished him the best in his life.

In Christ's Love,
Bryan Whitsell



In [0]:
# step 7 make predictions on new data
predictor = ktrain.get_predictor(learner.model, preproc=t)
predictor.predict('Jesus Christ is the central figure of Christianity.')

'soc.religion.christian'

In [0]:
!pip3 install git+https://github.com/amaiya/eli5@tfkeras_0_10_1

In [0]:
predictor.explain('Jesus Christ is the central figure of Christianity.')

Contribution?,Feature
3.229,jesus
2.969,christ
2.745,christianity
0.762,of christianity
0.756,christ is
0.364,the central
0.278,figure of
0.17,is the
0.048,central
-0.224,the


In [0]:
# step 8 [OPTIONAL] save and load predictor
#predictor.save('my_20newsgroup_predictor.preproc')

predictor = ktrain.load_predictor('./my_20newsgroup_predictor.preproc')
predictor.get_classes()

predictor.predict('Jesus Christ is the central figure of Christianity.')

predictor.predict_proba('Jesus Christ is the central figure of Christianity.')

array([0.00870162, 0.00348248, 0.00382773, 0.9839881 ], dtype=float32)

In [0]:
y_pred_raw = predictor.predict(X_test)

In [0]:
classes = predictor.get_classes()

classes.index(y_pred_raw[0])

3

In [0]:
classes = predictor.get_classes()

y_pred = []
for i in range(len(y_pred_raw)):
  sample = y_pred_raw[i]
  y_pred.append(classes.index(sample))

In [0]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)

acc

0.976063829787234

In [0]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

cm

array([[147,   1,   2,   2],
       [  0, 168,   0,   0],
       [  1,   3, 221,   0],
       [  7,   1,   1, 198]])

In [0]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       152
           1       0.97      1.00      0.99       168
           2       0.99      0.98      0.98       225
           3       0.99      0.96      0.97       207

    accuracy                           0.98       752
   macro avg       0.97      0.98      0.98       752
weighted avg       0.98      0.98      0.98       752

