In [8]:
#import libraries
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import sys
import nltk
import ssl
import re
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn import feature_selection, feature_extraction, naive_bayes, pipeline, metrics
import gensim
import gensim.downloader as gensim_api
from gensim.models import Word2Vec
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
import transformers
import torch
import gc


# If this Python build exposes ssl._create_unverified_context, install it as
# the default HTTPS context factory (presumably so the nltk.download calls in
# this cell work on machines whose certificate store is broken).
# NOTE: this switches off HTTPS certificate verification for every later
# stdlib HTTPS request made in this kernel, not only for NLTK.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the NLTK resources by package id. The recorded cell output shows each
# call reports "already up-to-date" once the package is present in nltk_data.
# NOTE(review): none of these corpora is referenced in the cells visible
# here -- confirm they are still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

#command to install pip packages in jupyter
#!{sys.executable} -m pip install [package]


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/riku/snap/jupyter/6/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/riku/snap/jupyter/6/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/riku/snap/jupyter/6/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
#load data
# Imported explicitly: `import sklearn` on its own does not guarantee that the
# `sklearn.model_selection` submodule has been loaded.
from sklearn.model_selection import train_test_split

# Select the label and text columns *before* dropping missing values, so a gap
# in some unrelated CSV column cannot discard an otherwise usable row.
df = pd.read_csv('spectrum.csv')
df = df[['spectrum', 'body']].dropna()

#split data
# 20% held out for test, then 25% of the remainder for validation
# (60/20/20 overall); random_state is fixed so the split is reproducible.
# NOTE(review): `val` is not used by any cell visible here -- training uses
# Keras' validation_split instead.
train, test = train_test_split(df, test_size=0.2, random_state=1)
train, val = train_test_split(train, test_size=0.25, random_state=1)

In [10]:
#device
# First CUDA device when torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

#tokenizer
# Tokenizer matching the 'distilbert-base-uncased' checkpoint used for the model.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    do_lower_case=True,
)

In [11]:
#batch encoder
def batch_encode(text, batch_size, max_seq_len, encoder=None):
    """Tokenize `text` in chunks and return fixed-width id and mask matrices.

    Parameters
    ----------
    text : pandas.Series or numpy.ndarray of str
        Documents to encode; sliced positionally, each slice is converted
        with ``.tolist()`` before being handed to the tokenizer.
    batch_size : int
        Number of documents passed to the tokenizer per call.
    max_seq_len : int
        Every row is truncated / padded to exactly this many tokens.
    encoder : optional
        Object exposing ``batch_encode_plus``. Defaults to the notebook-level
        ``tokenizer``.

    Returns
    -------
    (input_ids, attention_masks) : tuple of numpy.ndarray
        Two int32 arrays of shape ``(len(text), max_seq_len)``.
    """
    if encoder is None:
        encoder = tokenizer

    id_chunks = []
    mask_chunks = []
    for start in range(0, len(text), batch_size):
        encoded = encoder.batch_encode_plus(
            text[start : start + batch_size].tolist(),
            max_length=max_seq_len,
            add_special_tokens=True,
            # "max_length" (not "longest") guarantees every chunk is exactly
            # max_seq_len wide, so chunks can be stacked without reshaping.
            padding="max_length",
            return_attention_mask=True,
            truncation=True,
            return_tensors="np",
        )
        # int32 matches the dtype of the Keras input layers; collecting chunks
        # in a list avoids re-copying the whole array on every iteration.
        id_chunks.append(np.asarray(encoded["input_ids"], dtype=np.int32))
        mask_chunks.append(np.asarray(encoded["attention_mask"], dtype=np.int32))

    if not id_chunks:
        # Empty input: keep the 2-D contract instead of failing in concatenate.
        return (
            np.empty((0, max_seq_len), dtype=np.int32),
            np.empty((0, max_seq_len), dtype=np.int32),
        )

    input_ids_list = np.concatenate(id_chunks, axis=0)
    attention_masks_list = np.concatenate(mask_chunks, axis=0)
    return (input_ids_list, attention_masks_list)

In [12]:
#encode train data in batches
# 10000 documents per tokenizer call, each row truncated/padded to 512 tokens
input_ids_train, attention_masks_train = batch_encode(
    text=train["body"],
    batch_size=10000,
    max_seq_len=512,
)

In [13]:
#free memory
# Force a full garbage-collection pass; as the last expression of the cell,
# the value returned by gc.collect() is what the notebook displays.
gc.collect()

20

In [14]:
#define model
# Sequence width of the encoded inputs; must equal the max_seq_len passed to
# batch_encode (512 in this notebook).
MAX_SEQ_LEN = 512

#inputs
# The shape has to be a tuple: `(512)` is just the int 512, `(512,)` is a 1-tuple.
idx = layers.Input((MAX_SEQ_LEN,), dtype="int32", name="input_idx")
masks = layers.Input((MAX_SEQ_LEN,), dtype="int32", name="input_masks")

#pre-trained bert with config
config = transformers.DistilBertConfig(dropout=0.2, attention_dropout=0.2)
config.output_hidden_states = False
nlp = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)
# Freeze the pretrained encoder by reference rather than by its position in
# model.layers, so only the dense head below is trained.
nlp.trainable = False
# [0] is the last hidden state, one vector per token.
bert_out = nlp(idx, attention_mask=masks)[0]

#fine-tuning
# NOTE(review): this average runs over every position, padding included;
# consider a mask-aware mean using `masks`.
x = layers.GlobalAveragePooling1D()(bert_out)
x = layers.Dense(64, activation="relu")(x)
# One softmax unit per distinct label seen in the training split.
y_out = layers.Dense(len(np.unique(train['spectrum'])), activation='softmax')(x)

#compile model
model = models.Model([idx, masks], y_out)
# Integer class ids are fed as targets, hence the *sparse* categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_idx (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 input_masks (InputLayer)       [(None, 512)]        0           []                               
                                                                                                  
 tf_distil_bert_model_1 (TFDist  TFBaseModelOutput(l  66362880   ['input_idx[0][0]',              
 ilBertModel)                   ast_hidden_state=(N               'input_masks[0][0]']            
                                one, 512, 768),                                                   
                                 hidden_states=None                                         

In [15]:
#free memory
# Force a full garbage-collection pass after building the model; the value
# returned by gc.collect() is what the notebook displays for this cell.
gc.collect()

19050

In [16]:
#encode y

#BEST TO BE RUN ON SCHOOL COMPUTER WITH CUDA

# Raw label / text columns of the splits.
y_train = train['spectrum']
y_test = test['spectrum']
X_test = test['body']

# class index -> label (sorted unique training labels), and its inverse.
dic_y_mapping = dict(enumerate(np.unique(y_train)))
inverse_dic = {label: n for n, label in dic_y_mapping.items()}
# Replace every training label by its integer class index.
y_train = np.array(list(map(inverse_dic.__getitem__, y_train)))

## train
# One epoch; validation_split=0.3 makes Keras hold out part of the training
# arrays for validation.
training = model.fit(
    x=(input_ids_train, attention_masks_train),
    y=y_train,
    batch_size=64,
    epochs=1,
    shuffle=True,
    verbose=1,
    validation_split=0.3,
)

  10/1304 [..............................] - ETA: 12:32:59 - loss: 0.6982 - accuracy: 0.4875

KeyboardInterrupt: 

In [None]:
#test model

#encode test data in batches
input_ids_test, attention_masks_test = batch_encode(test["body"], 10000, 512)

#predict
# Both model inputs must be passed together as ONE argument: the second
# positional parameter of Model.predict is batch_size, so passing the masks
# separately would hand them over as the batch size.
predicted_prob = model.predict([input_ids_test, attention_masks_test])
# Column k of predicted_prob belongs to class dic_y_mapping[k].
predicted = [dic_y_mapping[class_idx]
             for class_idx in np.argmax(predicted_prob, axis=1)]

In [7]:
#print results

# Class order is taken from the training mapping, because column k of
# predicted_prob is the probability of dic_y_mapping[k]. Deriving the order
# from y_test would misalign the columns whenever a class is absent from the
# test split.
classes = np.array([dic_y_mapping[k] for k in range(predicted_prob.shape[1])])
# One-hot ground truth with exactly one column per model output column.
y_test_array = (np.asarray(y_test)[:, None] == classes[None, :]).astype(int)

## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test, predicted)
if len(classes) == 2:
    # Binary case: score with the probability of the second (greater) label.
    auc = metrics.roc_auc_score(y_test, predicted_prob[:, 1])
else:
    # Multi-class case: one-vs-rest AUC over the full probability matrix.
    auc = metrics.roc_auc_score(y_test, predicted_prob,
                                multi_class="ovr", labels=classes)
print("Accuracy:",  round(accuracy,2))
print("Auc:", round(auc,2))
print("Detail:")
print(metrics.classification_report(y_test, predicted))

fig, ax = plt.subplots(nrows=1, ncols=2)
## Plot roc
# One curve per class, treating that class as the positive label.
for i in range(len(classes)):
    fpr, tpr, thresholds = metrics.roc_curve(y_test_array[:,i],
                           predicted_prob[:,i])
    ax[0].plot(fpr, tpr, lw=3,
              label='{0} (area={1:0.2f})'.format(classes[i],
                              metrics.auc(fpr, tpr))
               )
# Diagonal = performance of a random classifier.
ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
          xlabel='False Positive Rate',
          ylabel="True Positive Rate (Recall)",
          title="Receiver operating characteristic")
ax[0].legend(loc="lower right")
ax[0].grid(True)

## Plot precision-recall curve
for i in range(len(classes)):
    precision, recall, thresholds = metrics.precision_recall_curve(
                 y_test_array[:,i], predicted_prob[:,i])
    ax[1].plot(recall, precision, lw=3,
               label='{0} (area={1:0.2f})'.format(classes[i],
                                  metrics.auc(recall, precision))
              )
ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
          ylabel="Precision", title="Precision-Recall curve")
ax[1].legend(loc="best")
ax[1].grid(True)
plt.show()

(119151, 512)