# Using Transformers to perform text classification


Using Hugging Face NLP to perform text classification
In this notebook, we show how to use  DistilBert and Long Transformers for performing text classification


In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

import torch
import transformers as tfs
from transformers import LongformerModel, LongformerTokenizer


## Data Preparation

In [3]:
# Read the Amazon Github Hun repo files and show the first few rows
amazonreviews_df = pd.read_json('./data/amazon_github_repos.json',lines=True)
amazonreviews_df.head()


Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,open_issues_count,license,forks,open_issues,watchers,default_branch,permissions,score,readme,label
0,61861755,MDEwOlJlcG9zaXRvcnk2MTg2MTc1NQ==,alexa-skills-kit-sdk-for-nodejs,alexa/alexa-skills-kit-sdk-for-nodejs,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-skills-kit-sdk-...,The Alexa Skills Kit SDK for Node.js helps you...,False,https://api.github.com/repos/alexa/alexa-skill...,...,8,"{'key': 'apache-2.0', 'name': 'Apache License ...",670,8,2811,2.0.x,"{'admin': False, 'push': False, 'pull': True}",1,"<p align=""center"">\n <img src=""https://m.medi...",API
1,84138837,MDEwOlJlcG9zaXRvcnk4NDEzODgzNw==,alexa-cookbook,alexa/alexa-cookbook,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-cookbook,A series of sample code projects to be used fo...,False,https://api.github.com/repos/alexa/alexa-cookbook,...,13,"{'key': 'other', 'name': 'Other', 'spdx_id': '...",912,13,1557,master,"{'admin': False, 'push': False, 'pull': True}",1,\n# Alexa Skill Building Cookbook\n\n<div styl...,API
2,63275452,MDEwOlJlcG9zaXRvcnk2MzI3NTQ1Mg==,skill-sample-nodejs-fact,alexa/skill-sample-nodejs-fact,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/skill-sample-nodejs-fact,Build An Alexa Fact Skill,False,https://api.github.com/repos/alexa/skill-sampl...,...,7,"{'key': 'apache-2.0', 'name': 'Apache License ...",1186,7,1002,master,"{'admin': False, 'push': False, 'pull': True}",1,"# Build An Alexa Fact Skill\n<img src=""https:/...",API
3,81483877,MDEwOlJlcG9zaXRvcnk4MTQ4Mzg3Nw==,avs-device-sdk,alexa/avs-device-sdk,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/avs-device-sdk,An SDK for commercial device makers to integra...,False,https://api.github.com/repos/alexa/avs-device-sdk,...,54,"{'key': 'apache-2.0', 'name': 'Apache License ...",477,54,993,master,"{'admin': False, 'push': False, 'pull': True}",1,### What is the Alexa Voice Service (AVS)?\n\n...,API
4,38904647,MDEwOlJlcG9zaXRvcnkzODkwNDY0Nw==,alexa-skills-kit-sdk-for-java,alexa/alexa-skills-kit-sdk-for-java,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-skills-kit-sdk-...,The Alexa Skills Kit SDK for Java helps you ge...,False,https://api.github.com/repos/alexa/alexa-skill...,...,2,"{'key': 'apache-2.0', 'name': 'Apache License ...",720,2,715,2.0.x,"{'admin': False, 'push': False, 'pull': True}",1,"<p align=""center"">\n <img src=""https://m.medi...",API


In [4]:
# Get the unique labels 
categories = amazonreviews_df.label.unique()
categories

array(['API', 'RESEARCH', 'GENERAL', 'OTHER', 'DEAD'], dtype=object)

In [5]:
# count each of the labels
amazonreviews_df['label'].value_counts()

API         2265
GENERAL      279
DEAD          14
RESEARCH       9
OTHER          1
Name: label, dtype: int64

In [6]:
# Focus on API and GENERAL labels only
amazonreviews_df = amazonreviews_df[amazonreviews_df['label'].isin(['API', 'GENERAL'])]
amazonreviews_df['label'].value_counts()

API        2265
GENERAL     279
Name: label, dtype: int64

In [7]:
# remove rows that contain NA
# Focus on API and GENERAL labels only
amazonreviews_df = amazonreviews_df[amazonreviews_df['readme'].notna()]
amazonreviews_df = amazonreviews_df[amazonreviews_df['label'].isin(['API', 'GENERAL'])]

# count the number of rows after removing rows that contain NA, and only rows that has a label API, General
amazonreviews_df['label'].value_counts()

API        2186
GENERAL     259
Name: label, dtype: int64

In [8]:
# Convert labels to integers

LE = LabelEncoder()
amazonreviews_df['labelcode'] = LE.fit_transform(amazonreviews_df['label'])
amazonreviews_df['labelcode'].value_counts()

0    2186
1     259
Name: labelcode, dtype: int64

## Preparing the training and test data

In [9]:
# Prepare training and test data


# Drop the columns - label and fork 
# X = amazonreviews_df.drop(['label','fork'], axis=1)
X = amazonreviews_df['readme']

# What's in the Readme for the review
print(X.head())

0    <p align="center">\n  <img src="https://m.medi...
1    \n# Alexa Skill Building Cookbook\n\n<div styl...
2    # Build An Alexa Fact Skill\n<img src="https:/...
3    ### What is the Alexa Voice Service (AVS)?\n\n...
4    <p align="center">\n  <img src="https://m.medi...
Name: readme, dtype: object


In [10]:
# get the labels
labels = amazonreviews_df.labelcode

# Split the data into train/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.30, random_state=98052)


In [11]:
# Count of label 0 and 1 in the training data set
print("Rows in X_train %d : " % len(X_train))
type(X_train.values.tolist())

y_train.value_counts()


Rows in X_train 1711 : 


0    1526
1     185
Name: labelcode, dtype: int64

In [12]:
# Count of label 0 and 1 in the test data set
y_test.value_counts()


0    660
1     74
Name: labelcode, dtype: int64

In [13]:
# Getting the dataset ready for using RandomUnderSampler
X_train_np = X_train.to_numpy()
X_test_np =  X_test.to_numpy()

# Convert 1D to 2D (used as input to sampler)
X_train_np2D = np.reshape(X_train_np,(-1,1))
X_test_np2D = np.reshape(X_test_np,(-1,1))

In [14]:
#Reference materials.
#https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html
#https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/applications/plot_topic_classication.html#sphx-glr-auto-examples-applications-plot-topic-classication-py
#https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets


# Perform random under-sampling
sampler = RandomUnderSampler(random_state = 98053)
X_train_rus, Y_train_rus = sampler.fit_resample(X_train_np2D, y_train)
X_test_rus, Y_test_rus = sampler.fit_resample(X_test_np2D, y_test)



In [14]:
from collections import Counter

print('Resampled Training dataset  %s' % Counter(Y_train_rus))
print('Resampled Test dataset %s' % Counter(Y_test_rus))

Resampled Training dataset  Counter({0: 187, 1: 187})
Resampled Test dataset Counter({0: 92, 1: 92})


In [15]:
# Preparing the resampled datasets
# flatten train and test dataset 
X_train_df = pd.DataFrame(X_train_rus.flatten())
X_test_df = pd.DataFrame(X_test_rus.flatten())


## Preparing the Dataset - Tokenization

In [16]:
# Let's start off with the DistilBert Model
# Load the DistilBERT model
model_class, tokenizer_class, pretrained_weights = (tfs.DistilBertModel, tfs.DistilBertTokenizer, 'distilbert-base-uncased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)


In [17]:
# You will observe that the input sequence length is greater than the 512 characters, that is supported by the DistilBert model
# There are different ways to address it.
tokenized = X_train_df[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)))

In [18]:
tokenized

0      [101, 1012, 1012, 9385, 2230, 1011, 2355, 9733...
1      [101, 1001, 14925, 2015, 4708, 20497, 1001, 10...
2      [101, 1001, 1001, 22091, 2015, 3465, 10566, 31...
3      [101, 1001, 22091, 2015, 12935, 2078, 2491, 10...
4      [101, 1001, 22091, 2015, 3729, 2243, 4973, 202...
                             ...                        
365    [101, 1001, 22091, 2015, 1011, 1041, 2497, 101...
366    [101, 1008, 1008, 1008, 3602, 1024, 2023, 2622...
367    [101, 1001, 14305, 1011, 4111, 1011, 8866, 103...
368                                           [101, 102]
369                                           [101, 102]
Name: 0, Length: 370, dtype: object

## Using Longformer to handle text > 512 characters

In [18]:
# https://huggingface.co/transformers/model_doc/longformer.html\
# Paper - http://arxiv.org/pdf/2004.05150.pdf

model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

In [19]:
# Set Max length to be 4096 as expected by the Long Transfer, and truncation to be true
tokenized = X_train_df[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=2048, truncation=True)))

In [20]:
type(tokenized)

pandas.core.series.Series

In [21]:
tokenized.shape

(370,)

## Padding 

In [19]:
# we need to pad all to the same size
# the following code double-checks that the max_len is indeed 4096
max_len = 0
for v in tokenized.values:
    if(len(v) > max_len):
        max_len = len(v)

print(max_len)

512


In [20]:
# Pad all the lists to the same size
# This will enable us to represent the input as a single 2D array
padded = np.array([v + [0] *(max_len-len(v)) for v in tokenized.values])


In [21]:
np.array(padded).shape

(370, 512)

## Masking

We need to create a variable to tell the model to ignore (mask) the padding that we have done.


In [22]:
attention_mask = np.where(padded !=0,1,0)
attention_mask.shape

(370, 512)

## Training the model

In [23]:
# model() function runs the README through LongTransformer
# Results are returned via last_hidden_states
import torch

input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

last_hidden_states = None
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [24]:
 last_hidden_states

(tensor([[[-3.9827e-01,  1.3014e-02, -8.8761e-02,  ..., -2.2338e-01,
            2.6989e-01,  5.5371e-01],
          [ 1.8427e-01, -3.2230e-01,  1.0787e+00,  ...,  6.8759e-01,
            1.3747e+00, -4.5151e-01],
          [-6.9678e-01, -8.4625e-02,  8.5723e-01,  ...,  5.3232e-01,
            7.3335e-01, -1.2424e-01],
          ...,
          [ 1.3377e-01,  2.1066e-01,  3.2894e-01,  ...,  2.0280e-01,
           -3.6412e-04,  1.1656e-01],
          [ 6.9400e-01,  1.4509e-01, -1.0994e-01,  ...,  1.1199e-02,
           -9.8157e-02, -7.9229e-02],
          [ 1.1844e-01,  3.9142e-01,  2.0955e-01,  ...,  3.2869e-01,
           -3.5963e-01, -3.2384e-01]],
 
         [[-3.1393e-01, -2.1880e-01, -2.2175e-02,  ..., -3.4022e-02,
            4.4059e-01,  4.2828e-01],
          [-7.8373e-01,  3.2684e-01,  3.1467e-01,  ...,  2.8760e-01,
            3.4433e-01,  5.8296e-01],
          [-6.2095e-01,  4.3895e-01,  5.0606e-01,  ...,  4.3861e-01,
            8.9414e-01,  3.8678e-01],
          ...,
    

In [25]:
# Slice the output and save the features we need into the features variable
features = last_hidden_states[0][:,0,:].numpy()

In [26]:
labels = Y_train_rus

In [29]:
X_train, X_test, y_train, y_test = train_test_split(features, labels)

In [39]:
from sklearn.model_selection import GridSearchCV

parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(X_train, y_train)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 73.68423684210526}
best scrores:  0.704025974025974


In [43]:
grid_search.best_params_["C"]

73.68423684210526

In [44]:
lr_clf = LogisticRegression(C =grid_search.best_params_["C"] )
lr_clf.fit(X_train, y_train)

LogisticRegression(C=73.68423684210526)

In [45]:
lr_clf.score(X_test, y_test)

0.6989247311827957

In [46]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

In [47]:
scores = cross_val_score(clf, X_train,y_train)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.473 (+/- 0.11)
