ref: [multi-label bert](https://towardsdatascience.com/multi-label-multi-class-text-classification-with-bert-transformer-and-keras-c6355eccb63a)

In [1]:
!pip install -i https://mirrors.ustc.edu.cn/pypi/web/simple -q transformers

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast

# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

## Data

### get

In [4]:
# Download the complaints archive from consumerfinance.gov into ./data.
# -nc (no-clobber) skips the download when the file is already present.
!wget -nc http://files.consumerfinance.gov/ccdb/complaints.csv.zip -P ./data

wget: /data2/wangyh/anaconda3/lib/libuuid.so.1: no version information available (required by wget)
File ‘./data/complaints.csv.zip’ already there; not retrieving.



In [5]:
# Extract the archive into ./data; -n never overwrites files that already exist.
!unzip -n ./data/complaints.csv.zip -d ./data

Archive:  ./data/complaints.csv.zip


In [6]:
# Peek at the first two physical lines of the CSV: the header row and the
# beginning of the first record (narratives can span several lines).
!head -n 2 ./data/complaints.csv

Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
2019-09-24,Debt collection,I do not know,Attempts to collect debt not owed,Debt is not yours,"transworld systems inc. 


### load

In [7]:
# Parse only the columns the notebook uses (free-text narrative plus the two
# target columns) instead of holding all 18 CSV columns in memory.
data = pd.read_csv(
    './data/complaints.csv',
    usecols=['Consumer complaint narrative', 'Product', 'Issue'])

### preprocess

In [8]:
# Keep the complaint text and the two classification targets, in this order.
data = data.loc[:, ['Consumer complaint narrative', 'Product', 'Issue']]

In [9]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [10]:
# List each distinct Issue value once, at the row where it first appears.
data['Issue'].drop_duplicates(keep='first')

0                          Attempts to collect debt not owed
2                                      Communication tactics
12         Problem with a credit reporting company's inve...
17                      Incorrect information on your report
26                             Struggling to repay your loan
                                 ...                        
1575204                        Shopping for a line of credit
1575407                  Lender damaged or destroyed vehicle
1578753                          Managing the line of credit
1593458                             Lender sold the property
1613542                 Lender damaged or destroyed property
Name: Issue, Length: 160, dtype: object

In [11]:
# Store a categorical copy of each target in a new *_label column.
data = data.assign(
    Issue_label=pd.Categorical(data['Issue']),
    Product_label=pd.Categorical(data['Product']))
data.head(1)

Unnamed: 0,Consumer complaint narrative,Product,Issue,Issue_label,Product_label
0,transworld systems inc. \nis trying to collect...,Debt collection,Attempts to collect debt not owed,Attempts to collect debt not owed,Debt collection


In [12]:
# Replace the text targets with the integer code of their category.
data = data.assign(
    Issue=data['Issue_label'].cat.codes,
    Product=data['Product_label'].cat.codes)
data.head(1)

Unnamed: 0,Consumer complaint narrative,Product,Issue,Issue_label,Product_label
0,transworld systems inc. \nis trying to collect...,7,13,Attempts to collect debt not owed,Debt collection


In [13]:
# List each distinct Issue code once, at the row where it first appears.
data['Issue'].drop_duplicates(keep='first')

0           13
2           33
12         111
17          72
26         137
          ... 
1575204    132
1575407     76
1578753     86
1593458     78
1613542     75
Name: Issue, Length: 160, dtype: int16

### train & test

In [14]:
# TODO: for test only -- restrict the data to its first 60,000 rows
data = data.iloc[:60_000]

In [15]:
# Remove rows whose label is present only once (a single-member class can't
# be split).
# Product is filtered first and Issue last: the train/test split stratifies
# over Issue, so every Issue must still have more than one row once all
# filtering is done. In the opposite order, dropping a singleton Product
# could leave some Issue with a single row again.
data = data.groupby('Product').filter(lambda x : len(x) > 1)
data = data.groupby('Issue').filter(lambda x : len(x) > 1)

In [16]:
# Width of each output head: the largest integer code plus one, so every
# code that occurs in the data is a valid class index.
n_issue, n_product = (data[col].max() + 1 for col in ('Issue', 'Product'))

In [17]:
# Split into train and test - stratify over Issue.
# The fixed random_state makes the split repeatable across kernel restarts.
data, data_test = train_test_split(
    data,
    test_size = 0.2,
    stratify = data[['Issue']],
    random_state = 42)

In [18]:
# Report (rows, columns) of the train split, then of the test split.
print(data.shape, data_test.shape, sep='\n')

(47991, 5)
(11998, 5)


In [19]:
# Number of distinct entries reported for the integer codes vs. for the
# categorical label column of the train split.
print(data['Product'].value_counts().shape,
      data['Product_label'].value_counts().shape,
      sep='\n')

# Rows per product code in the train split.
data['Product'].value_counts()

(14,)
(18,)


6     21974
7      9889
4      4640
10     3790
1      2773
15     1630
16     1162
8      1120
13      967
5        14
2        12
3         8
12        8
0         4
Name: Product, dtype: int64

In [20]:
# Same comparison for the test split: distinct integer codes vs. entries
# reported for the categorical label column.
print(data_test['Product'].value_counts().shape,
      data_test['Product_label'].value_counts().shape,
      sep='\n')

(13,)
(18,)


## Model

### pretrained

In [21]:
# Checkpoint identifier passed to the config, tokenizer and model loaders
model_name = "bert-base-uncased"

In [22]:
# Fetch the checkpoint's configuration with output_hidden_states overridden
# to False at load time.
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)

In [23]:
# Instantiate the fast BERT tokenizer for the chosen checkpoint
tokenizer = BertTokenizerFast.from_pretrained(model_name, config = config)

In [24]:
# Instantiate the TensorFlow BERT model from the chosen checkpoint
transformer_model = TFBertModel.from_pretrained(
    pretrained_model_name_or_path = model_name,
    config = config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


### classification

#### inputs

In [25]:
# Max length of tokens: used both as the tokenizer's truncation limit and as
# the width of the model's input layer.
max_length = 100

In [26]:
# Model input: one int32 tensor named 'input_ids' holding max_length token
# ids per example, wrapped in a dict keyed by the same name.
inputs = {'input_ids': Input(shape=(max_length,), name='input_ids', dtype='int32')}
input_ids = inputs['input_ids']

#### hiddens

In [27]:
# Take the first layer of the loaded model (its BERT main layer) and show it
bert = next(iter(transformer_model.layers))
bert

<transformers.models.bert.modeling_tf_bert.TFBertMainLayer at 0x7f5962761e50>

In [28]:
# Call the BERT layer on the Keras inputs and keep element 1 of its output.
# NOTE(review): element 1 is presumably BERT's pooled output -- confirm.
bert_outputs = bert(inputs)
bert_model = bert_outputs[1]

In [29]:
# Dropout layer using the config's hidden dropout rate; it appears in the
# graph under the name 'pooled_output'.
dropout = Dropout(rate=config.hidden_dropout_prob, name='pooled_output')
# NOTE(review): training=False pins the layer to inference mode, so presumably
# nothing is dropped even during fit() -- confirm this is intended.
pooled_output = dropout(bert_model, training=False)

#### outputs

In [30]:
# Classification head for 'issue': a linear layer with n_issue outputs (no
# activation), initialised with the config's initializer_range.
issue = Dense(units=n_issue,
              kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
              name='issue')(pooled_output)

In [31]:
# Classification head for 'product': a linear layer with n_product outputs
# (no activation), initialised with the config's initializer_range.
product = Dense(units=n_product,
                kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
                name='product')(pooled_output)

In [32]:
# Collect both heads in a dict keyed by head name
outputs = dict(issue=issue, product=product)

#### model

In [33]:
# Assemble the input and the two heads into a single Keras model
model = Model(
    inputs=inputs,
    outputs=outputs,
    name='BERT_MultiLabel_MultiClass',
)

In [34]:
# Take a look at the model: prints each layer with its output shape,
# parameter count and incoming connections.
model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
__________________________________________________________________________________________________
pooled_output (Dropout)         (None, 768)          0           bert[0][1]                       
__________________________________________________________________________________________________
issue (Dense)                   (None, 160)          123040      pooled_output[0][0]              
_________________________________________________________________________

## Tune

### train

In [35]:
# Adam optimizer with gradient-norm clipping at 1.0.
# NOTE(review): `decay` may be a legacy argument in newer tf.keras releases --
# confirm the installed version still accepts it.
optimizer = Adam(learning_rate=5e-05, epsilon=1e-08, decay=0.01, clipnorm=1.0)

In [36]:
# One categorical cross-entropy per output head, each computed from logits.
loss = {head: CategoricalCrossentropy(from_logits = True)
        for head in ('issue', 'product')}

In [37]:
# One categorical-accuracy metric per output head, each named 'accuracy'.
metric = {head: CategoricalAccuracy('accuracy')
          for head in ('issue', 'product')}

In [38]:
# Attach the optimizer, the per-head losses and the per-head metrics
model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [39]:
# Ready output data for the model: one-hot encode both targets.
# num_classes is pinned to the width of the matching Dense head. Without it,
# to_categorical sizes the matrix from the largest code present in this
# particular split, which can be narrower than the head and break fit().
y_issue = to_categorical(data['Issue'], num_classes=n_issue)
y_product = to_categorical(data['Product'], num_classes=n_product)

print(len(y_issue))
print(len(y_product))

47991
47991


In [40]:
# Show the first row of the training split
data.iloc[:1]

Unnamed: 0,Consumer complaint narrative,Product,Issue,Issue_label,Product_label
111594,A hospital in XXXX NY ( XXXX ) was trying to c...,7,13,Attempts to collect debt not owed,Debt collection


In [41]:
# Tokenize the input (takes some time).
# padding='max_length' always yields exactly max_length ids per example,
# matching the model's fixed (max_length,) input layer. padding=True would
# only pad to the longest text in the call, which can be shorter.
# NOTE(review): no attention mask is returned or fed to the model, so padding
# tokens are presumably attended to like real tokens -- confirm acceptable.
x = tokenizer(
    text=data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

In [42]:
# Train both heads jointly for 10 epochs in batches of 64, holding out 20%
# of the training samples for validation.
history = model.fit(
    x=dict(input_ids=x['input_ids']),
    y=dict(issue=y_issue, product=y_product),
    batch_size=64,
    epochs=10,
    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### evaluate

In [43]:
# Ready test data.
# num_classes is pinned to the width of the matching Dense head: the test
# split may not contain the highest class code, and to_categorical would
# then produce a narrower one-hot matrix than the model outputs.
test_y_issue = to_categorical(data_test['Issue'], num_classes=n_issue)
test_y_product = to_categorical(data_test['Product'], num_classes=n_product)
# padding='max_length' always yields exactly max_length ids per example,
# matching the model's fixed (max_length,) input layer. padding=True would
# only pad to the longest text in the call, which can be shorter.
test_x = tokenizer(
    text=data_test['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

In [44]:
# Score the trained model on the held-out test split (both heads)
model_eval = model.evaluate(
    x=dict(input_ids=test_x['input_ids']),
    y=dict(issue=test_y_issue, product=test_y_product))

