In [20]:
# go through this first: https://www.tensorflow.org/text/tutorials/classify_text_with_bert#export_for_inference

import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from helper import remove_unused_columns, transform_profession, preprocess_text
import transformers
import torch
import re
import numpy as np
from tensorflow.python.keras import layers, models
from tqdm import tqdm

tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import TaggedDocument

tf.get_logger().setLevel('ERROR')

## Clean up your data

In [21]:
# reading our files and
file = '../excel-data/f-linkedin-profile.csv'
data_csv = pd.read_csv(file)
data_top = remove_unused_columns(data_csv)[0:13]

# -------- Get our columns into lists --------

headline_categories = list(data_csv['Industry'])


def encode_labels(headline) -> ([int], dict):
    categories_encode = {}
    enc_values = []

    for i, categories in enumerate(list(data_csv['Industry'])):
        if categories not in categories_encode:
            categories_encode[categories] = i

    for item in headline:
        num = categories_encode[item] if item in categories_encode else ''
        enc_values.append(num)

    return enc_values, categories_encode


profession = transform_profession(data_top, data=data_csv)
category_list = data_csv['Headline']
category_id = [i for i in range(len(category_list))]
encoded_labels, categories_label = encode_labels(headline_categories)

# Initialize our columns into a dataframe
dtf = pd.DataFrame()
dtf['category_id'] = category_id
dtf['categories'] = headline_categories
dtf['profession'] = profession
dtf['categories_encode'] = encoded_labels

# Clean your data set first remove unwanted words like: "I", "me", "you"
list_stop_of_words = stopwords.words('english')
dtf['clean_text_profession'] = dtf['profession'].apply(
    lambda x: preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=list_stop_of_words))

dtf.head()

Unnamed: 0,category_id,categories,profession,categories_encode,clean_text_profession
0,0,Machine Learning,"Director of Data Science, Machine Learning at ...",0,director data science machine learning walmart...
1,1,Machine Learning,Machine learning on Encrypted data Engineer,0,machine learning encrypted data engineer
2,2,Machine Learning,Machine Learning Research Scientist - Deep Lea...,0,machine learning research scientist deep learning
3,3,Machine Learning,Principal (Manager) R&D Data Scientist,0,principal manager rd data scientist
4,4,Machine Learning,"Vice President of Machine Learning, Merchandis...",0,vice president machine learning merchandising ...


## Split data set

In [22]:
## split dataset
X_train, X_val, y_train, y_val = train_test_split(dtf['clean_text_profession'], dtf['categories_encode'], test_size=0.3)

## Using BERT

In [23]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

nlp = transformers.TFBertModel.from_pretrained('bert-base-uncased')

In [26]:
encoded_data_train = tokenizer.encode_plus(
    X_train,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.encode_plus(
    X_val,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

# input_ids_train = encoded_data_train['input_ids']
# attention_masks_train = encoded_data_train['attention_mask']
# labels_train = torch.tensor(X_train)
#
# input_ids_val = encoded_data_val['input_ids']
# attention_masks_val = encoded_data_val['attention_mask']
# labels_val = torch.tensor(X_val)
#
#
# dataset_train = torch.Tensor(input_ids_train, attention_masks_train, labels_train)
# dataset_val = torch.Tensor(input_ids_val, attention_masks_val, labels_val)
#
# print(len(dataset_train), len(dataset_val))

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

## Compiling the model

In [None]:
corpus = dtf['clean_text_profession']
maxlen = 50

maxqnans = np.int((maxlen - 20) / 2)
corpus_tokenized = ["[CLS] " +
                    " ".join(
                        tokenizer.tokenize(re.sub(r'[^\w\s]+|\n', '', str(txt).lower().strip()))[:maxqnans]) + " [SEP] "
                    for txt in corpus]

## generate masks
masks = [[1] * len(txt.split(" ")) + [0] * (maxlen - len(
    txt.split(" "))) for txt in corpus_tokenized]

## padding
txt2seq = [txt + " [PAD]" * (maxlen - len(txt.split(" "))) if len(txt.split(" ")) != maxlen else txt for txt in
           corpus_tokenized]

## generate idx
idx = [tokenizer.encode(seq.split(" ")) for seq in txt2seq]

## generate segments
segments = []
for seq in txt2seq:
    temp, i = [], 0
    for token in seq.split(" "):
        temp.append(i)
        if token == "[SEP]":
            i += 1
    segments.append(temp)
## feature matrix
X_train = [np.asarray(idx, dtype='int32'),
           np.asarray(masks, dtype='int32'),
           np.asarray(segments, dtype='int32')]

i = 0
print("txt: ", dtf['clean_text_profession'].iloc[0])
print("tokenized:", [tokenizer.convert_ids_to_tokens(idx) for idx in X_train[0][i].tolist()])
print("idx: ", X_train[0][i])
print("mask: ", X_train[1][i])
print("segment: ", X_train[2][i])

In [None]:
## inputs
idx = layers.Input(50, dtype="int32", name="input_idx")
masks = layers.Input(50, dtype="int32", name="input_masks")

## pre-trained bert with config
config = transformers.DistilBertConfig(dropout=0.2,
                                       attention_dropout=0.2)
config.output_hidden_states = False
nlp = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

bert_out = nlp(idx, attention_mask=masks)[0]

## fine-tuning
x = layers.GlobalAveragePooling1D()(bert_out)
x = layers.Dense(64, activation="relu")(x)
y_out = layers.Dense(len(np.unique(y_train)), activation='softmax')(x)

## compile
model = models.Model([idx, masks], y_out)

for layer in model.layers[:3]:
    layer.trainable = False

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
## encode y
X_test = dtf_test['clean_text_profession']

dic_y_mapping = {n: label for n, label in enumerate(np.unique(y_train))}
inverse_dic = {v: k for k, v in dic_y_mapping.items()}

y_train = np.array([inverse_dic[y] for y in y_train])

# ## train
training = model.fit(x=X_train, y=y_train, batch_size=64, epochs=1, shuffle=True, verbose=1)

## test
predicted_prob = model.predict(X_test)
predicted = [dic_y_mapping[np.argmax(pred)] for pred in predicted_prob]