In [1]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf


from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import regex as re
import transformers
from keras import backend as K




from collections import Counter
tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", tf.config.list_physical_devices('GPU'))

##tensorflow-2.16.1	cudnn 8.9 cuda12.3
data=pd.read_csv('dataset_mbti/mbti_1.csv')
data.head()


Num GPUs Available:  []


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [4]:
def clean_text(data):
    data_length=[]
    lemmatizer=WordNetLemmatizer()
    cleaned_text=[]
    for sentence in tqdm(data.posts):
        sentence=sentence.lower()
        
        #removing links from text data
        sentence=re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+',' ',sentence)
    
        #removing other symbols
        sentence=re.sub('[^0-9a-z]',' ',sentence)
    
        
        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text

In [5]:
data.posts = clean_text(data)


  0%|          | 0/8675 [00:00<?, ?it/s]

In [6]:
#balance the dataset
print(data['type'].value_counts())
min_samples, max_samples = 200, 1000

# pre-processing (up-sampling, down-sampling, shuffle)

def resample_data(data, min_samples, max_samples):

    resampled_data = []
    for label, group in data.groupby('type'):
        n_samples = len(group)
        if n_samples < min_samples:

            resampled_group = group.sample(min_samples, replace=True, random_state=42)
        elif n_samples > max_samples:
            
            resampled_group = group.sample(max_samples, random_state=42)
        else:
            
            resampled_group = group
        resampled_data.append(resampled_group)

    
    balanced_df = pd.concat(resampled_data)

    shuffled_df = balanced_df.sample(frac=1).reset_index(drop=True)


    print(shuffled_df['type'].value_counts())

    return shuffled_df

data = resample_data(data, min_samples, max_samples)

print(data.head())


type
INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: count, dtype: int64
type
INFJ    1000
INTP    1000
INTJ    1000
INFP    1000
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ESTP     200
ESFJ     200
ESTJ     200
ISFJ     200
ESFP     200
ENFJ     200
Name: count, dtype: int64
   type                                              posts
0  INFJ  1  would you want to live forever  why why not...
1  ESTP   the description of se  i am not that aware of...
2  INTP   for a few years i believed i was all the awes...
3  INTJ  if you care about them  you should try and rep...
4  ENTP   ah   but i am the fifth element and i have th...


In [7]:
length_col = []
for i in range(len(data)):
    length_col.append(len(data['posts'][i]))
mean_len = np.mean(np.array(length_col))
print(length_col)
print(mean_len)

[9353, 4829, 9306, 5118, 5269, 8891, 3642, 4840, 4305, 9135, 3296, 8132, 5320, 9198, 6043, 8848, 6560, 6783, 6400, 6937, 6392, 7662, 7632, 3424, 7646, 9154, 6675, 6923, 8162, 5956, 9066, 8438, 7468, 9192, 7514, 7952, 7654, 5295, 6636, 5161, 8051, 5792, 7453, 6516, 7538, 3303, 6416, 2833, 6369, 7733, 7457, 4399, 5332, 2095, 9133, 7703, 4004, 7753, 8871, 7991, 4341, 8762, 8887, 9299, 6403, 9382, 9142, 7228, 4914, 5931, 7206, 9405, 6282, 6919, 5800, 9935, 6251, 7451, 7023, 5579, 7786, 7296, 4880, 8383, 7892, 6416, 8968, 4532, 4051, 5386, 8165, 5612, 9440, 7423, 3838, 7332, 6686, 8753, 6020, 7352, 7709, 9192, 7502, 6158, 6729, 8621, 7757, 7311, 3490, 8926, 9198, 6312, 7805, 7986, 3862, 7271, 9308, 4915, 8659, 7304, 4814, 7694, 2082, 7150, 8298, 5684, 8442, 7865, 7677, 5575, 8494, 7033, 6048, 6568, 6811, 3238, 8113, 6912, 8096, 6853, 8112, 8555, 5672, 7716, 6942, 8698, 2303, 8353, 8438, 8913, 6550, 5408, 7968, 4828, 8604, 5722, 6847, 4009, 5700, 6320, 7544, 8745, 8157, 5179, 8135, 5348, 820

In [8]:
from sklearn.model_selection import train_test_split

posts = data['posts'].values
labels =  data['type'].values
train_data, test_data, train_labels, test_labels = train_test_split(posts, labels, random_state=0, test_size=0.2)

print(len(train_data))
print(len(test_data))

6083
1521


In [9]:
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

bert_model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True, add_special_tokens=True, return_tensors="tf")
MAX_LEN = 512
BATCH_SIZE = 128

def tokenize_sentences(sentences, tokenizer, max_seq_len = 512):
    tokenized_sentences = []

    for sentence in tqdm(sentences):
        tokenized_sentence = tokenizer.encode(
                            sentence,                  # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_seq_len,  # Truncate all sentences.
                            truncation=True
                    )
        
        tokenized_sentences.append(tokenized_sentence)
        
    return tokenized_sentences


types = np.unique(data.type.values)
print(types)
def get_type_index(str_array):
    return_list = []

    for i in range(len(str_array)):
        return_list.append(list(types).index(str_array[i]))

    return return_list


train_labels_categorized = get_type_index(train_labels)





def tokenize_sentences_chunking(sentences, labels, tokenizer, max_seq_len = 512, overlap = 64):
    tokenized_sentences = []
    new_labels = []
    for i, sentence in enumerate(tqdm(sentences)):
        tokenized_sentence = tokenizer.encode(
                            sentence,                  # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            
                    )
        chunk_size = max_seq_len - overlap
        chunks = [tokenized_sentence[m:m + max_seq_len] for m in range(0, len(tokenized_sentence), chunk_size)]
        overlap_labels = [labels[i] for _ in range(0, len(tokenized_sentence), chunk_size) ]
        
        tokenized_sentences += chunks
        new_labels += overlap_labels

        
    return tokenized_sentences, new_labels


train_labels_categorized = get_type_index(train_labels)
print(Counter(train_labels_categorized))




['ENFJ' 'ENFP' 'ENTJ' 'ENTP' 'ESFJ' 'ESFP' 'ESTJ' 'ESTP' 'INFJ' 'INFP'
 'INTJ' 'INTP' 'ISFJ' 'ISFP' 'ISTJ' 'ISTP']
Counter({10: 817, 8: 810, 11: 793, 9: 781, 1: 556, 3: 555, 15: 269, 13: 201, 2: 176, 7: 166, 14: 165, 4: 164, 6: 160, 0: 160, 12: 158, 5: 152})


In [10]:
from collections import Counter

print(len(train_labels))
print(Counter(train_labels))

6083
Counter({'INTJ': 817, 'INFJ': 810, 'INTP': 793, 'INFP': 781, 'ENFP': 556, 'ENTP': 555, 'ISTP': 269, 'ISFP': 201, 'ENTJ': 176, 'ESTP': 166, 'ISTJ': 165, 'ESFJ': 164, 'ESTJ': 160, 'ENFJ': 160, 'ISFJ': 158, 'ESFP': 152})


In [11]:
# chunking

train_input_ids, train_labels = tokenize_sentences_chunking(train_data,train_labels, tokenizer, MAX_LEN, overlap = 64)
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
train_labels_categorized = get_type_index(train_labels)
one_hot_train_labels = tf.keras.utils.to_categorical(train_labels_categorized, num_classes=16)
one_hot_train_labels = tf.cast(one_hot_train_labels, dtype=tf.float32)

train_dataset = tf.data.Dataset.from_tensor_slices((train_input_ids, one_hot_train_labels))
train_dataset = train_dataset.batch(BATCH_SIZE)





test_input_ids, test_labels = tokenize_sentences_chunking(test_data,test_labels, tokenizer, MAX_LEN, overlap = 64)
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
test_labels_categorized = get_type_index(test_labels)
one_hot_test_labels = tf.keras.utils.to_categorical(test_labels_categorized, num_classes=16)
one_hot_test_labels = tf.cast(one_hot_test_labels, dtype=tf.float32)


test_dataset = tf.data.Dataset.from_tensor_slices((test_input_ids, one_hot_test_labels))
test_dataset = test_dataset.batch(BATCH_SIZE)

  0%|          | 0/6083 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1744 > 512). Running this sequence through the model will result in indexing errors
2024-04-22 10:26:35.376512: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 43604 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:34:00.0, compute capability: 8.6


  0%|          | 0/1521 [00:00<?, ?it/s]

In [12]:
# # truncating 
# one_hot_train_labels = tf.keras.utils.to_categorical(train_labels_categorized, num_classes=16)
# one_hot_train_labels = tf.cast(one_hot_train_labels, dtype=tf.float32)


# train_input_ids = tokenize_sentences(train_data, tokenizer, MAX_LEN)
# train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
# train_dataset = tf.data.Dataset.from_tensor_slices((train_input_ids, one_hot_train_labels))
# train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(10000)

# one_hot_test_labels = tf.keras.utils.to_categorical(test_labels_categorized, num_classes=16)
# one_hot_test_labels = tf.cast(one_hot_test_labels, dtype=tf.float32)
# test_input_ids = tokenize_sentences(test_data, tokenizer, MAX_LEN)
# test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
# test_dataset = tf.data.Dataset.from_tensor_slices((test_input_ids, one_hot_test_labels))
# test_dataset = test_dataset.batch(BATCH_SIZE).shuffle(10000)


In [13]:

print(len(train_input_ids))


22148


In [14]:
print(one_hot_train_labels[1])

tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.], shape=(16,), dtype=float32)


In [15]:
print(train_input_ids)
print(train_dataset)

[[ 101 1996 2364 ... 2128 3080 1998]
 [6963 3892 1998 ... 2242 5905 2021]
 [1999 2026 2166 ... 2428 6235 2033]
 ...
 [4553 2011 3666 ...    0    0    0]
 [ 101 2052 2017 ... 2070 2307 5988]
 [2005 2202 2048 ...    0    0    0]]
<_BatchDataset element_spec=(TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), TensorSpec(shape=(None, 16), dtype=tf.float32, name=None))>


In [16]:
from transformers import TFBertModel

from tensorflow.keras.layers import Dense, Flatten

class BertClassifier_mbti(tf.keras.Model):    
     def __init__(self, bert: TFBertModel, num_classes: int):
          super().__init__()
          self.bert = bert
          
          self.classifier = Dense(num_classes, activation='softmax')
          
          for layer in self.bert.layers:
               layer.trainable = False
       
     def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
          outputs = self.bert(input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              position_ids=position_ids,
                              head_mask=head_mask)
          bert_output = outputs[0]#embedding # [:,0,:]
          #print(bert_output.shape)
          #print(bert_output.shape[0])
          bert_output2 = tf.reshape(bert_output,[tf.shape(bert_output)[0], 512*1024])
          #print(bert_output2.shape)
          output = self.classifier(bert_output2)
          #print(output.shape)
          
          return output
        
        


In [17]:
#import tensorflow_addons as tfa
bert_layer = transformers.TFBertModel.from_pretrained('bert-large-uncased')
for layer in bert_layer.layers:
    layer.trainable = False
learning_rate = 2e-5
model = BertClassifier_mbti(bert_layer, 16)
model.compile(loss='categorical_crossentropy', 
              optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), 
              metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tf.keras.metrics.F1Score()]
                              )
            
    


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [18]:
import wandb
EPOCHES = 20
wandb.login()
wandb.init(project="EECS595-project-mbti", entity="davidmaz")

[34m[1mwandb[0m: Currently logged in as: [33mdavidmaz[0m ([33mdavidmazteam[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Currently logged in as: [33mdavidmaz[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [21]:

from tensorflow.keras.models import load_model
from wandb.keras import WandbMetricsLogger





#model = load_model('BERT_model.keras')  # 加载模型
#tf.config.optimizer.set_experimental_options({"disable_meta_optimizer": True})
model.fit((train_dataset), 
           
          verbose = 1, #progress report
          epochs = EPOCHES, 
          callbacks=[wandb.keras.WandbMetricsLogger()]
          )

Epoch 1/20


I0000 00:00:1713796341.493300  602126 service.cc:145] XLA service 0x14750c0169d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1713796341.493329  602126 service.cc:153]   StreamExecutor device (0): NVIDIA A40, Compute Capability 8.6
2024-04-22 10:32:24.128184: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
W0000 00:00:1713796347.619683  602126 assert_op.cc:38] Ignoring Assert operator bert_classifier_mbti_1/tf_bert_model/bert/embeddings/assert_less/Assert/Assert
2024-04-22 10:32:30.518099: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907
2024-04-22 10:32:31.851844: W external/local_xla/xla/service/gpu/triton_autotuner.cc:660] Compiling 47 configs for 2 fusions on a single thread.
I0000 00:00:1713796368.346775  602126 device_compiler.h:188] Compiled cluster using XLA!  This line is logg

[1m173/174[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m1s[0m 2s/step - accuracy: 0.1387 - f1_score: 0.1205 - loss: 2.9768 - precision: 0.2450 - recall: 0.0213

W0000 00:00:1713796653.549492  602126 assert_op.cc:38] Ignoring Assert operator bert_classifier_mbti_1/tf_bert_model/bert/embeddings/assert_less/Assert/Assert
2024-04-22 10:37:35.378198: W external/local_xla/xla/service/gpu/triton_autotuner.cc:660] Compiling 42 configs for 2 fusions on a single thread.


[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.1389 - f1_score: 0.1209 - loss: 2.9755 - precision: 0.2456 - recall: 0.0214

[34m[1mwandb[0m: [32m[41mERROR[0m Unable to log learning rate.


[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m348s[0m 2s/step - accuracy: 0.1391 - f1_score: 0.1212 - loss: 2.9742 - precision: 0.2462 - recall: 0.0215
Epoch 2/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 2s/step - accuracy: 0.4259 - f1_score: 0.4795 - loss: 1.7942 - precision: 0.7799 - recall: 0.1556
Epoch 3/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 2s/step - accuracy: 0.6415 - f1_score: 0.6991 - loss: 1.2408 - precision: 0.9375 - recall: 0.3204
Epoch 4/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 2s/step - accuracy: 0.7265 - f1_score: 0.7925 - loss: 0.9835 - precision: 0.9491 - recall: 0.4707
Epoch 5/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 2s/step - accuracy: 0.7316 - f1_score: 0.8119 - loss: 0.8965 - precision: 0.9129 - recall: 0.5496
Epoch 6/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 2s/step - accuracy: 0.7893 - f1_score: 0.8546 -

<keras.src.callbacks.history.History at 0x14752cc18ed0>

In [None]:
model.save('BERT_model.keras') 


In [None]:
loss, accuracy, f1_score, precision, recall = model.evaluate(test_dataset, verbose = 1)

In [None]:

result_dict = {
    'loss' : loss,
    'accuracy' : accuracy,
    'f1_score' : f1_score,
    'precision' : precision,
    'recall' : recall
}
df = pd.DataFrame(result_dict)
print(df)
df.to_csv('result.csv', index=False)
#df.head()  # >:DataFrame