In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any cell visible in this notebook;
# the fine-tuning run below is driven by the separate `seeds` list. Confirm it
# is used elsewhere (e.g. by a %run script) before relying on it.
seed = 2023

In [2]:
import torch

# Choose the compute device in order of preference: CUDA GPU, then Apple MPS,
# and finally the CPU as the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device

'cuda'

In [3]:
import transformers

# Only report critical errors to avoid excessive logging.
# 50 is the numeric value of Python's logging.CRITICAL level.
transformers.utils.logging.set_verbosity(50)

In [4]:
from nlpsig_networks.scripts.fine_tune_bert_classification import (
    fine_tune_transformer_average_seed,
)

In [5]:
# Directory that collects the result files written by this notebook
# (e.g. the CSV produced by the baseline fine-tuning run).
output_dir = "client_talk_type_output"
# exist_ok=True keeps the cell idempotent on re-runs and avoids the race
# between a separate "does it exist?" check and the directory creation.
os.makedirs(output_dir, exist_ok=True)

## AnnoMI

In [6]:
# Execute the AnnoMI loading script inside this notebook's namespace.
# NOTE(review): `anno_mi` and `client_index`, used by later cells, are not
# defined anywhere in this notebook, so they are presumably created by this
# script — confirm against ../load_anno_mi.py.
%run ../load_anno_mi.py

In [7]:
# Preview the first rows of the AnnoMI dataframe as a sanity check of the load.
anno_mi.head()

Unnamed: 0,mi_quality,transcript_id,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type,datetime
0,high,0,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,,2023-08-02 00:00:13
1,high,0,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral,2023-08-02 00:00:24
2,high,0,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,,2023-08-02 00:00:25
3,high,0,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral,2023-08-02 00:00:34
4,high,0,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,,2023-08-02 00:00:34


In [8]:
# Load the stored embeddings (SBERT, judging by the file name) from a local
# pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
with open("../anno_mi_sbert.pkl", "rb") as f:
    sbert_embeddings = pickle.load(f)
    
# Show the shape of the loaded object as the cell output.
sbert_embeddings.shape

(13551, 384)

# Baseline: Fine-tune BERT for classification

In [9]:
# Settings for the baseline fine-tuning run in the next cell.
num_epochs = 10  # forwarded as `num_epochs`; each training log below lists 10 epochs
seeds = [0, 1, 12, 123, 1234]  # forwarded as `seeds`; the results table has one row per seed
validation_metric = "f1"  # forwarded as `validation_metric` (presumably drives model selection — confirm)

In [10]:
# Baseline: fine-tune bert-base-uncased on the utterance text to predict the
# client talk type, repeated for every seed in `seeds` with k-fold evaluation
# enabled. The per-seed results are returned and also written to a CSV file
# inside `output_dir`.
bert_classifier = fine_tune_transformer_average_seed(
    num_epochs=num_epochs,
    pretrained_model_name="bert-base-uncased",
    df=anno_mi,
    feature_name="utterance_text",
    label_column="client_talk_type",
    seeds=seeds,
    path_indices=client_index,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/bert_classifier.csv",
    device=device,
    verbose=False,
)

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.775363,0.656532,0.421349
2,No log,0.71916,0.69482,0.586459
3,No log,0.731823,0.71509,0.596493
4,No log,0.799211,0.699887,0.605044
5,No log,0.872612,0.702703,0.582237
6,No log,0.879425,0.703266,0.593894
7,No log,0.968623,0.692005,0.592446
8,No log,0.997447,0.70045,0.6055
9,0.443300,1.03884,0.688626,0.608136
10,0.443300,1.027142,0.697635,0.602403


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.769066,0.685248,0.532896
2,No log,0.710111,0.713401,0.570495
3,No log,0.704305,0.705518,0.600574
4,No log,0.76139,0.714527,0.621354
5,No log,0.876779,0.711149,0.611787
6,No log,0.943892,0.70777,0.598088
7,No log,0.913352,0.698761,0.609114
8,No log,1.01066,0.703829,0.618337
9,0.452300,1.01164,0.706644,0.619281
10,0.452300,1.015519,0.705518,0.618907


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.756778,0.699887,0.500651
2,No log,0.72197,0.699887,0.546091
3,No log,0.745412,0.699324,0.608836
4,No log,0.824641,0.692568,0.597947
5,No log,0.884463,0.692568,0.594893
6,No log,0.918904,0.708333,0.603534
7,No log,0.963019,0.690315,0.593202
8,No log,1.029751,0.701014,0.58988
9,0.442700,1.048007,0.686374,0.593203
10,0.442700,1.06348,0.695946,0.58875


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.787482,0.66723,0.428368
2,No log,0.731394,0.70045,0.567022
3,No log,0.715754,0.712838,0.596984
4,No log,0.793788,0.696509,0.602636
5,No log,0.836586,0.703829,0.607234
6,No log,0.843726,0.711712,0.6165
7,No log,0.907098,0.699887,0.60552
8,No log,0.937648,0.707207,0.615101
9,0.472700,0.948614,0.712275,0.618885
10,0.472700,0.985072,0.70777,0.619118


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.759122,0.686937,0.582394
2,No log,0.730432,0.69482,0.538995
3,No log,0.782653,0.692568,0.596732
4,No log,0.834452,0.682432,0.591261
5,No log,0.895496,0.697072,0.583734
6,No log,1.010006,0.67286,0.606064
7,No log,1.004749,0.692005,0.593567
8,No log,1.06658,0.698761,0.598676
9,0.412900,1.103129,0.690315,0.592686
10,0.412900,1.11872,0.690878,0.592919


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.740329,0.693131,0.435579
2,No log,0.74286,0.685248,0.506492
3,No log,0.764579,0.699887,0.583549
4,No log,0.810213,0.710023,0.610313
5,No log,0.841512,0.693131,0.598619
6,No log,0.873862,0.70045,0.598939
7,No log,0.937005,0.70214,0.591669
8,No log,0.995435,0.685811,0.590647
9,0.426200,0.991587,0.701577,0.592223
10,0.426200,1.009651,0.697072,0.594338


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.757511,0.681869,0.409786
2,No log,0.709269,0.713401,0.570042
3,No log,0.742747,0.699887,0.588769
4,No log,0.767454,0.710586,0.613642
5,No log,0.858857,0.697072,0.605366
6,No log,0.90099,0.701577,0.608931
7,No log,0.946663,0.712838,0.613649
8,No log,0.961696,0.712838,0.615944
9,0.448600,0.962769,0.707207,0.60927
10,0.448600,0.984778,0.703266,0.607582


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.76935,0.670045,0.52882
2,No log,0.723611,0.698761,0.557147
3,No log,0.799477,0.665541,0.582877
4,No log,0.862904,0.696509,0.57025
5,No log,0.974476,0.699887,0.559411
6,No log,1.01022,0.683559,0.596311
7,No log,1.01269,0.68018,0.58336
8,No log,1.050731,0.696509,0.592488
9,0.427200,1.05423,0.691441,0.590899
10,0.427200,1.069314,0.688626,0.586907


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.736895,0.698761,0.47437
2,No log,0.737789,0.691441,0.591658
3,No log,0.730409,0.713401,0.5761
4,No log,0.804817,0.718468,0.592758
5,No log,0.868141,0.712275,0.603622
6,No log,0.893193,0.702703,0.599522
7,No log,0.904804,0.708896,0.589638
8,No log,0.944657,0.708896,0.606202
9,0.435600,0.978415,0.701014,0.597278
10,0.435600,0.990704,0.701577,0.602691


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.770141,0.688626,0.539377
2,No log,0.695116,0.708896,0.59438
3,No log,0.759339,0.698198,0.608438
4,No log,0.81885,0.697072,0.606927
5,No log,0.882175,0.696509,0.609622
6,No log,0.967133,0.682432,0.59883
7,No log,0.94688,0.695946,0.598556
8,No log,0.991696,0.69482,0.58506
9,0.410300,1.040671,0.690315,0.60347
10,0.410300,1.04085,0.697072,0.594121


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.743538,0.681869,0.496712
2,No log,0.726642,0.703829,0.58988
3,No log,0.753264,0.70214,0.587072
4,No log,0.8132,0.704955,0.583481
5,No log,0.875303,0.686374,0.57354
6,No log,0.938966,0.685811,0.610041
7,No log,0.993033,0.684122,0.602081
8,No log,0.979818,0.703266,0.581299
9,0.419100,1.012189,0.693694,0.576918
10,0.419100,1.015958,0.690315,0.580583


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.72663,0.698761,0.580958
2,No log,0.715743,0.708333,0.590462
3,No log,0.752106,0.695946,0.608879
4,No log,0.778851,0.721284,0.640899
5,No log,0.814438,0.703266,0.620043
6,No log,0.87113,0.706081,0.617554
7,No log,0.917929,0.711149,0.6278
8,No log,0.947143,0.711149,0.620356
9,0.431800,0.990516,0.710586,0.626613
10,0.431800,0.998498,0.708896,0.619119


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.718995,0.712838,0.573943
2,No log,0.719907,0.701577,0.595042
3,No log,0.854721,0.675113,0.582165
4,No log,0.856177,0.683559,0.59042
5,No log,0.946718,0.697635,0.602062
6,No log,1.018947,0.689189,0.605136
7,No log,0.978791,0.697072,0.587949
8,No log,1.016293,0.692568,0.580674
9,0.413100,1.081566,0.689189,0.580798
10,0.413100,1.104197,0.686937,0.594925


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.736983,0.697635,0.504376
2,No log,0.700938,0.710023,0.602174
3,No log,0.748581,0.699887,0.601193
4,No log,0.787729,0.710586,0.61742
5,No log,0.837268,0.708333,0.623392
6,No log,0.891511,0.692005,0.604812
7,No log,0.917561,0.708896,0.61777
8,No log,0.932074,0.708896,0.614031
9,0.436200,0.968174,0.713964,0.616168
10,0.436200,0.968737,0.709459,0.619601


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.749528,0.673986,0.471999
2,No log,0.711761,0.690315,0.550286
3,No log,0.777733,0.668919,0.599717
4,No log,0.81666,0.689189,0.605581
5,No log,0.88867,0.70214,0.585699
6,No log,0.918782,0.684685,0.601086
7,No log,0.948034,0.682432,0.582646
8,No log,1.004969,0.684122,0.593978
9,0.426600,1.048277,0.686937,0.597479
10,0.426600,1.048054,0.684685,0.586674


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.754903,0.686374,0.469373
2,No log,0.72215,0.697635,0.580185
3,No log,0.767563,0.685248,0.593325
4,No log,0.814738,0.695946,0.596067
5,No log,0.90516,0.690878,0.601724
6,No log,0.920465,0.698198,0.592763
7,No log,1.008068,0.692568,0.592122
8,No log,1.031456,0.676239,0.57203
9,0.421800,1.037871,0.688063,0.582265
10,0.421800,1.058528,0.694257,0.594247


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.791109,0.643581,0.317704
2,No log,0.691052,0.713401,0.580552
3,No log,0.734396,0.711712,0.584554
4,No log,0.745982,0.716216,0.617777
5,No log,0.807207,0.720158,0.627834
6,No log,0.86341,0.714527,0.590913
7,No log,0.910854,0.705518,0.60876
8,No log,0.937369,0.710586,0.620809
9,0.456600,0.952799,0.713401,0.618352
10,0.456600,0.957036,0.716779,0.615919


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.762987,0.688063,0.444394
2,No log,0.716153,0.708896,0.589023
3,No log,0.749961,0.709459,0.614794
4,No log,0.818435,0.692568,0.594665
5,No log,0.950194,0.692005,0.593154
6,No log,0.947329,0.710586,0.592554
7,No log,0.999646,0.68018,0.601485
8,No log,1.0236,0.682995,0.586067
9,0.433300,1.060176,0.680743,0.587018
10,0.433300,1.060741,0.684122,0.584174


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.722781,0.711712,0.575467
2,No log,0.706161,0.709459,0.578425
3,No log,0.740186,0.707207,0.603021
4,No log,0.793758,0.713401,0.622976
5,No log,0.889282,0.695946,0.612584
6,No log,0.965772,0.699324,0.624084
7,No log,0.970942,0.701577,0.618498
8,No log,0.994436,0.711149,0.630058
9,0.420500,1.028079,0.694257,0.613309
10,0.420500,1.024684,0.70777,0.621044


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.767327,0.66723,0.397343
2,No log,0.716282,0.710023,0.592388
3,No log,0.736326,0.70214,0.610028
4,No log,0.830765,0.682995,0.595979
5,No log,0.861066,0.698198,0.603863
6,No log,0.922712,0.698198,0.586854
7,No log,1.065242,0.676239,0.594256
8,No log,1.01468,0.694257,0.601812
9,0.434600,1.049308,0.682995,0.595255
10,0.434600,1.052345,0.698198,0.598212


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.757002,0.692005,0.517857
2,No log,0.724384,0.70214,0.608644
3,No log,0.750733,0.708896,0.603091
4,No log,0.808361,0.708896,0.590647
5,No log,0.875213,0.677928,0.590259
6,No log,0.894116,0.706081,0.600041
7,No log,0.947104,0.69482,0.60879
8,No log,1.007923,0.691441,0.612193
9,0.409800,0.984197,0.706081,0.602364
10,0.409800,0.992513,0.70045,0.603845


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.724343,0.694257,0.578455
2,No log,0.704447,0.698198,0.616653
3,No log,0.73178,0.709459,0.619101
4,No log,0.81972,0.699887,0.577535
5,No log,0.873047,0.698761,0.591042
6,No log,0.892444,0.682995,0.594944
7,No log,0.936287,0.697072,0.607393
8,No log,0.976085,0.697072,0.604803
9,0.429900,1.012991,0.70214,0.608791
10,0.429900,1.025176,0.696509,0.607438


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.744617,0.713401,0.502803
2,No log,0.739818,0.695383,0.601973
3,No log,0.77975,0.678491,0.587582
4,No log,0.853813,0.6875,0.589321
5,No log,0.883214,0.690878,0.581103
6,No log,0.899428,0.679054,0.591749
7,No log,0.939498,0.693131,0.571699
8,No log,1.003025,0.694257,0.590511
9,0.416900,1.021041,0.689752,0.590724
10,0.416900,1.027541,0.683559,0.582721


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.717546,0.706644,0.573562
2,No log,0.715725,0.698198,0.595114
3,No log,0.757315,0.704392,0.610195
4,No log,0.8109,0.711712,0.602509
5,No log,0.812897,0.725225,0.620419
6,No log,0.88183,0.708333,0.615135
7,No log,0.920531,0.703266,0.601994
8,No log,0.941658,0.698198,0.611944
9,0.422400,0.965929,0.699887,0.612588
10,0.422400,0.968127,0.707207,0.618155


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id
[INFO] Setting up TrainingArguments object and saving to `.training_args`.
[INFO] Setting up Trainer object, and saving to `.trainer`.
[INFO] Training model with 109484547 parameters...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.737274,0.695383,0.57214
2,No log,0.72323,0.699324,0.590745
3,No log,0.774364,0.701577,0.594028
4,No log,0.852137,0.705518,0.590349
5,No log,0.889373,0.693694,0.590141
6,No log,0.909303,0.688063,0.570859
7,No log,0.95989,0.690315,0.586734
8,No log,0.971887,0.692005,0.583439
9,0.407900,0.993152,0.689752,0.591994
10,0.407900,1.006553,0.695946,0.592283


[INFO] Training completed!


  0%|          | 0/1345 [00:00<?, ?it/s]

saving the results dataframe to CSV in client_talk_type_output/bert_classifier.csv


In [11]:
# Display the results table returned by the baseline run (one row per seed).
bert_classifier

Unnamed: 0,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,seed,k_fold
0,0.701413,0.576948,"[0.8060861120103594, 0.48940998487140697, 0.43...",0.627257,"[0.7396039603960396, 0.6629098360655737, 0.479...",0.557467,"[0.8857007351197533, 0.38788968824940045, 0.39...",0,True
0,0.700372,0.555906,"[0.8034334763948497, 0.5197934595524957, 0.344...",0.630695,"[0.7336860670194003, 0.6103476151980598, 0.548...",0.530554,"[0.8878349537585961, 0.4526378896882494, 0.251...",1,True
0,0.704535,0.582494,"[0.8051948051948052, 0.5374241402562374, 0.404...",0.624158,"[0.7512836311357568, 0.6140215716486903, 0.507...",0.560721,"[0.8674413089874318, 0.47781774580335734, 0.33...",12,True
0,0.710483,0.590863,"[0.8100498158977693, 0.5188437614343214, 0.443...",0.640952,"[0.7454654175802272, 0.6657276995305165, 0.511...",0.567871,"[0.8868864121413327, 0.4250599520383693, 0.391...",123,True
0,0.696952,0.604289,"[0.7971882922332334, 0.5374659400544959, 0.478...",0.612796,"[0.7753866845998655, 0.6222397476340694, 0.440...",0.605297,"[0.8202513635285749, 0.47302158273381295, 0.52...",1234,True


In [12]:
# Overall F1, averaged over the seeds.
bert_classifier["f1"].mean()

0.5821000015826902

In [13]:
# Overall precision, averaged over the seeds.
bert_classifier["precision"].mean()

0.6271717229743248

In [14]:
# Overall recall, averaged over the seeds.
bert_classifier["recall"].mean()

0.5643821405492904

In [15]:
# Per-class F1: stack the per-seed score vectors into one 2-D array and
# average across seeds (axis 0), leaving one value per class.
np.mean(np.stack(bert_classifier["f1_scores"]), axis=0)

array([0.8043905 , 0.52058746, 0.42132205])

In [16]:
# Per-class precision: stack the per-seed score vectors into one 2-D array and
# average across seeds (axis 0), leaving one value per class.
np.mean(np.stack(bert_classifier["precision_scores"]), axis=0)

array([0.74908515, 0.63504929, 0.49738072])

In [17]:
# Per-class recall: stack the per-seed score vectors into one 2-D array and
# average across seeds (axis 0), leaving one value per class.
np.mean(np.stack(bert_classifier["recall_scores"]), axis=0)

array([0.86962295, 0.44328537, 0.3802381 ])