In [None]:
# Install Simple Transformers with the %pip magic so that the package lands in
# the environment of the running kernel (a `!pip` shell call may target another one).
%pip install simpletransformers

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt

from scipy.special import softmax

from simpletransformers.classification import ClassificationModel, ClassificationArgs

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight


In [None]:
# Language subset to train and evaluate on: 'German', 'Italian', 'All', etc.
# The value is compared against the 'language' column of the train/test CSVs;
# 'All' disables that filter so every row is kept.
language = 'German'

In [None]:
# Are you using Kaggle or another GPU?
# This flag is forwarded as `use_cuda` when the ClassificationModel is built.
# NOTE(review): presumably it must be False on a machine without CUDA - verify.
gpu_available = True

In [None]:
# Show floats without decimals so that tweet and user ids are not rendered in
# scientific notation. The option is global: it affects every float pandas displays.
pd.set_option('display.float_format', '{:.0f}'.format)

In [None]:
# Load the training split; the first CSV column becomes the index.
train = pd.read_csv('/kaggle/input/twisty-samples/twisty_train.csv', index_col=0)
if language != 'All':
    # Keep only the rows of the selected language. The explicit copy makes
    # `train` an independent frame, so adding columns to it afterwards does
    # not raise SettingWithCopyWarning.
    train = train[train['language'] == language].copy()
# Preview a few rows instead of dumping the whole frame.
train.head()

In [None]:
# Load the test split; the first CSV column becomes the index.
test = pd.read_csv('/kaggle/input/twisty-samples/twisty_test.csv', index_col=0)
if language != 'All':
    # Keep only the rows of the selected language. The explicit copy makes
    # `test` an independent frame, so adding columns to it afterwards does
    # not raise SettingWithCopyWarning.
    test = test[test['language'] == language].copy()
# Preview a few rows instead of dumping the whole frame.
test.head()

# Format Dataframe for Simple Transformers

Simple Transformers expects the training and testing data as DataFrames with two columns: `text` and `labels`. `text` holds the tweet text, and `labels` holds the integer encoding of the corresponding MBTI class.

In [None]:
# MBTI type -> integer class label, numbered in listing order (ISTJ=0 ... ENTJ=15).
mbti_num_encoding = {
    mbti_type: class_id
    for class_id, mbti_type in enumerate([
        'ISTJ', 'ISFJ', 'INFJ', 'INTJ',
        'ISTP', 'ISFP', 'INFP', 'INTP',
        'ESTP', 'ESFP', 'ENFP', 'ENTP',
        'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ',
    ])
}

In [None]:
# Add a 'labels' column to both splits: the integer class id of each row's
# 'mbti' value. An MBTI string missing from mbti_num_encoding raises KeyError.
for frame in (train, test):
    frame['labels'] = frame['mbti'].apply(lambda mbti_type: mbti_num_encoding[mbti_type])

In [None]:
# Reduce both splits to the two columns used for training and evaluation,
# renaming the tweet column to 'text'.
# NOTE: this rebinds `train`/`test` without the 'mbti' column, so the label
# encoding cell cannot be re-run afterwards without reloading the data.
kept_columns = ['twitter_text', 'labels']
new_names = {'twitter_text':'text'}
train = train.loc[:, kept_columns].rename(columns=new_names)
test = test.loc[:, kept_columns].rename(columns=new_names)

In [None]:
# Share of each class label in the training split.
# value_counts(normalize=True) returns fractions in [0, 1]; they are scaled by
# 100 so that the values agree with the 'Percentage' axis label.
ax = train['labels'].value_counts(normalize=True).mul(100).plot.barh()

ax.set_title('Distribution of MBTI Train')
ax.set_xlabel('Percentage')
ax.set_ylabel('MBTI');

In [None]:
# Share of each class label in the test split.
# value_counts(normalize=True) returns fractions in [0, 1]; they are scaled by
# 100 so that the values agree with the 'Percentage' axis label.
ax = test['labels'].value_counts(normalize=True).mul(100).plot.barh()

ax.set_title('Distribution of MBTI Test')
ax.set_xlabel('Percentage')
ax.set_ylabel('MBTI');

# **BERT Model**

In [None]:
# 'balanced' class weights for the 16 label ids, computed from the training
# labels. The array is later passed as `weight` to the ClassificationModel.
weights_array = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(16),
    y=train['labels'],
)
weights_array

In [None]:

def BERT_model(train_df, epochs, bert_model="bert-base-german-cased"):
    """Build a 16-label Simple Transformers BERT classifier and train it.

    Parameters
    ----------
    train_df :
        Data handed to ``train_model``; the notebook passes a DataFrame with
        'text' and 'labels' columns.
    epochs :
        Value used for ``num_train_epochs``.
    bert_model :
        Model name passed to ``ClassificationModel`` (defaults to
        "bert-base-german-cased").

    Returns
    -------
    The ``ClassificationModel`` instance after ``train_model`` has been called.

    Notes
    -----
    Reads two notebook-level variables: ``gpu_available`` (forwarded as
    ``use_cuda``) and ``weights_array`` (forwarded as ``weight``). Both must be
    defined before this function is called.
    """
    training_args = ClassificationArgs(
        num_train_epochs=epochs,
        learning_rate=1e-4,
        # Twitter sequence length is less than 64 tokens
        # https://peltarion.com/knowledge-center/documentation/cheat-sheets/bert---text-classification-/-cheat-sheet
        # NOTE(review): only the batch size is set here; the maximum sequence
        # length is left at the library default - confirm that is intended.
        train_batch_size=64,
        overwrite_output_dir=True,
    )

    classifier = ClassificationModel(
        'bert',
        bert_model,
        num_labels=16,
        weight=list(weights_array),
        use_cuda=gpu_available,  # Set to true if using kaggle GPU
        args=training_args,
    )

    classifier.train_model(train_df)
    return classifier

**Multilingual: "bert-base-multilingual-cased"**

**German (DE): "bert-base-german-cased" https://huggingface.co/bert-base-german-cased**

**Spanish (ES): "dccuchile/bert-base-spanish-wwm-cased" https://huggingface.co/dccuchile/bert-base-spanish-wwm-cased**

**Italian (IT): "dbmdz/bert-base-italian-cased" https://huggingface.co/dbmdz/bert-base-italian-cased**

**Dutch (NL): "GroNLP/bert-base-dutch-cased" https://huggingface.co/GroNLP/bert-base-dutch-cased**


In [None]:
# Pretrained BERT model name for each value that `language` can take.
all_bert_models = dict(
    All="bert-base-multilingual-cased",
    German="bert-base-german-cased",
    Spanish="dccuchile/bert-base-spanish-wwm-cased",
    Italian="dbmdz/bert-base-italian-cased",
    Dutch="GroNLP/bert-base-dutch-cased",
)

In [None]:
# Train the multilingual model on the selected language subset for 5 epochs.
# To train the language-specific model instead, pass
# bert_model=all_bert_models[language].
bert = BERT_model(train_df=train, epochs=5, bert_model=all_bert_models['All'])

# Model and Error Analysis

In [None]:
def mbti_accuracy(y_true, y_pred, encoding=None):
    """Score MBTI predictions by the number of correctly predicted letters.

    Each label id is translated back to its four-letter MBTI type and the
    true and predicted types are compared letter by letter (position 1 with
    position 1, and so on).

    Parameters
    ----------
    y_true : iterable of int
        True class ids.
    y_pred : iterable of int
        Predicted class ids (already arg-maxed, not raw model outputs).
    encoding : dict, optional
        Mapping of MBTI type -> class id. Defaults to the notebook-level
        ``mbti_num_encoding``.

    Returns
    -------
    numpy.ndarray of 5 floats, each a percentage rounded to 2 decimals:
    [at least 1 letter correct, at least 2, at least 3, all 4 correct,
    average share of correct letters].

    For a single sample with true type 'ENFJ' and predicted type 'ENFP' the
    result is [100, 100, 100, 0, 75]. For 'ISTJ' vs 'ENFP' (no letter in
    common) it is [0, 0, 0, 0, 0].

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    KeyError
        If a class id is not present in ``encoding``.
    """
    if encoding is None:
        encoding = mbti_num_encoding

    # Invert the mapping explicitly instead of relying on dict ordering.
    label_to_type = {label: mbti_type for mbti_type, label in encoding.items()}
    true_types = [label_to_type[label] for label in y_true]
    pred_types = [label_to_type[label] for label in y_pred]

    if len(true_types) != len(pred_types):
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{len(true_types)} and {len(pred_types)}"
        )
    if not true_types:
        # No samples: every metric is undefined.
        return np.full(5, np.nan)

    n_letters = 4
    letter_matches = np.array([
        sum(true_letter == pred_letter
            for true_letter, pred_letter in zip(true_type, pred_type))
        for true_type, pred_type in zip(true_types, pred_types)
    ])

    # Thresholds are applied to the match count directly, so a prediction that
    # shares no letter with the truth counts as a miss for every metric.
    return np.round([np.mean(letter_matches >= 1)*100,
                     np.mean(letter_matches >= 2)*100,
                     np.mean(letter_matches >= 3)*100,
                     np.mean(letter_matches == n_letters)*100,
                     letter_matches.sum()/(len(true_types)*n_letters)*100],
                    2)


In [None]:
# Evaluate the trained model on both splits and keep the raw model outputs.
# eval_model is used instead of model.predict(), which has been having issues
# on Kaggle.
train_result, train_model_outputs, train_wrong_predictions = bert.eval_model(train)
test_result, test_model_outputs, test_wrong_predictions = bert.eval_model(test)

# Predicted class id = position of the largest softmax value in each row.
train_predictions = np.argmax(softmax(train_model_outputs, axis=1), axis=1)
test_predictions = np.argmax(softmax(test_model_outputs, axis=1), axis=1)

In [None]:
# Value stored in the 'Model' column, e.g. 'BERT-Base-Multilingual-German-Cased'
bert_model_type = 'BERT-Base-Multilingual-'+language+'-Cased'

summary_columns = [
    'Data',
    'Language',
    'Model',
    'Number of Samples',
    'At Least 1 Match',
    'At Least 2 Matches',
    'At Least 3 Matches',
    'Perfect Match',
    'Average Match',
]

# One row per split: four descriptive fields followed by the five scores
# returned by mbti_accuracy. np.append merges text and numbers into a single
# string array, so every cell of the summary is stored as text.
train_description = np.array(['Train', language, bert_model_type, train.shape[0]])
train_scores = mbti_accuracy(train['labels'], train_predictions)
training_acc_metrics = np.append(train_description, train_scores)

test_description = np.array(['Test', language, bert_model_type, test.shape[0]])
test_scores = mbti_accuracy(test['labels'], test_predictions)
testing_acc_metrics = np.append(test_description, test_scores)

acc_metrics = np.vstack([training_acc_metrics, testing_acc_metrics])

acc_metrics_summary = pd.DataFrame(acc_metrics, columns=summary_columns)
acc_metrics_summary

In [None]:
# Save the summary table next to the notebook, e.g.
# 'bert_multilingual_german_summary.csv' when language is 'German'.
summary_csv_name = 'bert_multilingual_' + language.lower() + '_summary.csv'
acc_metrics_summary.to_csv(summary_csv_name)

In [None]:
# Row-normalised confusion matrix for the training split.
# `labels` pins the matrix to all 16 classes in encoding order; without it a
# class that occurs in neither the true nor the predicted labels is dropped
# and the tick labels below no longer line up with the rows and columns.
mbti_types = list(mbti_num_encoding)
class_ids = list(mbti_num_encoding.values())
cf_matrix = confusion_matrix(train['labels'], train_predictions,
                             labels=class_ids, normalize='true')

fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(cf_matrix, annot=True, xticklabels=mbti_types, yticklabels=mbti_types,
            vmin=0, vmax=1, ax=ax)
ax.set_title("MBTI Classifications of Multilingual "+language+" BERT Train")
ax.set_ylabel("Actual MBTI Classifications")
ax.set_xlabel("Predicted MBTI Classifications");

In [None]:
# Row-normalised confusion matrix for the test split.
# `labels` pins the matrix to all 16 classes in encoding order; without it a
# class that occurs in neither the true nor the predicted labels is dropped
# and the tick labels below no longer line up with the rows and columns.
mbti_types = list(mbti_num_encoding)
class_ids = list(mbti_num_encoding.values())
cf_matrix = confusion_matrix(test['labels'], test_predictions,
                             labels=class_ids, normalize='true')

fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(cf_matrix, annot=True, xticklabels=mbti_types, yticklabels=mbti_types,
            vmin=0, vmax=1, ax=ax)
ax.set_title("MBTI Classifications of Multilingual "+language+" BERT Test")
ax.set_ylabel("Actual MBTI Classifications")
ax.set_xlabel("Predicted MBTI Classifications");