In [1]:
import gensim
import pandas as pd
import numpy as np
import seaborn as sns
import mpu.ml

from nltk.tokenize import TweetTokenizer
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.losses import CategoricalCrossentropy

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Ensures that tweet and user ids do not appear in scientific notation
pd.options.display.float_format = '{:.0f}'.format

# Train and Test Data

In [9]:
train = pd.read_csv('../NLP_MBTI_Classification/twisty_train.csv', index_col=0)
test = pd.read_csv('../NLP_MBTI_Classification/twisty_test.csv', index_col=0)

In [10]:
mbti_num_encoding = {
    'ISTJ':0, 'ISFJ':1, 'INFJ':2, 'INTJ':3,
    'ISTP':4, 'ISFP':5, 'INFP':6, 'INTP':7,
    'ESTP':8, 'ESFP':9, 'ENFP':10, 'ENTP':11,
    'ESTJ':12, 'ESFJ':13, 'ENFJ':14, 'ENTJ':15}

In [11]:
train['mbti_encoding'] = train['mbti'].apply(lambda x: mbti_num_encoding[x])
test['mbti_encoding'] = test['mbti'].apply(lambda x: mbti_num_encoding[x])

In [12]:
train = train[['twitter_text', 'mbti_encoding']]
train

Unnamed: 0,twitter_text,mbti_encoding
14222,@alicedeee Ich könnte der stundenlang zuhören!...,1
14668,@Moaxi @KatrinaJulie kann ich immer noch sehen...,10
889,@GerhardMaier fand ich auch damals. Klappt das...,14
7705,38 qm für 700 warm ... ich muss verrückt sein ...,10
1585,@dilettiert Willkommen in unserer Welt. Liebe ...,7
...,...,...
13586,accorsi la deve smettere,7
8513,io vado col finale\nlo faccio\nchiudo alle 4 m...,6
1371,L'evoluzione dell'Universo in una simulazione ...,15
4695,E dopo essermi addormentata fra le tue braccia...,13


In [13]:
test = test[['twitter_text', 'mbti_encoding']]
test

Unnamed: 0,twitter_text,mbti_encoding
8856,Wisst ihr was das beste an #ibes ist? Dass sie...,11
15218,Ich hab Connis 7 Minuten letztes Jahr auch geh...,10
3635,"@fat_jacK47 ja, aber ich wärs nich :D",10
1065,@Wally44 danke. Ist runtergeladen :),12
1588,@Patienti_A Schlaf gut,4
...,...,...
13638,quel momento in cui stai pedalando tranquillam...,11
18491,uomo perfetto????? FA SCHIFOOOOOOOOOOOOOOOOOOO...,13
6374,"@amerutan non ti allarmare, sto accompagnando ...",3
12934,- insegnarle l'italiano e avevo tipo otto anni...,2


# Creating Multilingual Word Embedding Matrix

In [14]:
max_word_limit = 50000 # Limit due to kernal memory constraint
model1 = gensim.models.KeyedVectors.load_word2vec_format('wiki.de.align.vec', limit=max_word_limit)
model2 = gensim.models.KeyedVectors.load_word2vec_format('wiki.es.align.vec', limit=max_word_limit)
model3 = gensim.models.KeyedVectors.load_word2vec_format('wiki.it.align.vec', limit=max_word_limit)
model4 = gensim.models.KeyedVectors.load_word2vec_format('wiki.nl.align.vec', limit=max_word_limit)

In [15]:
def wordvec_embedding_matrix(model):
    # initialize embedding matrix and word-to-id map:
    embedding_matrix = np.zeros((max_word_limit + 1, 300))       
    vocab_dict = {}

    # build the embedding matrix and the word-to-id map:
    for i, word in enumerate(model.vocab.keys()):
        embedding_vector = model[word]
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
            vocab_dict[word] = i
    
    return (embedding_matrix, vocab_dict)

In [16]:
em1, vd1 = wordvec_embedding_matrix(model1)
em2, vd2 = wordvec_embedding_matrix(model2)
em3, vd3 = wordvec_embedding_matrix(model3)
em4, vd4 = wordvec_embedding_matrix(model4)

embedding_matrix = np.vstack([em1, em2, em3, em4])

In [17]:
vocab_dict = {**vd1, **vd2, **vd3, **vd4}

# Tokenize Text

In [18]:
# Split data into twitter text and mbti number encoding
X_train = train['twitter_text']
y_train = train['mbti_encoding']

X_test = test['twitter_text']
y_test = test['mbti_encoding']

# Convert number encoding to one hot vector
#import mpu.ml
y_train = np.array(mpu.ml.indices2one_hot(y_train, nb_classes=16))
y_test = np.array(mpu.ml.indices2one_hot(y_test, nb_classes=16))

In [19]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# Due to computational limitations, MAX_SEQUENCE_LENGTH has already been precalculated
MAX_SEQUENCE_LENGTH = 52
print("Max token length:", MAX_SEQUENCE_LENGTH)

Max token length: 52


In [20]:
def sents_to_ids(sentences):
    """
    converting a list of strings to a list of lists of word ids
    """
    max_length = MAX_SEQUENCE_LENGTH
    text_ids = np.zeros((1, max_length), dtype=int)
    for sentence in sentences:
        example = []
        for word in tokenizer.tokenize(sentence):
            if word in vocab_dict.keys():
                example.append(vocab_dict[word])
            #else:
            #    example.append(0)

        example = np.pad(example, (0, max_length-len(example)))
        text_ids = np.vstack((text_ids, example))
    
    text_ids = np.delete(text_ids, 0, axis=0)

    return text_ids

X_train = sents_to_ids(X_train)
X_test = sents_to_ids(X_test)

# CNN Model

In [21]:
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

weights_array = compute_class_weight('balanced', 
                       classes=np.arange(16), 
                       y=np.argmax(y_train, axis=1))
weights = dict(zip(np.arange(16), weights_array))

def cnn_model(optimizer='adam', epochs_input=30, batch_size_input=32, under_represented_weighting=False):
    
    # CNN Model Architecture
    tf_model = tf.keras.Sequential()
    tf_model.add(embedding_layer)
    tf_model.add(tf.keras.layers.Conv1D(
                filters=10, 
                kernel_size=3, 
                strides=1, 
                padding='same', 
                activation='relu', 
                use_bias=True,
                kernel_initializer='glorot_uniform', 
                bias_initializer='zeros')) 
    tf_model.add(tf.keras.layers.GlobalMaxPooling1D())
    tf_model.add(Dense(100, activation='relu'))
    tf_model.add(Dense(16, activation='sigmoid'))
    
    tf_model.compile(optimizer=optimizer, loss=CategoricalCrossentropy(), metrics=['accuracy'])
    
    if under_represented_weighting != False:
        # Class weight helps to tell the model to "pay more attention" to samples from an under-represented class.
        # Reduces training accuracy but prevents model just predicting the most popular mbti classification
        # Average accuracy (5th mbti metric below) stays constant with or without the weighting

        #from sklearn.utils.class_weight import compute_class_weight
        #weights_array = compute_class_weight('balanced', 
        #                       classes=np.arange(16), 
        #                       y=np.argmax(y_train, axis=1))
        #weights = dict(zip(np.arange(16), weights_array))
        tf_model.fit(X_train, y_train, 
                           validation_data=(X_test, y_test), 
                           class_weight=under_represented_weighting,
                           epochs=epochs_input, 
                           batch_size=batch_size_input)
        
    else:
        tf_model.fit(X_train, y_train, 
                           validation_data=(X_test, y_test), 
                           epochs=epochs_input, 
                           batch_size=batch_size_input)
        
    return tf_model

In [37]:
def mbti_accuracy(y_true, y_pred):
    # Measures accuracy for mbti classification across 5 accuracy metrics:
    # one match, two matches, three matches, perfect match, average match
    
    # Average match is number of letters match / 4
    
    # Comparing 'ENFJ' as the true class and 'ENFP' as the predicted class,
    # this function returns...
    # [1, 1, 1, 0, 0.75]
     
    # Get index from one hot encoding of y_true
    # Get index of highest softmax/probability output in y_pred
    y_true_index = np.argmax(y_true, axis=1)
    y_pred_index = np.argmax(y_pred, axis=1)
    
    # Use the index to identify the corresponding mbti class
    mbti_num_encoding_list = list(mbti_num_encoding)
    y_true_mbti = [mbti_num_encoding_list[idx] for idx in y_true_index]
    y_pred_mbti = [mbti_num_encoding_list[idx] for idx in y_pred_index]
    
    one_match = []
    two_matches = []
    three_matches = []
    perfect_match = []
    
    # Perform mbti accuracy measurements
    sum_num_matches = 0
    for i in np.arange(len(y_true_mbti)):
        num_letter_matches = len(set(y_true_mbti[i]) & set(y_pred_mbti[i]))
        
        # At least 1 letter match
        if num_letter_matches == 1:
            one_match += [True]
            two_matches += [False]
            three_matches += [False]
            perfect_match += [False]
            
        # At least 2 letter matches
        elif num_letter_matches == 2:
            one_match += [True]
            two_matches += [True]
            three_matches += [False]
            perfect_match += [False]
            
        # At least 3 letter matches
        elif num_letter_matches == 3:
            one_match += [True]
            two_matches += [True]
            three_matches += [True]
            perfect_match += [False]
           
        # Perfect match
        else:
            one_match += [True]
            two_matches += [True]
            three_matches += [True]
            perfect_match += [True]
        
    # Average/partial matches
        sum_num_matches += num_letter_matches
    avg_num_matches = sum_num_matches/(len(y_true_mbti)*4)*100
    
    return np.round([np.mean(one_match)*100, 
                     np.mean(two_matches)*100, 
                     np.mean(three_matches)*100, 
                     np.mean(perfect_match)*100, 
                     avg_num_matches], 
                    2)

# Hyperparameters

In [58]:
epoch = [30, 40, 50]
batch = [128, 256, 512]

combinations = np.array(np.meshgrid(epoch, batch)).T.reshape(-1, 2)
combinations

array([[ 30, 128],
       [ 30, 256],
       [ 30, 512],
       [ 40, 128],
       [ 40, 256],
       [ 40, 512],
       [ 50, 128],
       [ 50, 256],
       [ 50, 512]])

In [59]:
avg_acc_train = []
avg_acc_test = []
model_loss_train = []
model_loss_test = []

cce = tf.keras.losses.CategoricalCrossentropy()

for combo in combinations:
    cnn = cnn_model(epochs_input=combo[0], batch_size_input=combo[1], under_represented_weighting=weights)
    model_output_train = cnn.predict(X_train)
    model_output_test = cnn.predict(X_test)

    avg_acc_train += [mbti_accuracy(y_train, model_output_train)[4]]
    avg_acc_test += [mbti_accuracy(y_test, model_output_test)[4]]

    
    model_loss_train += [cce(y_train, model_output_train).numpy()]
    model_loss_test += [cce(y_test, model_output_test).numpy()]

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30


Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40


Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40


Epoch 40/40
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50


Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50


Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [60]:
hyperparameter_results = pd.DataFrame(combinations, columns=['epoch', 'batch'])
hyperparameter_results['avg_acc_train'] = avg_acc_train
hyperparameter_results['avg_acc_test'] = avg_acc_test
hyperparameter_results['model_loss_train'] = model_loss_train
hyperparameter_results['model_loss_test'] = model_loss_test
hyperparameter_results

Unnamed: 0,epoch,batch,avg_acc_train,avg_acc_test,model_loss_train,model_loss_test
0,30,128,52,49,3,3
1,30,256,54,51,3,3
2,30,512,51,49,3,3
3,40,128,53,50,3,3
4,40,256,52,49,3,3
5,40,512,54,51,3,3
6,50,128,54,50,3,3
7,50,256,53,50,3,3
8,50,512,51,48,3,3


In [62]:
hyperparameter_results.to_csv('../NLP_MBTI_Classification/results_summary/cnn_hyperparameter_tuning.csv')