# Introvert vs Extrovert Model

Dataset: [(MBTI) Myers-Briggs Personality Type Dataset on Kaggle](https://www.kaggle.com/datasnaek/mbti-type)

Tutorial: [video walkthrough by Gabriel Atkin](https://www.youtube.com/watch?v=s3g0MJcJZyA&list=PLFMqiVagrzLKQ4a37Jj87dl1ccK2RNzG-&index=19&ab_channel=GabrielAtkin)

## Importing Modules

In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

import tensorflow as tf
import pandas as pd 
import numpy as np

In [2]:
# Load the MBTI posts dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('dataset/mbti.csv')

In [3]:
# Preview the first rows: a four-letter type code and the raw post text per row.
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [4]:
# Check column dtypes and non-null counts before preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


## Preprocessing

In [5]:
# List the distinct personality type codes present in the 'type' column.
df['type'].unique()

array(['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
       'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'],
      dtype=object)

In [6]:
def preprocess_inputs(data, vocab_length=10000):
    """Turn the raw MBTI frame into padded token sequences and binary labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'posts' column (text) and a 'type' column holding one
        of the 16 four-letter MBTI codes.
    vocab_length : int, default 10000
        Passed to ``Tokenizer(num_words=...)`` and returned unchanged so the
        caller can size the embedding layer with it.

    Returns
    -------
    texts : numpy.ndarray
        Integer token sequences, padded at the end (``padding='post'``) to
        ``max_seq_len`` columns.
    labels : numpy.ndarray
        int64 array; 1 where the type code starts with 'E', 0 otherwise.
    max_seq_len : NumPy integer
        Length of the longest tokenised post.
    vocab_length : int
        The value received as argument.
    label_mapping : dict
        MBTI code -> 0/1 lookup used to build ``labels``.

    Raises
    ------
    ValueError
        If the 'type' column contains a value that is not one of the 16
        known MBTI codes.
    """
    # Process text data
    # A set gives O(1) membership tests; the NLTK list would be scanned
    # linearly once per word.
    stop_words = set(stopwords.words('english'))

    # str.split() with no argument already discards surrounding whitespace,
    # so a separate strip() pass is unnecessary.
    texts = [
        [word for word in text.lower().split() if word not in stop_words]
        for text in data['posts']
    ]

    tokenizer = Tokenizer(num_words=vocab_length)
    tokenizer.fit_on_texts(texts)
    texts = tokenizer.texts_to_sequences(texts)

    # Pad every sequence to the longest one so the result is rectangular.
    max_seq_len = np.max([len(text) for text in texts])
    texts = pad_sequences(texts, maxlen=max_seq_len, padding='post')

    # Process label
    label_values = [
        'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
        'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'
    ]

    # Builtin int replaces np.int, which was a plain alias removed in
    # NumPy 1.24.
    label_mapping = {label: int(label[0] == 'E') for label in label_values}

    # map() yields NaN for anything outside the mapping, which lets us fail
    # loudly instead of leaking raw strings into the label array.
    labels = data['type'].map(label_mapping)
    unmapped = labels.isna()
    if unmapped.any():
        raise ValueError(
            "Unknown MBTI type(s): {}".format(
                data['type'][unmapped].unique().tolist()
            )
        )
    labels = np.array(labels, dtype='int64')

    return texts, labels, max_seq_len, vocab_length, label_mapping

In [7]:
# Run the full preprocessing pipeline on the raw frame.
(
    texts,
    labels,
    max_seq_length,
    vocab_length,
    label_mapping,
) = preprocess_inputs(df)

In [8]:
# Sanity-check the preprocessing outputs; each caption after the first
# starts with a newline so the entries are separated by a blank line.
for caption, value in (
    ("Text sequences:", texts.shape),
    ("\nLabels:", labels.shape),
    ("\nMax Sequence Length", max_seq_length),
    ("\nVocab length:", vocab_length),
    ("\nLabel mapping:", label_mapping),
):
    print(caption, value)

Text sequences: (8675, 859)

Labels: (8675,)

Max Sequence Length 859

Vocab length: 10000

Label mapping: {'INFJ': 0, 'ENTP': 1, 'INTP': 0, 'INTJ': 0, 'ENTJ': 1, 'ENFJ': 1, 'INFP': 0, 'ENFP': 1, 'ISFP': 0, 'ISTP': 0, 'ISFJ': 0, 'ISTJ': 0, 'ESTP': 1, 'ESFP': 1, 'ESTJ': 1, 'ESFJ': 1}


In [9]:
# Hold out 30% of the samples for the final evaluation; the fixed
# random_state keeps the split identical between runs.
texts_train, texts_test, labels_train, labels_test = train_test_split(
    texts,
    labels,
    train_size=0.7,
    random_state=123,
)

## Training

In [10]:
# Binary introvert/extrovert classifier:
# token ids -> Embedding -> bidirectional GRU -> Flatten -> sigmoid unit.
embedding_dim = 512

inputs = tf.keras.Input(shape=(max_seq_length,))

# The sequence length is already fixed by the Input layer above, so the
# Embedding layer does not need `input_length` (deprecated in Keras 3).
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=embedding_dim
)(inputs)

# return_sequences=True keeps one output vector per time step; they are all
# flattened below and fed to the output unit.
gru = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(
        units=256,
        return_sequences=True
    )
)(embedding)

flatten = tf.keras.layers.Flatten()(gru)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(flatten)

model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)

# NOTE(review): no TensorFlow seed is set in this notebook, so the trained
# weights and the reported metrics will presumably vary from run to run.
# validation_split=0.2 carves the validation set out of the training data.
history = model.fit(
    texts_train,
    labels_train,
    validation_split=0.2,
    batch_size=32,
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Results

In [11]:
# Score the held-out split; the result lists the loss followed by the
# compiled metrics in order (accuracy, auc).
model.evaluate(x=texts_test, y=labels_test)



[0.7375852465629578, 0.8075297474861145, 0.804264485836029]

In [12]:
# Persist the trained model.
# NOTE(review): 'introver' looks like a typo for 'introvert'; the name is kept
# unchanged in case other code loads the model from this path.
model.save(filepath='introver_extrovert_model')

INFO:tensorflow:Assets written to: introver_extrovert_model\assets
INFO:tensorflow:Assets written to: introver_extrovert_model\assets
