In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import clear_output
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

!pip install imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler

!pip install gdown
!gdown https://drive.google.com/uc?id=1QmH8BuCwltXgQMbzfLkxKJA636aQdBjN

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading...
From: https://drive.google.com/uc?id=1QmH8BuCwltXgQMbzfLkxKJA636aQdBjN
To: /content/mbti_full_pull.csv
100% 599M/599M [00:07<00:00, 80.2MB/s]


In [9]:
#remove mbti-related text, so we dont classify a type based on how much some mbti string appears inside (not generalised)
#for loop creates all stuff like XXTJ etc, cause some posts contains thing like this

mbti_types = ['ISTJ', 'ISTP', 'ISFJ', 'ISFP', 'INTJ', 'INTP', 'INFJ', 'INFP', 'ESTJ', 'ESTP', 'ESFJ', 'ESFP', 'ENTJ', 'ENTP', 'ENFJ', 'ENFP']
mbti_stopwords = ['fe', 'fi', 'ne', 'ni', 'se', 'si', 'te', 'ti']
for i in mbti_types:
  for mask in range(17):
    tmp = list(i)
    for j in range(4):
      if pow(2, j) & mask != 0:
        tmp[j] = 'X'
    mbti_stopwords.append("".join(tmp))
    mbti_stopwords.append("".join(tmp) + "s")


class_size = 2000# @param
vocab_size = 2000
embedding_dim =  64
train_size = 0.8
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [10]:
df = pd.read_csv("mbti_full_pull.csv").drop(['subreddit'], axis = 1)
df.columns = ['type', 'text']
df['type'] = df['type'].str.upper()
for i in range(16):
  df.loc[df['type'].str.contains(mbti_types[i]), 'type'] = mbti_types[i]
df = df[df['text'].str.len() >= 50]
df = df.reset_index(drop = True)
under = RandomUnderSampler(sampling_strategy = dict(np.minimum(df['type'].value_counts(), class_size)))
df, df['type'] = under.fit_resample(df, df['type'])
stopwords = [i.lower() for i in nltk.corpus.stopwords.words('english') + mbti_stopwords + [chr(i) for i in range(97, 123)]]
df.text = df.text.apply(lambda text: re.sub("\s+", " ", ' '.join([i for i in re.sub("[^9A-Za-z ]", "" , re.sub("\\n", "", re.sub("\s+", " ", re.sub(r'http\S+', '', text.lower())))).split(" ") if i not in stopwords])))
df.loc[df['text'].str.len() >= max_length, 'text'] = df.loc[df['text'].str.len() >= max_length, 'text'].str[:max_length]

df

Unnamed: 0,type,text
0,ENFJ,yknow point id call impossible really mature w...
1,ENFJ,interesting person gaze penetrating speech mea...
2,ENFJ,pagsubok lang yan kahit gaano kahirap ang isan...
3,ENFJ,doesnt matter im hiding body actually help poi...
4,ENFJ,tell calm fuck assure meant sign damn papers w...
...,...,...
31995,ISTP,dont worst enemy per conflicts
31996,ISTP,yep im good english including reading writinge...
31997,ISTP,dont know appealing building part looks fun th...
31998,ISTP,struggle often notice kill conversations sayin...


In [19]:
x = df.text
y = df.type.apply(lambda s: pd.Series(dict(zip(["I/E", "N/S", "T/F", "P/J"], [str.index(i) for i, str in zip(list(s), ["IE", "NS", "TF", "PJ"])]))))

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.4)
x_val, x_test, y_val, y_test = train_test_split(x, y, test_size=0.25)

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=max_length, padding=padding_type, truncating=trunc_type)
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=max_length, padding=padding_type, truncating=trunc_type)
x_val = pad_sequences(tokenizer.texts_to_sequences(x_val), maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [20]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim), # embedding layer
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, dropout=0.2, recurrent_dropout=0.2)), # LSTM layer
    tf.keras.layers.Dropout(rate=0.2), # dropout layer
    tf.keras.layers.Dense(embedding_dim, activation='relu'), # fully connected layer
    tf.keras.layers.Dense(4, activation='sigmoid') #dropout layer
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 64)          128000    
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 4)                 260       
                                                                 
Total params: 202,564
Trainable params: 202,564
Non-trainable params: 0
________________________________________________

In [21]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC'])
num_epochs = 10
early_stopping_monitor = EarlyStopping(patience=2)
history = model.fit(x_train, y_train, epochs=num_epochs, validation_data=(x_val, y_val), callbacks = [early_stopping_monitor])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
y_pred = pd.DataFrame(model.predict(x_test).round(), columns=["I/E", "N/S", "T/F", "P/J"]).applymap(int)
y_pred

Unnamed: 0,I/E,N/S,T/F,P/J
0,0,0,1,1
1,1,1,1,1
2,1,1,0,1
3,1,1,1,1
4,1,1,0,0
...,...,...,...,...
7995,1,1,1,0
7996,0,1,1,1
7997,1,0,1,1
7998,1,0,0,1


In [36]:
y_test = y_test.reset_index().drop(columns="index")
y_test

Unnamed: 0,I/E,N/S,T/F,P/J
0,0,0,1,1
1,1,0,0,1
2,0,1,0,0
3,0,1,1,0
4,1,1,1,0
...,...,...,...,...
7995,0,1,1,0
7996,1,0,0,0
7997,0,0,0,0
7998,1,0,0,1


In [39]:
(y_pred == y_test).applymap(int).mean(axis=0)

I/E    0.620875
N/S    0.669250
T/F    0.660375
P/J    0.640875
dtype: float64

In [42]:
((y_pred == y_test).applymap(int).sum(axis=1) == 4).mean()

0.217125

In [43]:
x

0        yknow point id call impossible really mature w...
1        interesting person gaze penetrating speech mea...
2        pagsubok lang yan kahit gaano kahirap ang isan...
3        doesnt matter im hiding body actually help poi...
4        tell calm fuck assure meant sign damn papers w...
                               ...                        
31995                      dont worst enemy per conflicts 
31996    yep im good english including reading writinge...
31997    dont know appealing building part looks fun th...
31998    struggle often notice kill conversations sayin...
31999    terribly illogical short sighted reason hate type
Name: text, Length: 32000, dtype: object

In [44]:
y

Unnamed: 0,I/E,N/S,T/F,P/J
0,1,0,1,1
1,1,0,1,1
2,1,0,1,1
3,1,0,1,1
4,1,0,1,1
...,...,...,...,...
31995,0,1,0,0
31996,0,1,0,0
31997,0,1,0,0
31998,0,1,0,0


In [55]:
data = pd.DataFrame(y)
data["text"] = x
data = data[["text", "I/E", "N/S", "T/F", "P/J"]]
data

Unnamed: 0,text,I/E,N/S,T/F,P/J
0,yknow point id call impossible really mature w...,1,0,1,1
1,interesting person gaze penetrating speech mea...,1,0,1,1
2,pagsubok lang yan kahit gaano kahirap ang isan...,1,0,1,1
3,doesnt matter im hiding body actually help poi...,1,0,1,1
4,tell calm fuck assure meant sign damn papers w...,1,0,1,1
...,...,...,...,...,...
31995,dont worst enemy per conflicts,0,1,0,0
31996,yep im good english including reading writinge...,0,1,0,0
31997,dont know appealing building part looks fun th...,0,1,0,0
31998,struggle often notice kill conversations sayin...,0,1,0,0


In [56]:
data.to_csv("mbti.csv", index=False)

In [77]:
!git clone https://github.com/terminalai/webdev-ai
!cp webdev-ai/data/* .
!rm -rf webdev-ai

import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

Cloning into 'webdev-ai'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects:   6% (1/16)[Kremote: Counting objects:  12% (2/16)[Kremote: Counting objects:  18% (3/16)[Kremote: Counting objects:  25% (4/16)[Kremote: Counting objects:  31% (5/16)[Kremote: Counting objects:  37% (6/16)[Kremote: Counting objects:  43% (7/16)[Kremote: Counting objects:  50% (8/16)[Kremote: Counting objects:  56% (9/16)[Kremote: Counting objects:  62% (10/16)[Kremote: Counting objects:  68% (11/16)[Kremote: Counting objects:  75% (12/16)[Kremote: Counting objects:  81% (13/16)[Kremote: Counting objects:  87% (14/16)[Kremote: Counting objects:  93% (15/16)[Kremote: Counting objects: 100% (16/16)[Kremote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 16 (delta 3), reused 12 (delta 3), pack-reused 0[K
Unpacking objects: 100% (16/16), done.


In [78]:
x = data.text
y = data[["I/E", "N/S", "T/F", "P/J"]]

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.4)
x_val, x_test, y_val, y_test = train_test_split(x_val, y_val, test_size=0.25)

tokenizer = Tokenizer(num_words=2000, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=100, padding='post', truncating='post')
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=100, padding='post', truncating='post')
x_val = pad_sequences(tokenizer.texts_to_sequences(x_val), maxlen=100, padding='post', truncating='post')

In [79]:
clf = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim), # embedding layer
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, dropout=0.2, recurrent_dropout=0.2)), # LSTM layer
    tf.keras.layers.Dropout(rate=0.2), # dropout layer
    tf.keras.layers.Dense(embedding_dim, activation='relu'), # fully connected layer
    tf.keras.layers.Dense(4, activation='sigmoid') #dropout layer
])
clf.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, None, 64)          128000    
                                                                 
 bidirectional_5 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_10 (Dense)            (None, 64)                8256      
                                                                 
 dense_11 (Dense)            (None, 4)                 260       
                                                                 
Total params: 202,564
Trainable params: 202,564
Non-trainable params: 0
________________________________________________

In [80]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'AUC'])
num_epochs = 10
early_stopping_monitor = EarlyStopping(patience=2)
history = model.fit(x_train, y_train, epochs=num_epochs, validation_data=(x_val, y_val), callbacks = [early_stopping_monitor])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
