In [4]:
import numpy as np
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')


In [5]:
# Load the dataset
data = pd.read_csv('mbti_1.csv')

In [6]:
data

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


## Data Preprocessing

In [7]:
# Make a column for 'I-E'which is Introversion/Extraversion
data['I-E']= data['type'].apply(lambda x: "I" if x[0]=='I' else 'E' )

# Make a column for 'N-S'which is Intuition/Sensing
data['N-S']= data['type'].apply(lambda x: "N" if x[1]=='N' else 'S' )

# Make a column for 'T-F'which is Thinking/Feeling
data['T-F']= data['type'].apply(lambda x: "T" if x[2]=='T' else 'F' )

# Make a column for 'J-P'which is Judging/Perceiving
data['J-P']= data['type'].apply(lambda x: "J" if x[3]=='J' else 'P' )


In [8]:
data

Unnamed: 0,type,posts,I-E,N-S,T-F,J-P
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,I,N,F,J
1,ENTP,'I'm finding the lack of me in these posts ver...,E,N,T,P
2,INTP,'Good one _____ https://www.youtube.com/wat...,I,N,T,P
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",I,N,T,J
4,ENTJ,'You're fired.|||That's another silly misconce...,E,N,T,J
...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,I,S,F,P
8671,ENFP,'So...if this thread already exists someplace ...,E,N,F,P
8672,INTP,'So many questions when i do these things. I ...,I,N,T,P
8673,INFP,'I am very conflicted right now when it comes ...,I,N,F,P


In [9]:
import string
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # Download the WordNet dataset if you haven't already


# Cast Folding
def to_lower(text):     
    return text.lower()

def remove_urls(text):
    # Define a regular expression pattern to match URLs starting with 'http' or 'https'
    url_pattern = r'https?://\S+|www\.\S+'

    # Use the re.sub() function to replace all URL matches with an empty string
    text = re.sub(url_pattern, '', text)

    return text


# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

def stopwords(text):
    # Remove stopwords from the list of words
    text = ' '.join(c for c in text.split() if c not in stop_words)
    
    return text

def remove_mbti_labels(text):
    # Define a list of MBTI labels
    mbti_labels = ['infj', 'infp', 'intj', 'intp', 'isfj', 'isfp', 'istj', 'istp', 'enfj', 'enfp', 'entj', 'entp', 'esfj', 'esfp', 'estj', 'estp'] 
    
    # Remove MBTI labels from the list of words
    text = ' '.join(c for c in text.split() if c not in mbti_labels)

    return text

def remove_punct(text):     # Remove Punctuation, artinya dalam setiap baris dicari setiap huruf. if dia bukan punctuation maka akan kembali dijoin. jika iya punctuation akan hilang 
    return ''.join(c for c in text if c not in string.punctuation)
    
def remove_number(text):        # Remove Number
    return ''.join(c for c in text if not c.isdigit())

def to_strip(text):
    return ' '.join(text.split()) # Remove whitespace 


def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    words = nltk.word_tokenize(text)
    # Lemmatize each word and join them back into a sentence
    text = ' '.join([lemmatizer.lemmatize(word) for word in words])
    
    return text

# Combine all function 
def prepro(text): 
    pre = to_lower(str(text))
    pre = remove_urls(pre)
    pre = remove_punct(pre)
    pre = remove_number(pre)
    pre = to_strip(pre)
    pre = stopwords(pre)
    pre = remove_mbti_labels(pre)
    pre = lemmatize(pre)
    
    return pre 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sylvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sylvi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
data['clean']=data['posts'].apply(prepro)

In [11]:
data['clean']

0       moment sportscenter top ten play prankswhat li...
1       im finding lack post alarmingsex boring positi...
2       good one course say know thats blessing cursed...
3       dear enjoyed conversation day esoteric gabbing...
4       youre firedthats another silly misconception a...
                              ...                        
8670    always think cat fi doms reason website become...
8671    soif thread already exists someplace else heck...
8672    many question thing would take purple pill pic...
8673    conflicted right come wanting child honestly m...
8674    long since personalitycafe although doesnt see...
Name: clean, Length: 8675, dtype: object

## Train Test Split

In [90]:
# Train Test Split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold

X = data[['clean']]
y_I_E = data['I-E']
y_N_S = data['N-S']
y_T_F = data['T-F']
y_J_P = data['J-P']

# Split the data for 'I-E' while ensuring stratification
X_train_I_E, X_test_I_E, y_train_I_E, y_test_I_E = train_test_split(X, y_I_E, test_size=0.2, random_state=42, stratify=y_I_E)

# Split the data for 'N-S' while ensuring stratification
X_train_N_S, X_test_N_S, y_train_N_S, y_test_N_S = train_test_split(X, y_N_S, test_size=0.2, random_state=42, stratify=y_N_S)

# Split the data for 'T-F' while ensuring stratification
X_train_T_F, X_test_T_F, y_train_T_F, y_test_T_F = train_test_split(X, y_T_F, test_size=0.2, random_state=42, stratify=y_T_F)

# Split the data for 'J-P' while ensuring stratification
X_train_J_P, X_test_J_P, y_train_J_P, y_test_J_P = train_test_split(X, y_J_P, test_size=0.2, random_state=42, stratify=y_J_P)



## I-E (Vectorizer and modelling)

In [13]:
data['I-E'].value_counts()/len(data)*100

I-E
I    76.956772
E    23.043228
Name: count, dtype: float64

In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer for 'I-E' dimension
vectorizer_I_E = TfidfVectorizer(ngram_range=(1,1))
X_train_vec_I_E = vectorizer_I_E.fit_transform(X_train_I_E['clean'])

In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

In [103]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=['I', 'E'], y=y_train_I_E)

model_IELg = Pipeline([('TFID', vectorizer_I_E),
                       ('clf', LogisticRegression(class_weight={'I':class_weights[0],'E':class_weights[1]}))
                       ])

model_IELg.fit(X_train_I_E['clean'],y_train_I_E)
predIELg = model_IELg.predict(X_test_I_E['clean'])
print(f1_score(y_test_I_E,predIELg, average = 'macro'))
print(classification_report(y_test_I_E,predIELg))


0.7353647062282695
              precision    recall  f1-score   support

           E       0.58      0.61      0.60       400
           I       0.88      0.87      0.87      1335

    accuracy                           0.81      1735
   macro avg       0.73      0.74      0.74      1735
weighted avg       0.81      0.81      0.81      1735



## N-S (Vectorizer and modelling)

In [77]:
data['N-S'].value_counts()/len(data)*100

N-S
N    86.201729
S    13.798271
Name: count, dtype: float64

In [78]:
# Create a TfidfVectorizer for 'N-S' dimension
vectorizer_N_S = TfidfVectorizer(ngram_range=(1,1))
X_train_vec_N_S = vectorizer_N_S.fit_transform(X_train_N_S['clean'])


In [106]:
class_weights = compute_class_weight('balanced', classes=['N', 'S'], y=y_train_N_S)

model_NSLg = Pipeline([('TFID', vectorizer_N_S),
                       ('clf', LogisticRegression(class_weight={'N':class_weights[0],'S':class_weights[1]}))
                       ])

model_NSLg.fit(X_train_N_S['clean'],y_train_N_S)
predNSLg = model_NSLg.predict(X_test_N_S['clean'])
print(f1_score(y_test_N_S,predNSLg, average = 'macro'))
print(classification_report(y_test_N_S,predNSLg))

0.7050213449306504
              precision    recall  f1-score   support

           N       0.92      0.92      0.92      1496
           S       0.50      0.48      0.49       239

    accuracy                           0.86      1735
   macro avg       0.71      0.70      0.71      1735
weighted avg       0.86      0.86      0.86      1735



## T-F (Vectorizer and modelling)

In [81]:
data['T-F'].value_counts()/len(data)*100

T-F
F    54.10951
T    45.89049
Name: count, dtype: float64

In [95]:
# Create a TfidfVectorizer for 'T-F' dimension
vectorizer_T_F = TfidfVectorizer(ngram_range=(1,1))
X_train_vec_T_F = vectorizer_T_F.fit_transform(X_train_T_F['clean'])


In [108]:
class_weights = compute_class_weight('balanced', classes=['T', 'F'], y=y_train_T_F)

model_TFLg = Pipeline([('TFID', vectorizer_T_F),
                       ('clf', LogisticRegression(class_weight={'T':class_weights[0],'F':class_weights[1]}))
                       ])

model_TFLg.fit(X_train_T_F['clean'],y_train_T_F)
predTFLg = model_TFLg.predict(X_test_T_F['clean'])
print(f1_score(y_test_T_F,predTFLg, average = 'macro'))
print(classification_report(y_test_T_F,predTFLg))

0.8187001079342886
              precision    recall  f1-score   support

           F       0.84      0.82      0.83       939
           T       0.80      0.82      0.81       796

    accuracy                           0.82      1735
   macro avg       0.82      0.82      0.82      1735
weighted avg       0.82      0.82      0.82      1735



## J-P (Vectorizer and modelling)

In [85]:
data['J-P'].value_counts()/len(data)*100

J-P
P    60.414986
J    39.585014
Name: count, dtype: float64

In [86]:
# Create a TfidfVectorizer for 'J-P' dimension
vectorizer_J_P = TfidfVectorizer(ngram_range=(1,1))
X_train_vec_J_P = vectorizer_J_P.fit_transform(X_train_J_P['clean'])


In [110]:
class_weights = compute_class_weight('balanced', classes=['J', 'P'], y=y_train_J_P)

model_JPLg = Pipeline([('TFID', vectorizer_J_P),
                       ('clf', LogisticRegression(class_weight={'J':class_weights[0],'P':class_weights[1]}))
                       ])

model_JPLg.fit(X_train_J_P['clean'],y_train_J_P)
predJPLg = model_JPLg.predict(X_test_J_P['clean'])
print(f1_score(y_test_J_P,predJPLg, average = 'macro'))
print(classification_report(y_test_J_P,predJPLg))

0.7169897103573888
              precision    recall  f1-score   support

           J       0.66      0.66      0.66       687
           P       0.78      0.77      0.78      1048

    accuracy                           0.73      1735
   macro avg       0.72      0.72      0.72      1735
weighted avg       0.73      0.73      0.73      1735



## The MBTI Prediction

In [112]:
new_post_text = input()
post = [prepro(new_post_text)]  # Data preprocessing steps


# Make predictions for each dimension (I-E, N-S, T-F, J-P) separately
prediction_I_E = model_IELg.predict(post)
prediction_N_S = model_NSLg.predict(post)
prediction_T_F = model_TFLg.predict(post)
prediction_J_P = model_JPLg.predict(post)

# Combine predictions into the final MBTI type
final_mbti_type = [prediction_I_E[0]+ prediction_N_S[0]+ prediction_T_F[0]+ prediction_J_P[0]]

# Print the predicted MBTI type
print(f"Predicted MBTI Type: {final_mbti_type}")


Predicted MBTI Type: ['INFP']
