In [13]:
import pandas as pd 
import numpy as np
import os
import pickle

In [14]:
dataset = './mbti_1.csv'
df = pd.read_csv(dataset, sep=',')
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [16]:
df.describe()

Unnamed: 0,type,posts
count,8675,8675
unique,16,8675
top,INFP,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
freq,1832,1


In [17]:
df

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


In [None]:
import re
from sklearn.model_selection import train_test_split

def enhanced_clean(text):
    # Remove URLs/special chars
    text = re.sub(r'http\S+|www\S+|https\S+|\|\|\|', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    words = text.lower().split()
    
    stemmed = []
    for word in words:
        if word.endswith('ing'):
            word = word[:-3]
        elif word.endswith('ly'):
            word = word[:-2]
        elif word.endswith('s'):
            word = word[:-1]
        stemmed.append(word)
    
    return ' '.join(stemmed)

df['cleaned_posts'] = df['posts'].apply(enhanced_clean)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2), 
    min_df=5,          
    max_df=0.7        
)
X = tfidf.fit_transform(df['cleaned_posts'])

In [22]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',  
    max_depth=15,
    random_state=42
)

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [24]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_resampled, y_resampled, cv=5)
print(f"Cross-Validation Accuracy: {scores.mean():.2f} (±{scores.std():.2f})")

Cross-Validation Accuracy: 0.88 (±0.06)


In [25]:
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

              precision    recall  f1-score   support

        ENFJ       0.95      0.96      0.96       376
        ENFP       0.90      0.81      0.85       360
        ENTJ       0.94      0.95      0.94       401
        ENTP       0.87      0.84      0.85       344
        ESFJ       1.00      1.00      1.00       382
        ESFP       1.00      0.99      1.00       371
        ESTJ       1.00      1.00      1.00       351
        ESTP       0.99      0.97      0.98       391
        INFJ       0.81      0.58      0.67       360
        INFP       0.44      0.80      0.57       345
        INTJ       0.85      0.67      0.75       379
        INTP       0.82      0.71      0.76       344
        ISFJ       0.98      0.97      0.98       350
        ISFP       0.95      0.94      0.95       387
        ISTJ       0.94      0.96      0.95       353
        ISTP       0.94      0.95      0.95       369

    accuracy                           0.88      5863
   macro avg       0.90   

In [26]:
import joblib  

joblib.dump({
    'model': model,
    'vectorizer': tfidf,
    'cleaner': enhanced_clean
}, 'optimized_model.joblib')

['optimized_model.joblib']