# **MBTPy**
---
model_training


## Intro - overview of the dataset
---

### Data Loading
---

In [1]:
# Importing libraries

import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import re

from wordcloud import WordCloud
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.experimental import enable_hist_gradient_boosting

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline



  from pandas import MultiIndex, Int64Index


In [2]:
# Load the raw MBTI dataset from disk
data = pd.read_csv('../data/raw/mbti_1.csv')

# Summarise columns / dtypes, then preview the first rows
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


## Preprocessing
---

### Train-test split

In [3]:
# Hold out 20% of the rows for testing. Stratifying on the label column keeps
# the class proportions the same in the train and test splits.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data.type,
)
print(train_data.shape, test_data.shape)


(6940, 2)
(1735, 2)


### Preprocessing Text

In [4]:
# Preprocessing functions

def clean_text(text):
    """
    Normalise raw posts before vectorisation.

    Each post is lower-cased, URLs (starting with http://, https:// or www.)
    are replaced by a space, and every character that is not a lowercase
    ASCII letter or a digit is replaced by a space.

    - text : iterable of strings (e.g. a pandas Series or a list of posts).
      Pass the text column itself, not a whole DataFrame: iterating a
      DataFrame yields its column names, not its rows.

    Returns a list with one cleaned string per input post, in input order.
    """
    # Raw strings keep the regex escapes (\s, \.) out of Python's own
    # string-escape handling; compiling once hoists the work out of the loop.
    url_pattern = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
    non_alnum_pattern = re.compile(r'[^0-9a-z]')

    cleaned_text = []
    for sentence in tqdm(text):
        sentence = sentence.lower()
        sentence = url_pattern.sub(' ', sentence)
        sentence = non_alnum_pattern.sub(' ', sentence)
        cleaned_text.append(sentence)
    return cleaned_text

# def lemmatize_text(text):
#     lemmatizer = WordNetLemmatizer()
#     return [lemmatizer.lemmatize(word) for word in text.split() if len(word)>2]

# Text preprocessing: normalise the raw posts, then turn them into TF-IDF features
preprocessing_pipeline = Pipeline([
    ('clean_text', FunctionTransformer(clean_text, validate=False)),
    # ('lemmatize', FunctionTransformer(lemmatize_text, validate=False)),
    ('vectorizer', TfidfVectorizer(max_features=5000, stop_words='english')),
])


# Fit on the training text only and vectorise it in the same pass
# (fit() followed by transform() would run the cleaning step over the
# training posts twice for the same result)
train_post = preprocessing_pipeline.fit_transform(train_data['posts'])

# Saving the fitted pipeline
# NOTE: the pickled FunctionTransformer stores clean_text by reference, so
# clean_text must be defined/importable wherever this file is loaded again.
joblib.dump(preprocessing_pipeline, '../models/preprocessing_pipeline.joblib')

# The test set is only transformed, never fitted on
test_post = preprocessing_pipeline.transform(test_data['posts'])


100%|██████████| 6940/6940 [00:01<00:00, 3596.80it/s]
100%|██████████| 6940/6940 [00:01<00:00, 3608.02it/s]
100%|██████████| 1735/1735 [00:00<00:00, 3573.42it/s]


### Encoding Target

In [5]:
# Encode the personality-type labels as integers with a LabelEncoder

# Learn the label -> integer mapping from the training labels only
target_encoder = LabelEncoder().fit(train_data.type)

# Persist the fitted encoder so predictions can be decoded later
joblib.dump(target_encoder, '../models/target_encoder.joblib')

# Apply the same mapping to both splits
train_target = target_encoder.transform(train_data.type)
test_target = target_encoder.transform(test_data.type)


## Model Selection
---

In [6]:
# Test accuracy of every model trained below, keyed by a display name
models_accuracy = dict()


In [7]:
# Logistic Regression

model_log = LogisticRegression(max_iter=3000, C=0.5, n_jobs=-1)
model_log.fit(train_post, train_target)

# Predict each split once and reuse the result for the report and the accuracy table
log_train_pred = model_log.predict(train_post)
log_test_pred = model_log.predict(test_post)

# classes_ lists the original labels in encoded order, so no class count is hard-coded
class_names = target_encoder.classes_

print('train classification report \n ', classification_report(train_target, log_train_pred, target_names=class_names))
print('test classification report \n', classification_report(test_target, log_test_pred, target_names=class_names))

models_accuracy['logistic regression'] = accuracy_score(test_target, log_test_pred)



train classification report 
                precision    recall  f1-score   support

        ENFJ       0.83      0.16      0.27       152
        ENFP       0.81      0.65      0.72       540
        ENTJ       0.93      0.29      0.44       185
        ENTP       0.81      0.68      0.74       548
        ESFJ       0.00      0.00      0.00        33
        ESFP       0.00      0.00      0.00        38
        ESTJ       0.00      0.00      0.00        31
        ESTP       1.00      0.04      0.08        71
        INFJ       0.74      0.83      0.78      1176
        INFP       0.66      0.93      0.77      1466
        INTJ       0.75      0.81      0.78       873
        INTP       0.69      0.87      0.77      1043
        ISFJ       0.92      0.26      0.41       133
        ISFP       0.87      0.24      0.38       217
        ISTJ       0.84      0.25      0.38       164
        ISTP       0.87      0.51      0.64       270

    accuracy                           0.72      

In [8]:
# Linear Support Vector Classifier

model_linear_svc = LinearSVC(C=0.1)
model_linear_svc.fit(train_post, train_target)

# Predict each split once and reuse the result for the report and the accuracy table
linear_svc_train_pred = model_linear_svc.predict(train_post)
linear_svc_test_pred = model_linear_svc.predict(test_post)

# classes_ lists the original labels in encoded order, so no class count is hard-coded
class_names = target_encoder.classes_

print('train classification report \n ', classification_report(train_target, linear_svc_train_pred, target_names=class_names))
print('test classification report \n', classification_report(test_target, linear_svc_test_pred, target_names=class_names))

models_accuracy['Linear Support Vector classifier'] = accuracy_score(test_target, linear_svc_test_pred)



train classification report 
                precision    recall  f1-score   support

        ENFJ       0.91      0.45      0.61       152
        ENFP       0.85      0.77      0.81       540
        ENTJ       0.93      0.64      0.76       185
        ENTP       0.84      0.82      0.83       548
        ESFJ       0.92      0.33      0.49        33
        ESFP       1.00      0.16      0.27        38
        ESTJ       1.00      0.32      0.49        31
        ESTP       0.91      0.44      0.59        71
        INFJ       0.83      0.86      0.85      1176
        INFP       0.77      0.93      0.85      1466
        INTJ       0.83      0.86      0.85       873
        INTP       0.81      0.90      0.85      1043
        ISFJ       0.92      0.68      0.78       133
        ISFP       0.90      0.59      0.71       217
        ISTJ       0.88      0.65      0.75       164
        ISTP       0.90      0.81      0.86       270

    accuracy                           0.82      

In [9]:
# Support Vector Classifier

model_svc = SVC()
model_svc.fit(train_post, train_target)

# Predict each split once and reuse the result for the report and the
# accuracy table instead of scoring the test set twice
svc_train_pred = model_svc.predict(train_post)
svc_test_pred = model_svc.predict(test_post)

# classes_ lists the original labels in encoded order, so no class count is hard-coded
class_names = target_encoder.classes_

print('train classification report \n ', classification_report(train_target, svc_train_pred, target_names=class_names))
print('test classification report \n ', classification_report(test_target, svc_test_pred, target_names=class_names))

models_accuracy['Support Vector classifier'] = accuracy_score(test_target, svc_test_pred)



train classification report 
                precision    recall  f1-score   support

        ENFJ       0.97      0.86      0.91       152
        ENFP       0.96      0.96      0.96       540
        ENTJ       0.99      0.90      0.94       185
        ENTP       0.95      0.96      0.96       548
        ESFJ       1.00      0.58      0.73        33
        ESFP       1.00      0.37      0.54        38
        ESTJ       1.00      0.52      0.68        31
        ESTP       1.00      0.83      0.91        71
        INFJ       0.95      0.97      0.96      1176
        INFP       0.93      0.98      0.96      1466
        INTJ       0.96      0.97      0.96       873
        INTP       0.95      0.98      0.96      1043
        ISFJ       1.00      0.88      0.94       133
        ISFP       0.97      0.90      0.94       217
        ISTJ       0.94      0.91      0.93       164
        ISTP       0.98      0.93      0.95       270

    accuracy                           0.95      

In [10]:
# Multinomial Naive Bayes

model_multinomial_nb = MultinomialNB()
model_multinomial_nb.fit(train_post, train_target)

# Predict each split once and reuse the result for the report and the accuracy table
nb_train_pred = model_multinomial_nb.predict(train_post)
nb_test_pred = model_multinomial_nb.predict(test_post)

# classes_ lists the original labels in encoded order, so no class count is hard-coded
class_names = target_encoder.classes_

print('train classification report \n ', classification_report(train_target, nb_train_pred, target_names=class_names))
print('test classification report \n ', classification_report(test_target, nb_test_pred, target_names=class_names))

models_accuracy['Multinomial Naive Bayes'] = accuracy_score(test_target, nb_test_pred)



train classification report 
                precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00       152
        ENFP       0.93      0.02      0.05       540
        ENTJ       0.00      0.00      0.00       185
        ENTP       0.92      0.08      0.15       548
        ESFJ       0.00      0.00      0.00        33
        ESFP       0.00      0.00      0.00        38
        ESTJ       0.00      0.00      0.00        31
        ESTP       0.00      0.00      0.00        71
        INFJ       0.51      0.62      0.56      1176
        INFP       0.36      0.93      0.52      1466
        INTJ       0.78      0.44      0.56       873
        INTP       0.59      0.65      0.62      1043
        ISFJ       0.00      0.00      0.00       133
        ISFP       0.00      0.00      0.00       217
        ISTJ       0.00      0.00      0.00       164
        ISTP       0.00      0.00      0.00       270

    accuracy                           0.46      

In [11]:
# Decision Tree Classifier

# random_state pins the tree's internal randomness so the scores below are
# reproducible across runs (same seed as the train/test split)
model_tree = DecisionTreeClassifier(max_depth=14, random_state=42)
model_tree.fit(train_post, train_target)

# Predict each split once and reuse the result for the report and the accuracy table
tree_train_pred = model_tree.predict(train_post)
tree_test_pred = model_tree.predict(test_post)

# classes_ lists the original labels in encoded order, so no class count is hard-coded
class_names = target_encoder.classes_

print('train classification report \n ', classification_report(train_target, tree_train_pred, target_names=class_names))
print('test classification report \n ', classification_report(test_target, tree_test_pred, target_names=class_names))

models_accuracy['Decision Tree classifier'] = accuracy_score(test_target, tree_test_pred)



train classification report 
                precision    recall  f1-score   support

        ENFJ       0.80      0.64      0.71       152
        ENFP       0.89      0.82      0.86       540
        ENTJ       0.90      0.70      0.78       185
        ENTP       0.91      0.81      0.86       548
        ESFJ       0.88      0.42      0.57        33
        ESFP       0.80      0.21      0.33        38
        ESTJ       0.73      0.35      0.48        31
        ESTP       0.88      0.41      0.56        71
        INFJ       0.83      0.86      0.84      1176
        INFP       0.67      0.94      0.78      1466
        INTJ       0.87      0.82      0.85       873
        INTP       0.87      0.81      0.84      1043
        ISFJ       0.97      0.56      0.71       133
        ISFP       0.93      0.62      0.75       217
        ISTJ       0.80      0.63      0.70       164
        ISTP       0.95      0.71      0.81       270

    accuracy                           0.81      

In [12]:
# Random Forest Classifier

# random_state pins the bootstrap / feature sampling so the scores below are
# reproducible across runs (same seed as the train/test split)
model_forest = RandomForestClassifier(max_depth=10, random_state=42)
model_forest.fit(train_post, train_target)

# Predict each split once and reuse the result for the report and the accuracy table
forest_train_pred = model_forest.predict(train_post)
forest_test_pred = model_forest.predict(test_post)

# classes_ lists the original labels in encoded order, so no class count is hard-coded
class_names = target_encoder.classes_

print('train classification report \n ', classification_report(train_target, forest_train_pred, target_names=class_names))
print('test classification report \n ', classification_report(test_target, forest_test_pred, target_names=class_names))

models_accuracy['Random Forest Classifier'] = accuracy_score(test_target, forest_test_pred)



train classification report 
                precision    recall  f1-score   support

        ENFJ       1.00      0.01      0.01       152
        ENFP       0.99      0.32      0.49       540
        ENTJ       1.00      0.04      0.07       185
        ENTP       0.99      0.46      0.62       548
        ESFJ       0.00      0.00      0.00        33
        ESFP       0.00      0.00      0.00        38
        ESTJ       0.00      0.00      0.00        31
        ESTP       0.00      0.00      0.00        71
        INFJ       0.77      0.83      0.80      1176
        INFP       0.44      1.00      0.61      1466
        INTJ       0.88      0.73      0.80       873
        INTP       0.81      0.85      0.83      1043
        ISFJ       1.00      0.08      0.14       133
        ISFP       1.00      0.06      0.12       217
        ISTJ       1.00      0.05      0.09       164
        ISTP       1.00      0.28      0.43       270

    accuracy                           0.65      

In [13]:
# XGBoost Classifier

# GPU variant kept for reference:
# model_xgb=XGBClassifier(gpu_id=0,tree_method='gpu_hist',max_depth=5,n_estimators=50,learning_rate=0.1)
model_xgb = XGBClassifier(max_depth=5, n_estimators=50, learning_rate=0.1)
model_xgb.fit(train_post, train_target)

# Predict each split once and reuse the result for the report and the accuracy table
xgb_train_pred = model_xgb.predict(train_post)
xgb_test_pred = model_xgb.predict(test_post)

# classes_ lists the original labels in encoded order, so no class count is hard-coded
class_names = target_encoder.classes_

print('train classification report \n ', classification_report(train_target, xgb_train_pred, target_names=class_names))
print('test classification report \n ', classification_report(test_target, xgb_test_pred, target_names=class_names))

models_accuracy['XGBoost Classifier'] = accuracy_score(test_target, xgb_test_pred)



train classification report 
                precision    recall  f1-score   support

        ENFJ       1.00      0.95      0.98       152
        ENFP       0.95      0.92      0.94       540
        ENTJ       0.99      0.93      0.96       185
        ENTP       0.96      0.93      0.95       548
        ESFJ       1.00      0.97      0.98        33
        ESFP       1.00      0.97      0.99        38
        ESTJ       1.00      0.97      0.98        31
        ESTP       1.00      0.97      0.99        71
        INFJ       0.92      0.91      0.92      1176
        INFP       0.90      0.95      0.92      1466
        INTJ       0.93      0.92      0.93       873
        INTP       0.91      0.93      0.92      1043
        ISFJ       1.00      0.95      0.98       133
        ISFP       1.00      0.93      0.96       217
        ISTJ       0.99      0.96      0.97       164
        ISTP       0.97      0.97      0.97       270

    accuracy                           0.93      

In [14]:
# CatBoost Classifier

# NOTE(review): task_type='GPU' presumably requires a GPU-enabled CatBoost
# setup; switch to task_type='CPU' when running without one - confirm.
model_cat = CatBoostClassifier(loss_function='MultiClass', eval_metric='MultiClass', task_type='GPU', verbose=False)
model_cat.fit(train_post, train_target)

# Predict each split once and reuse the result for the report and the accuracy table
cat_train_pred = model_cat.predict(train_post)
cat_test_pred = model_cat.predict(test_post)

# classes_ lists the original labels in encoded order, so no class count is hard-coded
class_names = target_encoder.classes_

print('train classification report \n ', classification_report(train_target, cat_train_pred, target_names=class_names))
print('test classification report \n ', classification_report(test_target, cat_test_pred, target_names=class_names))

models_accuracy['CatBoost Classifier'] = accuracy_score(test_target, cat_test_pred)



train classification report 
                precision    recall  f1-score   support

        ENFJ       0.86      0.62      0.72       152
        ENFP       0.85      0.79      0.82       540
        ENTJ       0.89      0.66      0.76       185
        ENTP       0.83      0.81      0.82       548
        ESFJ       1.00      0.52      0.68        33
        ESFP       1.00      0.39      0.57        38
        ESTJ       1.00      0.39      0.56        31
        ESTP       0.94      0.62      0.75        71
        INFJ       0.83      0.87      0.85      1176
        INFP       0.81      0.90      0.85      1466
        INTJ       0.83      0.84      0.84       873
        INTP       0.78      0.88      0.83      1043
        ISFJ       0.91      0.70      0.79       133
        ISFP       0.88      0.68      0.77       217
        ISTJ       0.90      0.74      0.81       164
        ISTP       0.90      0.80      0.85       270

    accuracy                           0.83      

In [15]:
# Collect the test accuracies into a two-column table (one row per model).
# The (name, score) pairs are materialised as a list before building the frame.
accuracy = pd.DataFrame(list(models_accuracy.items()), columns=['Models', 'Test accuracy'])

# Best model first; the colour gradient highlights the highest accuracies
accuracy.sort_values(by='Test accuracy', ascending=False, ignore_index=True).style.background_gradient(cmap='Blues')



Unnamed: 0,Models,Test accuracy
0,CatBoost Classifier,0.672046
1,Linear Support Vector classifier,0.661671
2,XGBoost Classifier,0.65879
3,Support Vector classifier,0.649568
4,logistic regression,0.628242
5,Decision Tree classifier,0.514697
6,Random Forest Classifier,0.455908
7,Multinomial Naive Bayes,0.378098


## Export
---

In [16]:
import joblib
import os

def export_models(models=None, path='../models/', active=False):
    """
    Export trained models
    - models : List of tuples (model, 'name'); None (the default) exports nothing
    - path : directory where to save files (created if missing; the trailing
      separator is optional)
    - active : defines if the function should export models when called

    Each model is written to '<path>/<name>.joblib' and the location is printed.
    Returns None.
    """
    if not active:
        return

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(path, exist_ok=True)

    for entry in (models or []):
        model, name = entry[:2]
        # os.path.join only inserts a separator when `path` does not end with one,
        # so both '../models' and '../models/' produce a valid file path
        file_path = os.path.join(path, name + '.joblib')
        joblib.dump(model, file_path)
        print(f'Model saved to {file_path}')



# Directory that receives one .joblib file per model
models_path = '../models/'

# (fitted estimator, file name without extension) pairs to export
models_list = [
    (model_log,            "model_log"),             # Logistic Regression
    (model_linear_svc,     "model_linear_svc"),      # Linear Support Vector Classifier
    (model_svc,            "model_svc"),             # Support Vector Classifier
    (model_multinomial_nb, "model_multinomial_nb"),  # Multinomial Naive Bayes
    (model_tree,           "model_tree"),            # Decision Tree Classifier
    (model_forest,         "model_forest"),          # Random Forest Classifier
    (model_xgb,            "model_xgb"),             # XGBoost Classifier
    (model_cat,            "model_cat"),             # CatBoost Classifier
]

export_models(models=models_list, path=models_path, active=True)




Model saved to ./models/model_log.joblib
Model saved to ./models/model_linear_svc.joblib
Model saved to ./models/model_svc.joblib
Model saved to ./models/model_multinomial_nb.joblib
Model saved to ./models/model_tree.joblib
Model saved to ./models/model_forest.joblib
Model saved to ./models/model_xgb.joblib
Model saved to ./models/model_cat.joblib


In [17]:
# accuracy.to_pickle('../models_accuracy.pkl')

# to_csv does not create missing folders, so make sure the results directory exists first
os.makedirs('../results', exist_ok=True)
accuracy.to_csv('../results/models_accuracy.csv', index=False)



## Test prediction (cc-temp)
---

In [18]:
# # Test - prediction from csv file (one model)

# # Creating a new dataframe with the text data we want to test
# new_data = pd.read_csv('../data/raw/new_data.csv')

# # Loading the trained model and pipelines
# model = joblib.load('../models/model_svc.joblib')
# preprocessing_pipeline = joblib.load('../models/preprocessing_pipeline.joblib')
# target_encoder = joblib.load('../models/target_encoder.joblib')

# # Clean and preprocess the text data
# new_post = preprocessing_pipeline.transform(new_data['posts'])

# # Using the model to predict the personality type
# prediction = model.predict(new_post)
# print(target_encoder.inverse_transform(prediction))



In [19]:
# Test - prediction from dataframe (one model)

# Reload the persisted encoder, preprocessing pipeline and model from disk
target_encoder = joblib.load('../models/target_encoder.joblib')
preprocessing_pipeline = joblib.load('../models/preprocessing_pipeline.joblib')
model = joblib.load('../models/model_svc.joblib')

# One-row dataframe holding the text we want to classify
new_data = pd.DataFrame({
    'posts': ['World is a beautiful place full of amazing people and opportunities ! ||| Hello there !'],
})

# Clean + vectorise the text column, predict, then decode the label
new_post = preprocessing_pipeline.transform(new_data['posts'])
prediction = model.predict(new_post)
print(target_encoder.inverse_transform(prediction))



100%|██████████| 1/1 [00:00<00:00, 1000.07it/s]

['INFP']





In [20]:
# Test - prediction from a list of text (one model)

# Reload the persisted encoder, preprocessing pipeline and model from disk
target_encoder = joblib.load('../models/target_encoder.joblib')
preprocessing_pipeline = joblib.load('../models/preprocessing_pipeline.joblib')
model = joblib.load('../models/model_svc.joblib')

# The input is a plain list with one string per sample
new_data = [
    "'World is a beautiful place full of amazing people and opportunities ! ||| Hello there !",
]

# Clean + vectorise the texts, predict, then decode the label
new_post = preprocessing_pipeline.transform(new_data)
prediction = model.predict(new_post)
print(target_encoder.inverse_transform(prediction))


100%|██████████| 1/1 [00:00<?, ?it/s]

['INFP']





In [None]:
# Test - encoding new data

# Load the saved target encoder
target_encoder = joblib.load('../models/target_encoder.joblib')

# Label(s) to encode, given as a list of type names
new_target = ["INFJ"]

# Map the label(s) to the integer codes used by the fitted encoder
encoded_target = target_encoder.transform(new_target)



In [21]:
# Test - prediction (all models)


# Reload the fitted label encoder and preprocessing pipeline from disk
target_encoder = joblib.load('../models/target_encoder.joblib')
preprocessing_pipeline = joblib.load('../models/preprocessing_pipeline.joblib')

# One-row dataframe with the text we want every model to classify
new_data = pd.DataFrame({
    'posts': ['World is a beautiful place full of amazing people and opportunities ! ||| Hello there !'],
})


def test_models(data, vectorizer, target_encoder):
    models = [
        joblib.load('../models/model_log.joblib'),
        joblib.load('../models/model_linear_svc.joblib'),
        joblib.load('../models/model_svc.joblib'),
        joblib.load('../models/model_multinomial_nb.joblib'),
        joblib.load('../models/model_tree.joblib'),
        joblib.load('../models/model_forest.joblib'),
        joblib.load('../models/model_xgb.joblib'),
        joblib.load('../models/model_cat.joblib')
    ]
    model_names = [
        'Logistic Regression',
        'Linear SVC',
        'SVC',
        'Multinomial NB',
        'Decision Tree',
        'Random Forest',
        'XGBoost',
        'CatBoost'
    ]
    new_post = preprocessing_pipeline.transform(new_data)
    for i, model in enumerate(models):
        prediction = model.predict(new_post)
        print(f'{model_names[i]}: {target_encoder.inverse_transform(prediction)}')


# Run every exported model on the sample text
test_models(
    data=new_data,
    vectorizer=preprocessing_pipeline,
    target_encoder=target_encoder,
)

100%|██████████| 1/1 [00:00<?, ?it/s]

Logistic Regression: ['INTJ']
Linear SVC: ['INTJ']
SVC: ['INTJ']
Multinomial NB: ['INFP']
Decision Tree: ['INFP']
Random Forest: ['INFP']
XGBoost: ['INTP']
CatBoost: ['INTP']



