<a href="https://colab.research.google.com/github/xjdeng/mbtimodel/blob/main/mbti_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the trained model is written there
# later in the notebook.
from google.colab import drive
drive.mount('/content/drive')
# Clone the project repository (it provides the vocabulary file, the dataset
# and requirements.txt read by the following cells) and install its dependencies.
!git clone https://github.com/xjdeng/mbtimodel
!pip install -r mbtimodel/requirements.txt

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
import numpy as np
import string
# Translation table that deletes every character in string.punctuation;
# get_distribution() applies it to strip punctuation before tokenising.
table = str.maketrans('', '', string.punctuation)
import time
from imblearn.over_sampling import RandomOverSampler
import sklearn
import joblib
# Print the scikit-learn version so the version used to train (and pickle)
# the model is visible in the notebook output.
print(sklearn.__version__)

In [None]:
# Read the vocabulary: the file is split on "\n", so `words` holds one entry
# per line in file order (its length defines the feature-vector size).
with open("mbtimodel/english10000.txt", 'r') as vocab_file:
    words = vocab_file.read().split("\n")

# Set view of the vocabulary for fast membership tests.
wordset = set(words)

# Map every vocabulary entry to its position in `words`. The None -> 0 entry
# is seeded first; if an entry occurs more than once, its last position wins.
indexdict = {None: 0}
indexdict.update((word, position) for position, word in enumerate(words))
def get_distribution(txt):
    """Return the relative frequency of each vocabulary word in ``txt``.

    The text has punctuation removed via the module-level ``table``, is
    lower-cased and split on whitespace. Tokens that are not in ``wordset``
    are ignored.

    Returns:
        A list with one value per entry of ``words`` (positions given by
        ``indexdict``): each count divided by the total number of vocabulary
        hits, or a list of zeros when no token is in the vocabulary.
    """
    vocab_size = len(words)
    counts = [0] * vocab_size
    tokens = txt.translate(table).lower().split()
    for token in tokens:
        if token not in wordset:
            continue
        counts[indexdict[token]] += 1
    total = sum(counts)
    if total == 0:
        # No vocabulary hits: return zeros instead of dividing by zero.
        return [0] * vocab_size
    return [count / total for count in counts]

In [None]:
# Load the dataset from the zip archive in the cloned repo; later cells read
# its 'type' (label) and 'posts' (text) columns.
mbti = pd.read_csv("mbtimodel/mbti.zip")
# Preview the first rows.
mbti.head()

In [None]:
# Show how many rows each label in the 'type' column has.
# A single value_counts() pass replaces re-filtering the whole frame once per
# type, and prints in a stable order (most frequent first) instead of the
# arbitrary iteration order of a set. value_counts() skips missing labels.
type_counts = mbti['type'].value_counts()
# `types` is still defined, as in the original cell, in case other cells use it.
types = set(type_counts.index)
for t, count in type_counts.items():
  print(f"Type: {t}, Count: {count}")

In [None]:
# Feature matrix: one row per entry of mbti['posts'], each row being the
# word-frequency vector returned by get_distribution().
x = np.array([get_distribution(p) for p in mbti['posts']])
# Labels: the 'type' column, aligned row-for-row with x.
y = np.array(mbti['type'])


In [None]:
# Train the model that gets saved, using every available sample.
# random_state=0 (the same seed the train/test split in this notebook uses)
# makes both the random oversampling and the boosting run repeatable, so a
# Restart & Run All reproduces the same saved model.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=0)
# NOTE(review): per the imbalanced-learn docs, sampling_strategy='minority'
# resamples only the single smallest class; other under-represented types are
# left as they are. Confirm this is intended rather than 'not majority'.
X,Y = oversample.fit_resample(x, y)
# The name `rf` is kept because later cells reference it; the estimator is a
# gradient-boosting classifier.
rf = GradientBoostingClassifier(random_state=0)
rf.fit(X,Y)

In [None]:
# Persist the fitted model to the Google Drive mounted at /content/drive.
joblib.dump(rf,"/content/drive/MyDrive/mbtimodel.pkl")

In [None]:
def get_prediction(txt):
  """Return the label the fitted model ``rf`` predicts for the text ``txt``.

  The text is converted to a word-frequency vector with get_distribution()
  and passed to ``rf.predict`` as a single-row batch.
  """
  features = [get_distribution(txt)]
  predicted = rf.predict(features)
  return predicted[0]

In [None]:
# Smoke test: classify a short sample string with the fitted model.
get_prediction("Hello World")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Hold out 20% of the samples for evaluation. stratify=y keeps the label
# proportions of y in both splits; random_state=0 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,
                                                    random_state=0, stratify=y)

In [None]:
# Oversample the training split only, so the held-out test set keeps its
# original samples. random_state=0 (the seed already used for the train/test
# split) makes the random resampling repeatable across runs.
# NOTE(review): sampling_strategy='minority' resamples only the single
# smallest class per the imbalanced-learn docs; confirm that is intended.
oversample2 = RandomOverSampler(sampling_strategy='minority', random_state=0)
x_train2, y_train2 = oversample2.fit_resample(x_train, y_train)

In [None]:
# Fit the evaluation model on the oversampled training split.
# random_state=0 (the seed already used for the train/test split) fixes the
# estimator's internal randomness so the reported metrics are repeatable.
rf2 = GradientBoostingClassifier(random_state=0)
rf2.fit(x_train2, y_train2)

In [None]:
# Predict labels for the held-out test features with the evaluation model.
y_pred = rf2.predict(x_test)

In [None]:
from sklearn.metrics import f1_score

In [None]:
# f1_score expects (y_true, y_pred): pass the true labels first. With
# average='micro' the value is the same either way, but the swapped order
# would give wrong results as soon as the averaging mode is changed.
f1_score(y_test, y_pred, average='micro')

In [None]:
# Confusion matrix of true (rows) vs. predicted (columns) labels, with both
# axes ordered by rf2.classes_.
cm = confusion_matrix(y_test, y_pred, labels=rf2.classes_)

In [None]:
# Wrap the matrix for plotting, labelling the axes with the same class order
# (rf2.classes_) that was used to build it.
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=rf2.classes_)

In [None]:
# Render the confusion matrix figure.
disp.plot()