In [2]:
import pandas as pd
import warnings

warnings.filterwarnings(action="ignore", message=r'.*Use subset.*of np.ndarray is not recommended')
warnings.filterwarnings('always')  # "error", "ignore", "always", "default", "module" or "once"
warnings.filterwarnings("ignore")

from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
df = pd.read_csv('input/mbti_data_clean.csv')

In [4]:
# Build the vocabulary from 1500 words that are not common words or MBTI personalities
vectorizer = CountVectorizer(stop_words = ['and','the','to','of','infj','entp','intp','intj','entj','enfj','infp','enfp','isfp','istp','isfj','istj','estp','esfp','estj','esfj','infjs','entps','intps','intjs','entjs','enfjs','infps','enfps','isfps','istps','isfjs','istjs','estps','esfps','estjs','esfjs'], max_features=1500, analyzer='word', max_df=0.8, min_df=0.1)

corpus = df['clean_posts'].values.reshape(1,-1).tolist()[0]

vectorizer.fit(corpus)

X_cnt = vectorizer.fit_transform(corpus)

# Transform the count matrix to a tf-idf representation
tfizer = TfidfTransformer()
tfizer.fit(X_cnt)

X = tfizer.fit_transform(X_cnt).toarray()

all_words = vectorizer.get_feature_names()

n_words = len(all_words)

X_df = pd.DataFrame.from_dict({w: X[:, i] for i, w in enumerate(all_words)})

In [5]:
def balance_random_oversample(X_train, y_train):
    oversampler = RandomOverSampler(sampling_strategy=0.9, random_state=0)
    
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
    
    return X_resampled, y_resampled

def balance_smote_oversample(X_train, y_train):
    oversampler = SMOTE(random_state=0)
    
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
    
    return X_resampled, y_resampled

def balance_smote_oversample_random_undersample(X_train, y_train):
    oversampler = SMOTE(sampling_strategy=0.9, random_state=0) 
    undersampler = RandomUnderSampler(sampling_strategy=0.9, random_state=0)

    # Applying oversampler to oversample the minority class
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    # Applying undersampler to reduce the majority class
    X_resampled, y_resampled = undersampler.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled

def balance_borderlinesmote_oversample(X_train, y_train):
    oversampler = BorderlineSMOTE(random_state=0)
    
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
    
    return X_resampled, y_resampled

def balance_smoteenn_oversample_random_undersample(X_train, y_train):
    oversampler = SMOTEENN(sampling_strategy='auto', random_state=42)
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=0)
    
    # Apply SMOTEENN (combination of SMOTE and ENN)
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    # Apply random undersampling
    X_resampled, y_resampled = undersampler.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled

def balance_smoteen_adasyn_oversample_random_undersample(X_train, y_train):
    oversampler_1 = SMOTEENN(sampling_strategy='auto', random_state=42)
    oversampler_2 = ADASYN(sampling_strategy=0.6, random_state=0, n_neighbors=5)
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=0)

    # Balance the dataset using a combination of SMOTE and ENN
    X_resampled, y_resampled = oversampler_1.fit_resample(X_train, y_train)

    # Apply ADASYN for additional oversampling
    X_resampled, y_resampled = oversampler_2.fit_resample(X_resampled, y_resampled)

    # Apply additional undersampling to reduce the majority class size
    X_resampled, y_resampled = undersampler.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled

In [6]:
classifiers = {
    'Dummy': lambda: DummyClassifier(strategy='most_frequent', random_state=42),
    'SVM': lambda: SVC(probability=True, random_state=42),
    'LGBM': lambda: LGBMClassifier(random_state=42),
    'KNeighbors': lambda: KNeighborsClassifier(),    
    'DecisionTree': lambda: DecisionTreeClassifier(random_state=42),
    'RandomForest': lambda: RandomForestClassifier(random_state=42),
    'AdaBoost': lambda: AdaBoostClassifier(),
    'GradientBoosting': lambda: GradientBoostingClassifier(),
    'GaussianNB': lambda: GaussianNB(),
    'LogisticRegression': lambda: LogisticRegression(random_state=42),
    'XGB': lambda: XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'VotingClassifier': lambda: VotingClassifier(estimators=[
        ('SVM', SVC(probability=True, random_state=42)),
        ('RandomForest', RandomForestClassifier(random_state=42)),
        ('LogisticRegression', LogisticRegression(random_state=42)),
        ('XGB', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ], voting='soft')
}

In [7]:
resamplers = {
  'RandomOverSampler': 'balance_random_oversample',
  'SMOTE': 'balance_smote_oversample',
  'SMOTE + RandomUnderSampler': 'balance_smote_oversample_random_undersample',
  'BorderlineSMOTE': 'balance_borderlinesmote_oversample',
  'SMOTEENN + RandomUnderSampler': 'balance_smoteenn_oversample_random_undersample',
  'SMOTEENN & ADASYN + RandomUnderSampler': 'balance_smoteen_adasyn_oversample_random_undersample'
}

In [8]:
def train_test_split_data(class_name, resampler=None):
  y_df = df[class_name]
  
  if resampler == None:
    X_cl = X_df
    y_cl = y_df
  else:
    rs = resamplers[resampler]

    X_cl, y_cl = globals()[rs](X_df, y_df)

  X_cl_train, X_cl_test, y_cl_train, y_cl_test = train_test_split(X_cl, y_cl, test_size=0.2, random_state=42, stratify=y_cl)

  return X_cl_train, X_cl_test, y_cl_train, y_cl_test

In [9]:
# Define class model based on classifer and balancer
def classifier_model(class_name, classifier, resampler=None):
    X_train, X_test, y_train, y_test = train_test_split_data(class_name, resampler)
    
    model_classifier = classifiers[classifier]()

    model = model_classifier.fit(X_train, y_train)

    return model 

In [10]:
def test_mbti(model, text):
    final_test = tfizer.transform(vectorizer.transform([text.lower()])).toarray()

    test_point = pd.DataFrame.from_dict({w: final_test[:, i] for i, w in enumerate(all_words)})

    test_result = model.predict_proba(test_point)

    return test_result

In [54]:
# Best performed models
ei_model = classifier_model('E_I', 'RandomForest', 'RandomOverSampler')
sn_model = classifier_model('S_N', 'RandomForest', 'RandomOverSampler')
tf_model = classifier_model('T_F', 'SVM')
jp_model = classifier_model('J_P', 'VotingClassifier', 'SMOTE')

In [61]:
# Models test
helpdesk_topic = "To Whom It May Concern, I am writing today to complain of the poor service I received from your company on June 12, 2023. I was visited by a representative of That Awful Company, Mr. Madman, at my home on that day. I trust this is not the way That Awful Company wishes to conduct business with valued customers—I have been with you since the company was founded and have never encountered such treatment before. I would welcome the opportunity to discuss matters further and to learn of how you propose to prevent a similar situation from recurring. I look forward to hearing from you. Yours faithfully, Customer"

# E_I model test
ei_class_res = test_mbti(ei_model, helpdesk_topic)
ei_class_str = 'E' if (ei_class_res[0][0] < ei_class_res[0][1]) else 'I'

print(ei_class_res)
print(ei_class_str)

# S_N model test
sn_class_res = test_mbti(sn_model, helpdesk_topic)
sn_class_str = 'S' if (sn_class_res[0][0] < sn_class_res[0][1]) else 'N'

print(sn_class_res)
print(sn_class_str)

# model test
tf_class_res = test_mbti(tf_model, helpdesk_topic)
tf_class_str = 'T' if (tf_class_res[0][0] < tf_class_res[0][1]) else 'F'

print(tf_class_res)
print(tf_class_str)

# model test
jp_class_res = test_mbti(jp_model, helpdesk_topic)
jp_class_str = 'J' if (jp_class_res[0][0] < jp_class_res[0][1]) else 'P'

print(jp_class_res)
print(jp_class_str)

[[0.85 0.15]]
I
[[0.71 0.29]]
N
[[0.71456327 0.28543673]]
F
[[0.46536414 0.53463586]]
J


In [62]:
#####################################
# Test class models on cleaned data #
#####################################

test_df = pd.read_csv('input/mbti_data_clean.csv')

test_mbti_classifications = []

correct_types = 0

for i in range(len(test_df)):
    t = test_df.iloc[i]['clean_posts']
    type = test_df.iloc[i]['type']

    ei = test_mbti(ei_model, t)
    ei_str = 'E' if (ei[0][0] < ei[0][1]) else 'I'

    sn = test_mbti(sn_model, t)
    sn_str = 'S' if sn[0][0] < sn[0][1] else 'N'

    tf = test_mbti(tf_model, t)
    tf_str = 'T' if tf[0][0] < tf[0][1] else 'F'

    jp = test_mbti(jp_model, t)
    jp_str = 'J' if jp[0][0] < jp[0][1] else 'P'

    mbti_type = ei_str + sn_str + tf_str + jp_str

    test_mbti_classifications.append(mbti_type)

    if mbti_type == type:
        correct_types = correct_types + 1

print('Correctly Recognized MBTI Types: {:0.1f}% ({:n}/{:n})'.format(correct_types/len(test_df)*100, correct_types, len(test_df)))

Correctly Recognized MBTI Types: 85.6% (7422/8675)


In [65]:
#################################
# Test class models on raw data #
#################################

df_orig = pd.read_csv('input/mbti_data.csv')

test_df = df_orig.copy()

test_mbti_classifications = []

correct_types = 0

for i in range(len(test_df)):
    t = test_df.iloc[i]['posts']
    type = test_df.iloc[i]['type']

    ei = test_mbti(ei_model, t)
    ei_str = 'E' if (ei[0][0] < ei[0][1]) else 'I'

    sn = test_mbti(sn_model, t)
    sn_str = 'S' if sn[0][0] < sn[0][1] else 'N'

    tf = test_mbti(tf_model, t)
    tf_str = 'T' if tf[0][0] < tf[0][1] else 'F'

    jp = test_mbti(jp_model, t)
    jp_str = 'J' if jp[0][0] < jp[0][1] else 'P'

    mbti_type = ei_str + sn_str + tf_str + jp_str

    test_mbti_classifications.append(mbti_type)

    if mbti_type == type:
        correct_types = correct_types + 1

print('Correctly Recognized MBTI Types: {:0.1f}% ({:n}/{:n})'.format(correct_types/len(test_df)*100, correct_types, len(test_df)))

Correctly Recognized MBTI Types: 83.9% (7276/8675)
