In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import make_scorer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('input/clean_data.csv')

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Count how many rows carry each distinct value of the 'type' column.
df['type'].value_counts()

In [None]:
# The sixteen four-letter type codes that are stripped from the text before counting.
# NOTE(review): presumably removed so a post that names a type cannot leak the label
# into the features - confirm against how 'clean_posts' was produced.
MBTI_TYPES = [
    'infj', 'entp', 'intp', 'intj', 'entj', 'enfj', 'infp', 'enfp',
    'isfp', 'istp', 'isfj', 'istj', 'estp', 'esfp', 'estj', 'esfj',
]

# A few very common English words, every type code, and every pluralised type code.
STOP_WORDS = ['and', 'the', 'to', 'of'] + MBTI_TYPES + [code + 's' for code in MBTI_TYPES]

# Bag-of-words counter: keeps at most 1500 terms that occur in at least 10% and at
# most 80% of the documents.
vectorizer = CountVectorizer(
    stop_words=STOP_WORDS,
    max_features=1500,
    analyzer="word",
    max_df=0.8,
    min_df=0.1)

# One document per row of the 'clean_posts' column.
corpus = df['clean_posts'].tolist()

# fit_transform learns the vocabulary and builds the count matrix in a single pass;
# the separate fit() call that used to precede it fitted the same corpus twice.
X_cnt = vectorizer.fit_transform(corpus)

X_cnt

In [None]:
# Transform the count matrix to a tf-idf representation
tfizer = TfidfTransformer()

# fit_transform learns the idf weights and applies them in one call, so the extra
# fit() that used to run first was redundant work. The sparse result is densified
# because the rest of the notebook indexes X as a regular 2-D array.
X = tfizer.fit_transform(X_cnt).toarray()

In [None]:
# Vocabulary terms in column order of the count / tf-idf matrices.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    all_words = vectorizer.get_feature_names_out().tolist()
else:
    all_words = vectorizer.get_feature_names()

n_words = len(all_words)

n_words

In [None]:
# Wrap the dense tf-idf matrix in a DataFrame with one column per vocabulary term,
# in the same order as the matrix columns.
X_df = pd.DataFrame(X, columns=all_words)

X_df

In [None]:
# CLASSIFIERS GENERAL TEST BEGIN

In [None]:
# Baseline estimators compared by sub_classifiers_basic_test, keyed by the label
# shown in the result table. Iteration order is the row order of that table.
# Only the dummy baseline is seeded; the other estimators use library defaults.
classifiers = {
    "DummyClassifier most_frequent": DummyClassifier(
        strategy='most_frequent',
        random_state=0,
    ),
    "LGBMClassifier": LGBMClassifier(),
    "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=3),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "GaussianNB": GaussianNB(),
    "LogisticRegression": LogisticRegression(
        solver='lbfgs',
        max_iter=1000,
    ),
    "XGBClassifier": XGBClassifier(use_label_encoder=False),
}

In [None]:

def sub_classifiers_basic_test(keyword, cv=5):
    """Benchmark every estimator in ``classifiers`` on one label column.

    ``X_df`` and ``df[keyword]`` are split once (stratified, unseeded, default
    proportions) into a training part and a hold-out part. For each classifier:

    * average precision is cross-validated on the training part only, so the
      hold-out rows never take part in fitting; and
    * the classifier is refitted on the whole training part and scored once on
      the hold-out part for accuracy and ROC AUC.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_val_score``. Pass e.g. a
        ``RepeatedStratifiedKFold`` instance for a more thorough estimate.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns ``model``, ``run_time``
        (elapsed minutes, formatted as a string), ``avg_pre``,
        ``avg_pre_std``, ``accuracy`` and ``auc``.

    Notes
    -----
    The estimator objects stored in ``classifiers`` are fitted in place.
    Average precision and AUC are computed from hard class predictions, not
    from probabilities, which keeps them comparable with the other result
    tables in this notebook.
    """
    y_df = df[keyword].values

    X_train, X_test, y_train, y_test = train_test_split(
        X_df,
        y_df,
        stratify=y_df)

    # The scorer does not depend on the classifier, so build it once.
    scorer = make_scorer(average_precision_score)

    # Collect plain dicts and build the frame once at the end: DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    rows = []

    for key, classifier in classifiers.items():

        print('*', key)

        start_time = time.time()

        # cross_val_score clones the estimator for every fold, so this does not
        # interfere with the final fit below.
        cv_scores = cross_val_score(classifier, X_train, y_train, cv=cv, scoring=scorer)

        model = classifier.fit(
            X_train,
            y_train,
            )

        # Predict once and reuse the result for every hold-out metric.
        y_pred = model.predict(X_test)

        rows.append({
            'model': key,
            'run_time': format(round((time.time() - start_time)/60,2)),
            'avg_pre': cv_scores.mean(),
            'avg_pre_std': cv_scores.std(),
            'accuracy': accuracy_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred)
        })

    return pd.DataFrame(
        rows,
        columns=['model', 'run_time', 'avg_pre', 'avg_pre_std', 'accuracy', 'auc'])

In [None]:
# Run the baseline comparison of every entry in `classifiers` on the 'S_N' label column.
sn_classifiers_basic_test = sub_classifiers_basic_test("S_N")

In [None]:
# Show the first ten result rows ordered by ascending accuracy.
# Note: head(10) runs before the sort, so only the first ten rows are ranked.
sn_classifiers_basic_test.head(10).sort_values(by='accuracy')

In [None]:
# Free-text sample passed to test_mbti() to spot-check the trained models.
test_string = 'I like to observe, think, and analyze to find cons and pros. Based on my analysis, I like to create a solution based on cost effective analysis to maximize the resource to improve the performance. I like talking to my friends. I like to read and learn. I simulate a lot of different situations to see how I would react. I read or watch a lot to improve myself. I love talking to them and seeing what they have been up to. I have a variety of friends, and I appreciate they all experience different things. Listening to their emotion, experience, and life is always great.'

In [None]:
def test_mbti(model, text):
    """Return ``model.predict_proba`` for a single piece of raw text.

    The text is lower-cased, counted with the fitted ``vectorizer``,
    re-weighted with the fitted ``tfizer`` and wrapped in a one-row DataFrame
    whose columns are ``all_words``, i.e. the same layout the models were
    trained on.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    text : str

    Returns
    -------
    Whatever ``model.predict_proba`` returns for the one-row input.
    """
    term_counts = vectorizer.transform([text.lower()])
    tfidf_row = tfizer.transform(term_counts).toarray()

    # Named columns keep the input aligned with the training feature table.
    sample = pd.DataFrame(tfidf_row, columns=all_words)

    return model.predict_proba(sample)

In [None]:
# Empty comparison table; later cells add one row per XGBoost variant.
df_result = pd.DataFrame(columns=['model', 'tp', 'tn', 'fp', 'fn', 'correct', 'incorrect', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc','avg_pre'])

In [None]:

def sub_classifier_test_01_xbg(keyword):
    """Evaluate a default XGBoost classifier trained with early stopping.

    ``X_df`` / ``df[keyword]`` are split (stratified, unseeded) into a training
    part and a test part. The training part is split once more so that early
    stopping monitors a validation set; the test part is used only for the
    reported metrics. Monitoring the test part itself would let it choose the
    number of boosting rounds and bias the metrics optimistically.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``. Labels are assumed to be binary:
        the confusion matrix is unpacked into exactly four cells.

    Returns
    -------
    dict
        One result-table row: ``model``, the four confusion-matrix counts,
        ``correct`` / ``incorrect`` totals and six metrics rounded to three
        decimals. ``roc_auc`` and ``avg_pre`` are computed from hard class
        predictions, as in the other rows of the table.
    """
    y_df = df[keyword].values

    X_train, X_test, y_train, y_test = train_test_split(
        X_df,
        y_df,
        stratify=y_df)

    # Validation rows for early stopping come out of the training part only.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train,
        y_train,
        stratify=y_train)

    classifier = XGBClassifier(use_label_encoder=False)

    # Stop adding trees once validation log-loss has not improved for 10 rounds.
    model = classifier.fit(
        X_fit,
        y_fit,
        early_stopping_rounds = 10,
        eval_metric="logloss",
        eval_set=[(X_val, y_val)], verbose=False)

    y_pred = model.predict(X_test)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    avg_precision = average_precision_score(y_test, y_pred)

    df_result_row = {
            'model': 'XGBClassifier simple',
            'tp': tp,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'correct': tp+tn,
            'incorrect': fp+fn,
            'accuracy': round(accuracy,3),
            'precision': round(precision,3),
            'recall': round(recall,3),
            'f1': round(f1,3),
            'roc_auc': round(roc_auc,3),
            'avg_pre': round(avg_precision,3),
        }

    return df_result_row

In [None]:
def sub_classifier_model_01_xbg(keyword):
    """Train the default XGBoost variant with early stopping and return it.

    The data is split with an unseeded stratified ``train_test_split``; the
    second part of the split is used only as the early-stopping monitor.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``.

    Returns
    -------
    The fitted ``XGBClassifier``.
    """
    labels = df[keyword].values

    X_fit, X_monitor, y_fit, y_monitor = train_test_split(
        X_df, labels, stratify=labels)

    booster = XGBClassifier(use_label_encoder=False)

    # Stop adding trees once log-loss on the monitored split stalls for 10 rounds.
    return booster.fit(
        X_fit,
        y_fit,
        early_stopping_rounds=10,
        eval_metric="logloss",
        eval_set=[(X_monitor, y_monitor)],
        verbose=False,
    )

In [None]:
# Test table row: evaluate the default XGBoost variant (early stopping) on the 'S_N' labels.
df_result_row_1 = sub_classifier_test_01_xbg("S_N")

In [None]:
# Add the row to the comparison table. DataFrame.append was removed in pandas 2.0;
# pd.concat with a one-row frame works on old and new pandas alike.
df_result = pd.concat([df_result, pd.DataFrame([df_result_row_1])], ignore_index=True)
df_result.head()

In [None]:
# Classifier model: train the default XGBoost variant on 'S_N' for ad-hoc predictions.
# NOTE(review): this refits on a fresh unseeded split, so it is not the exact model scored in the table.
sn_classifier_model_01_xbg = sub_classifier_model_01_xbg('S_N')

In [None]:
# Classifier test: predict_proba output of the model above for the sample text.
test_mbti(sn_classifier_model_01_xbg, test_string)

In [None]:
# Same model on a second, much shorter piece of text.
test_mbti(sn_classifier_model_01_xbg, 'Not sure what to say! I am pretty nervous since I am waiting here for hours.')

In [None]:
def sub_classifier_test_02_xbg(keyword):
    """Evaluate a seeded XGBoost classifier on a fixed 70/30 hold-out split.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``. Labels are assumed to be binary:
        the confusion matrix is unpacked into exactly four cells.

    Returns
    -------
    dict
        One result-table row: ``model``, the four confusion-matrix counts,
        ``correct`` / ``incorrect`` totals and six metrics rounded to three
        decimals.
    """
    target = df[keyword].values

    # Fixed seed + stratification: every call sees the same 70/30 partition.
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X_df, target, test_size=0.3, random_state=0, shuffle=True, stratify=target)

    fitted = XGBClassifier(random_state=123).fit(features_fit, target_fit)
    predicted = fitted.predict(features_eval)

    tn, fp, fn, tp = confusion_matrix(target_eval, predicted).ravel()

    summary = {
        'model': 'XGBClassifier add params',
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'correct': tp + tn,
        'incorrect': fp + fn,
    }

    # NOTE(review): roc_auc and avg_pre are fed hard class labels, not probabilities.
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
        ('roc_auc', roc_auc_score),
        ('avg_pre', average_precision_score),
    )
    for column, scorer in scorers:
        summary[column] = round(scorer(target_eval, predicted), 3)

    return summary

In [None]:
def sub_classifier_model_02_xbg(keyword):
    """Train the seeded XGBoost variant on the fixed 70% training part.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``.

    Returns
    -------
    The fitted ``XGBClassifier``.
    """
    labels = df[keyword].values

    # train_test_split returns [X_train, X_test, y_train, y_test]; only the
    # training halves are needed here.
    parts = train_test_split(
        X_df, labels, test_size=0.3, random_state=0, shuffle=True, stratify=labels)
    X_fit, y_fit = parts[0], parts[2]

    return XGBClassifier(random_state=123).fit(X_fit, y_fit)

In [None]:
# Test table row: evaluate the seeded XGBoost variant on the 'S_N' labels.
df_result_row_2 = sub_classifier_test_02_xbg("S_N")

In [None]:
# Add the row to the comparison table. DataFrame.append was removed in pandas 2.0;
# pd.concat with a one-row frame works on old and new pandas alike.
df_result = pd.concat([df_result, pd.DataFrame([df_result_row_2])], ignore_index=True)
df_result.head()

In [None]:
# Classifier model: retrain the seeded variant on 'S_N' (same split seed and model seed as the test function).
sn_classifier_model_02_xbg = sub_classifier_model_02_xbg('S_N')

In [None]:
# Classifier test: predict_proba output of the model above for the sample text.
test_mbti(sn_classifier_model_02_xbg, test_string)

In [None]:
def sub_classifier_test_03_xbg_smote_over(keyword):
    """Evaluate XGBoost trained on SMOTE-oversampled data.

    Only the 70% training part is resampled; the 30% hold-out part keeps its
    original class balance and is used for every metric.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``. Labels are assumed to be binary:
        the confusion matrix is unpacked into exactly four cells.

    Returns
    -------
    dict
        One result-table row: ``model``, the four confusion-matrix counts,
        ``correct`` / ``incorrect`` totals and six metrics rounded to three
        decimals.
    """
    labels = df[keyword].values

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_df, labels, test_size=0.3, random_state=0, shuffle=True, stratify=labels)

    # Default SMOTE strategy; seeded so the synthetic rows are reproducible.
    X_balanced, y_balanced = SMOTE(random_state=0).fit_resample(X_fit, y_fit)

    booster = XGBClassifier(random_state=222).fit(X_balanced, y_balanced)
    predictions = booster.predict(X_holdout)

    tn, fp, fn, tp = confusion_matrix(y_holdout, predictions).ravel()

    row = {
        'model': 'XGBClassifier + SMOTE oversample',
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'correct': tp + tn,
        'incorrect': fp + fn,
    }

    # NOTE(review): roc_auc and avg_pre are fed hard class labels, not probabilities.
    metric_by_column = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
        'roc_auc': roc_auc_score,
        'avg_pre': average_precision_score,
    }
    row.update({
        column: round(metric(y_holdout, predictions), 3)
        for column, metric in metric_by_column.items()
    })

    return row

In [None]:
def sub_classifier_model_03_xbg_smote_over(keyword):
    """Train XGBoost on the SMOTE-oversampled 70% training part and return it.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``.

    Returns
    -------
    The fitted ``XGBClassifier``.
    """
    labels = df[keyword].values

    # Same seeded, stratified 70/30 split as the matching test function; the
    # hold-out part is not used here.
    X_fit, _, y_fit, _ = train_test_split(
        X_df, labels, test_size=0.3, random_state=0, shuffle=True, stratify=labels)

    X_balanced, y_balanced = SMOTE(random_state=0).fit_resample(X_fit, y_fit)

    return XGBClassifier(random_state=222).fit(X_balanced, y_balanced)

In [None]:
# Test table row: evaluate XGBoost + SMOTE oversampling on the 'S_N' labels.
df_result_row_3 = sub_classifier_test_03_xbg_smote_over("S_N")

In [None]:
# Add the row to the comparison table. DataFrame.append was removed in pandas 2.0;
# pd.concat with a one-row frame works on old and new pandas alike.
df_result = pd.concat([df_result, pd.DataFrame([df_result_row_3])], ignore_index=True)
df_result.head()

In [None]:
# Classifier model: retrain the SMOTE-oversampled variant on 'S_N' (same seeds as the test function).
sn_classifier_model_03_xbg_smote_over = sub_classifier_model_03_xbg_smote_over('S_N')

In [None]:
# Classifier test: predict_proba output of the model above for the sample text.
test_mbti(sn_classifier_model_03_xbg_smote_over, test_string)

In [None]:
# XGB test 3 END

# XGB test 4 BEGIN

In [None]:
def sub_classifier_test_04_xbg_smote_over_under(keyword):
    """Evaluate XGBoost trained on data that is SMOTE-oversampled, then randomly undersampled.

    The 70% training part is first oversampled with SMOTE
    (``sampling_strategy=0.6``, 4 neighbours) and then passed through
    ``RandomUnderSampler`` (``sampling_strategy=0.7``). The 30% hold-out part
    is left untouched and used for every metric.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``. Labels are assumed to be binary:
        the confusion matrix is unpacked into exactly four cells.

    Returns
    -------
    dict
        One result-table row: ``model``, the four confusion-matrix counts,
        ``correct`` / ``incorrect`` totals and six metrics rounded to three
        decimals.
    """
    labels = df[keyword].values

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_df, labels, test_size=0.3, random_state=0, shuffle=True, stratify=labels)

    # Step 1: synthesise minority rows.
    smote = SMOTE(sampling_strategy=0.6, random_state=0, k_neighbors=4)
    X_over, y_over = smote.fit_resample(X_fit, y_fit)

    # Step 2: drop majority rows at random.
    trimmer = RandomUnderSampler(sampling_strategy=0.7, random_state=0)
    X_final, y_final = trimmer.fit_resample(X_over, y_over)

    booster = XGBClassifier(random_state=222).fit(X_final, y_final)
    y_hat = booster.predict(X_holdout)

    def rounded(metric):
        # Score the hold-out predictions and round to three decimals.
        return round(metric(y_holdout, y_hat), 3)

    tn, fp, fn, tp = confusion_matrix(y_holdout, y_hat).ravel()

    # NOTE(review): roc_auc and avg_pre are fed hard class labels, not probabilities.
    return {
        'model': 'XGBClassifier + SMOTE over/undersample',
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'correct': tp + tn,
        'incorrect': fp + fn,
        'accuracy': rounded(accuracy_score),
        'precision': rounded(precision_score),
        'recall': rounded(recall_score),
        'f1': rounded(f1_score),
        'roc_auc': rounded(roc_auc_score),
        'avg_pre': rounded(average_precision_score),
    }

In [None]:
def sub_classifier_model_04_xbg_smote_over_under(keyword):
    """Train XGBoost on the over- then under-sampled 70% training part and return it.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``.

    Returns
    -------
    The fitted ``XGBClassifier``.
    """
    labels = df[keyword].values

    # Seeded, stratified 70/30 split; the hold-out part is not used here.
    X_fit, _, y_fit, _ = train_test_split(
        X_df, labels, test_size=0.3, random_state=0, shuffle=True, stratify=labels)

    # SMOTE first, random undersampling second, both seeded.
    smote = SMOTE(sampling_strategy=0.6, random_state=0, k_neighbors=4)
    X_over, y_over = smote.fit_resample(X_fit, y_fit)

    trimmer = RandomUnderSampler(sampling_strategy=0.7, random_state=0)
    X_final, y_final = trimmer.fit_resample(X_over, y_over)

    return XGBClassifier(random_state=222).fit(X_final, y_final)

In [None]:
# Test table row: evaluate XGBoost + SMOTE over/undersampling on the 'S_N' labels.
df_result_row_4 = sub_classifier_test_04_xbg_smote_over_under("S_N")

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# works on old and new pandas alike.
df_result = pd.concat([df_result, pd.DataFrame([df_result_row_4])], ignore_index=True)
df_result.head()

In [None]:
# Classifier model: retrain the over/undersampled variant on 'S_N' (same seeds as the test function).
sn_classifier_model_04_xbg_smote_over_under = sub_classifier_model_04_xbg_smote_over_under('S_N')

In [None]:
# Classifier test: predict_proba output of the model above for the sample text.
test_mbti(sn_classifier_model_04_xbg_smote_over_under, test_string)

In [None]:
# XGB test 4 END

# XGB test 5 BEGIN

In [None]:
def sub_classifier_test_05_xbg_smote_borderline(keyword):
    """Evaluate XGBoost trained on Borderline-SMOTE-oversampled data.

    Only the 70% training part is resampled; the 30% hold-out part is used
    unchanged for every metric.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``. Labels are assumed to be binary:
        the confusion matrix is unpacked into exactly four cells.

    Returns
    -------
    dict
        One result-table row: ``model``, the four confusion-matrix counts,
        ``correct`` / ``incorrect`` totals and six metrics rounded to three
        decimals.
    """
    labels = df[keyword].values

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_df, labels, test_size=0.3, random_state=0, shuffle=True, stratify=labels)

    sampler = BorderlineSMOTE(random_state=0)
    X_balanced, y_balanced = sampler.fit_resample(X_fit, y_fit)

    booster = XGBClassifier(random_state=222)
    booster.fit(X_balanced, y_balanced)
    y_hat = booster.predict(X_holdout)

    # ravel() on the 2x2 matrix yields the cells in this order.
    cells = dict(zip(('tn', 'fp', 'fn', 'tp'), confusion_matrix(y_holdout, y_hat).ravel()))

    row = {'model': 'XGBClassifier with Borderline SMOTE'}
    for cell in ('tp', 'tn', 'fp', 'fn'):
        row[cell] = cells[cell]
    row['correct'] = cells['tp'] + cells['tn']
    row['incorrect'] = cells['fp'] + cells['fn']

    # NOTE(review): roc_auc and avg_pre are fed hard class labels, not probabilities.
    row['accuracy'] = round(accuracy_score(y_holdout, y_hat), 3)
    row['precision'] = round(precision_score(y_holdout, y_hat), 3)
    row['recall'] = round(recall_score(y_holdout, y_hat), 3)
    row['f1'] = round(f1_score(y_holdout, y_hat), 3)
    row['roc_auc'] = round(roc_auc_score(y_holdout, y_hat), 3)
    row['avg_pre'] = round(average_precision_score(y_holdout, y_hat), 3)

    return row

In [None]:
def sub_classifier_model_05_xbg_smote_borderline(keyword):
    """Train XGBoost on the Borderline-SMOTE-oversampled 70% training part and return it.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``.

    Returns
    -------
    The fitted ``XGBClassifier``.
    """
    labels = df[keyword].values

    # Seeded, stratified 70/30 split; the hold-out part is not used here.
    X_fit, _, y_fit, _ = train_test_split(
        X_df, labels, test_size=0.3, random_state=0, shuffle=True, stratify=labels)

    sampler = BorderlineSMOTE(random_state=0)
    X_balanced, y_balanced = sampler.fit_resample(X_fit, y_fit)

    booster = XGBClassifier(random_state=222)
    return booster.fit(X_balanced, y_balanced)

In [None]:
# Test table row: evaluate XGBoost + Borderline SMOTE on the 'S_N' labels.
df_result_row_5 = sub_classifier_test_05_xbg_smote_borderline("S_N")

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# works on old and new pandas alike.
df_result = pd.concat([df_result, pd.DataFrame([df_result_row_5])], ignore_index=True)
df_result.head()

In [None]:
# Classifier model: retrain the Borderline-SMOTE variant on 'S_N' (same seeds as the test function).
sn_classifier_model_05_xbg_smote_borderline = sub_classifier_model_05_xbg_smote_borderline('S_N')

In [None]:
# Classifier test: predict_proba output of the model above for the sample text.
test_mbti(sn_classifier_model_05_xbg_smote_borderline, test_string)

In [None]:
# Persist the variant comparison table to a CSV file in the working directory.
df_result.to_csv('xgb_classifier_test.csv')

In [None]:
# XGBoost + SMOTE over/undersample Test on all 4 classes BEGIN

In [None]:
# Empty per-label result table; later cells add one row per label column.
df_all_result = pd.DataFrame(columns=['class', 'tp', 'tn', 'fp', 'fn', 'correct', 'incorrect', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc','avg_pre'])

In [None]:
def sub_classifier_test_06_xbg_smote_over_under(keyword):
    """Evaluate the SMOTE over/undersampled XGBoost pipeline for one label column.

    Same pipeline as ``sub_classifier_test_04_xbg_smote_over_under``; the
    returned row is tagged with the label column (``'class'``) instead of a
    model description.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``; also stored under ``'class'``.
        Labels are assumed to be binary: the confusion matrix is unpacked into
        exactly four cells.

    Returns
    -------
    dict
        ``class``, the four confusion-matrix counts, ``correct`` /
        ``incorrect`` totals and six metrics rounded to three decimals.
    """
    labels = df[keyword].values

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_df, labels, test_size=0.3, random_state=0, shuffle=True, stratify=labels)

    # Oversample with SMOTE, then undersample at random; both steps are seeded
    # and touch the training part only.
    X_over, y_over = SMOTE(
        sampling_strategy=0.6, random_state=0, k_neighbors=4
    ).fit_resample(X_fit, y_fit)
    X_final, y_final = RandomUnderSampler(
        sampling_strategy=0.7, random_state=0
    ).fit_resample(X_over, y_over)

    booster = XGBClassifier(random_state=222).fit(X_final, y_final)
    y_hat = booster.predict(X_holdout)

    tn, fp, fn, tp = confusion_matrix(y_holdout, y_hat).ravel()

    row = {
        'class': keyword,
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'correct': tp + tn,
        'incorrect': fp + fn,
    }

    # NOTE(review): roc_auc and avg_pre are fed hard class labels, not probabilities.
    for column, metric in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
        ('roc_auc', roc_auc_score),
        ('avg_pre', average_precision_score),
    ):
        row[column] = round(metric(y_holdout, y_hat), 3)

    return row

In [None]:
def sub_classifier_test_07_xbg_smote_over(keyword):
    """Evaluate the SMOTE-oversampled XGBoost pipeline for one label column.

    Same pipeline as ``sub_classifier_test_03_xbg_smote_over``; the returned
    row is tagged with the label column (``'class'``) instead of a model
    description.

    Parameters
    ----------
    keyword : str
        Name of the label column in ``df``; also stored under ``'class'``.
        Labels are assumed to be binary: the confusion matrix is unpacked into
        exactly four cells.

    Returns
    -------
    dict
        ``class``, the four confusion-matrix counts, ``correct`` /
        ``incorrect`` totals and six metrics rounded to three decimals.
    """
    labels = df[keyword].values

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_df, labels, test_size=0.3, random_state=0, shuffle=True, stratify=labels)

    # Resample the training part only; the hold-out part is scored as-is.
    X_balanced, y_balanced = SMOTE(random_state=0).fit_resample(X_fit, y_fit)

    y_hat = XGBClassifier(random_state=222).fit(X_balanced, y_balanced).predict(X_holdout)

    tn, fp, fn, tp = confusion_matrix(y_holdout, y_hat).ravel()

    def score(metric):
        # Hold-out score rounded to three decimals.
        return round(metric(y_holdout, y_hat), 3)

    # NOTE(review): roc_auc and avg_pre are fed hard class labels, not probabilities.
    return dict([
        ('class', keyword),
        ('tp', tp),
        ('tn', tn),
        ('fp', fp),
        ('fn', fn),
        ('correct', tp + tn),
        ('incorrect', fp + fn),
        ('accuracy', score(accuracy_score)),
        ('precision', score(precision_score)),
        ('recall', score(recall_score)),
        ('f1', score(f1_score)),
        ('roc_auc', score(roc_auc_score)),
        ('avg_pre', score(average_precision_score)),
    ])

In [None]:
# E_I test table row (SMOTE over/undersampling pipeline)
df_ei_result_row = sub_classifier_test_06_xbg_smote_over_under("E_I")

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# works on old and new pandas alike.
df_all_result = pd.concat([df_all_result, pd.DataFrame([df_ei_result_row])], ignore_index=True)

In [None]:
# S_N test table row (SMOTE over/undersampling pipeline)
df_sn_result_row = sub_classifier_test_06_xbg_smote_over_under("S_N")

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# works on old and new pandas alike.
df_all_result = pd.concat([df_all_result, pd.DataFrame([df_sn_result_row])], ignore_index=True)

In [None]:
# T_F test table row (SMOTE oversampling pipeline)
df_tf_result_row = sub_classifier_test_07_xbg_smote_over("T_F")

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# works on old and new pandas alike.
df_all_result = pd.concat([df_all_result, pd.DataFrame([df_tf_result_row])], ignore_index=True)

In [None]:
# J_P test table row (SMOTE oversampling pipeline)
df_jp_result_row = sub_classifier_test_07_xbg_smote_over("J_P")

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# works on old and new pandas alike.
df_all_result = pd.concat([df_all_result, pd.DataFrame([df_jp_result_row])], ignore_index=True)

In [None]:
# Display the per-label result table (head() shows at most five rows).
df_all_result.head()

In [None]:
# Persist the per-label result table to a CSV file in the working directory.
df_all_result.to_csv('xgb_classifier_all_test.csv')

In [None]:
# XGBoost + SMOTE over/undersample Test on all 4 classes END

In [None]:
# Model for the 'E_I' label column: SMOTE over/undersampling + XGBoost.
ei_classifier_model = sub_classifier_model_04_xbg_smote_over_under('E_I')

In [None]:
# Model for the 'S_N' label column: SMOTE over/undersampling + XGBoost.
sn_classifier_model = sub_classifier_model_04_xbg_smote_over_under('S_N')

In [None]:
# Model for the 'T_F' label column: SMOTE oversampling + XGBoost.
tf_classifier_model = sub_classifier_model_03_xbg_smote_over('T_F')

In [None]:
# Model for the 'J_P' label column: SMOTE oversampling + XGBoost.
jp_classifier_model = sub_classifier_model_03_xbg_smote_over('J_P')

In [None]:
# Sample complaint letter scored by each of the four per-label models below.
helpdesk_topic = "To Whom It May Concern, I am writing today to complain of the poor service I received from your company on June 12, 2020. I was visited by a representative of That Awful Company, Mr. Madman, at my home on that day. I trust this is not the way That Awful Company wishes to conduct business with valued customers—I have been with you since the company was founded and have never encountered such treatment before. I would welcome the opportunity to discuss matters further and to learn of how you propose to prevent a similar situation from recurring. I look forward to hearing from you. Yours faithfully, Customer"

In [None]:
# predict_proba output of the 'E_I' model for the complaint letter.
test_mbti(ei_classifier_model, helpdesk_topic)

In [None]:
# predict_proba output of the 'S_N' model for the complaint letter.
test_mbti(sn_classifier_model, helpdesk_topic)

In [None]:
# predict_proba output of the 'T_F' model for the complaint letter.
test_mbti(tf_classifier_model, helpdesk_topic)

In [None]:
# predict_proba output of the 'J_P' model for the complaint letter.
test_mbti(jp_classifier_model, helpdesk_topic)