In [1]:
import numpy as np
import pandas as pd
import pickle
import sklearn
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
#import dataset and see how it looks like
df = pd.read_csv("../dataset/mbti_big5_clean.csv")
print(df.shape)
df.head()


(8672, 6)


Unnamed: 0,type,conscieniousness,agreeable,extraversion,neuroticism,openness
0,INFJ,43.52417819,58.68915806,an,95.56853657,68.2717645
1,INFJ,50.22031973,57.71273046,20.2788891,85.88187978,75.60278174
2,ENTP,37.61441377,53.04986179,19.2923702,97.24779521,76.6872109
3,INTP,55.88302211,62.36175274,33.68039782,77.35532718,73.16256345
4,INTJ,21.39582754,70.408671,56.05826003,89.56679917,72.92137874


In [8]:
# some sample has texts data
mask = df['extraversion'].str.contains('^\d*\.?\d*$', na = False)
df = df[mask]
df.head()

Unnamed: 0,type,conscieniousness,agreeable,extraversion,neuroticism,openness
1,INFJ,50.22031973,57.71273046,20.2788891,85.88187978,75.60278174
2,ENTP,37.61441377,53.04986179,19.2923702,97.24779521,76.6872109
3,INTP,55.88302211,62.36175274,33.68039782,77.35532718,73.16256345
4,INTJ,21.39582754,70.408671,56.05826003,89.56679917,72.92137874
5,ENTJ,15.7368388,47.37946064,55.7772194,96.28591317,69.24075453


In [9]:
# remove samples has text data on other traits
df = df[df['conscieniousness'].str.contains('^\d*\.?\d*$', na = False)]
df = df[df['agreeable'].str.contains('^\d*\.?\d*$', na = False)]
df = df[df['neuroticism'].str.contains('^\d*\.?\d*$', na = False)]
df = df[df['openness'].str.contains('^\d*\.?\d*$', na = False)]

In [11]:
mbti_cols = df['type'].str.extract('(?P<IE>[IE])(?P<SN>[SN])(?P<TF>[TF])(?P<PJ>[PJ])', expand = False)
mbti_cols.head()
df = mbti_cols.assign(conscieniousness=df['conscieniousness'],
                      agreeable=df['agreeable'],
                      neuroticism=df['neuroticism'],
                      openness=df['openness'],
                      extraversion=df['extraversion'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8488 entries, 1 to 8671
Data columns (total 9 columns):
IE                  8488 non-null object
SN                  8488 non-null object
TF                  8488 non-null object
PJ                  8488 non-null object
conscieniousness    8488 non-null object
agreeable           8488 non-null object
neuroticism         8488 non-null object
openness            8488 non-null object
extraversion        8488 non-null object
dtypes: object(9)
memory usage: 663.1+ KB


In [12]:
# transfer mbti to integers
mbti_dict = {'I':1, 'E':0, 'S':0, 'N':1, 'T':0, 'F':1, 'P':0, 'J':1}
mbti_dict_back = {v: k for k, v in mbti_dict.items()}

df['IE'] = df['IE'].apply(lambda x: mbti_dict[x])
df['PJ'] = df['PJ'].apply(lambda x: mbti_dict[x])
df['SN'] = df['SN'].apply(lambda x: mbti_dict[x])
df['TF'] = df['TF'].apply(lambda x: mbti_dict[x])

df.head()

Unnamed: 0,IE,SN,TF,PJ,conscieniousness,agreeable,neuroticism,openness,extraversion
1,1,1,1,1,50.22031973,57.71273046,85.88187978,75.60278174,20.2788891
2,0,1,0,0,37.61441377,53.04986179,97.24779521,76.6872109,19.2923702
3,1,1,0,0,55.88302211,62.36175274,77.35532718,73.16256345,33.68039782
4,1,1,0,1,21.39582754,70.408671,89.56679917,72.92137874,56.05826003
5,0,1,0,1,15.7368388,47.37946064,96.28591317,69.24075453,55.7772194


In [13]:
# unify datatypes
df = df.apply(pd.to_numeric)
df.dtypes

IE                    int64
SN                    int64
TF                    int64
PJ                    int64
conscieniousness    float64
agreeable           float64
neuroticism         float64
openness            float64
extraversion        float64
dtype: object

In [14]:
# see how data look like
df.describe()

Unnamed: 0,IE,SN,TF,PJ,conscieniousness,agreeable,neuroticism,openness,extraversion
count,8488.0,8488.0,8488.0,8488.0,8488.0,8488.0,8488.0,8488.0,8488.0
mean,0.769321,0.861216,0.540528,0.396442,34.859276,50.606149,91.332269,69.594194,34.768475
std,0.421292,0.345742,0.498384,0.489187,17.656223,15.575845,10.306173,13.968651,16.157007
min,0.0,0.0,0.0,0.0,1.0,3.658302,4.0,2.0,1.0
25%,1.0,1.0,0.0,0.0,21.019192,39.437223,88.196674,60.609393,22.736694
50%,1.0,1.0,1.0,0.0,34.091767,50.857164,95.275233,71.218644,33.487382
75%,1.0,1.0,1.0,1.0,47.20429,61.806459,98.487059,80.074291,45.527647
max,1.0,1.0,1.0,1.0,95.165435,99.0,99.0,99.0,94.24259


In [15]:
# the text of mbti dataset were collected from posts in 'personality cafe' forum, therefore,
# content in this type of psychology forum is highly possible relating to words with neuroticism.
# for a less noise distribution, i decided to delete some of samples that has neuroticism 90+
df_red = df.head(8200)
df_red = df_red[df_red['neuroticism']<80]
df_red.info()
df_temp = df.iloc[8201:]
df_red = df_red.append(df_temp)
df_red.info()
# 974/1261 around 0.77

<class 'pandas.core.frame.DataFrame'>
Int64Index: 974 entries, 3 to 8358
Data columns (total 9 columns):
IE                  974 non-null int64
SN                  974 non-null int64
TF                  974 non-null int64
PJ                  974 non-null int64
conscieniousness    974 non-null float64
agreeable           974 non-null float64
neuroticism         974 non-null float64
openness            974 non-null float64
extraversion        974 non-null float64
dtypes: float64(5), int64(4)
memory usage: 76.1 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1261 entries, 3 to 8671
Data columns (total 9 columns):
IE                  1261 non-null int64
SN                  1261 non-null int64
TF                  1261 non-null int64
PJ                  1261 non-null int64
conscieniousness    1261 non-null float64
agreeable           1261 non-null float64
neuroticism         1261 non-null float64
openness            1261 non-null float64
extraversion        1261 non-null float64
dtypes:

In [16]:
df_red.describe()

Unnamed: 0,IE,SN,TF,PJ,conscieniousness,agreeable,neuroticism,openness,extraversion
count,1261.0,1261.0,1261.0,1261.0,1261.0,1261.0,1261.0,1261.0,1261.0
mean,0.791435,0.846947,0.50119,0.411578,41.032324,63.278637,73.751061,71.292087,41.539847
std,0.406444,0.360182,0.500197,0.492315,17.443387,14.534725,14.341582,13.948985,16.846683
min,0.0,0.0,0.0,0.0,3.159586,8.839602,4.0,14.567783,1.0
25%,1.0,1.0,0.0,0.0,28.228784,54.34557,66.709667,63.469826,29.410574
50%,1.0,1.0,1.0,0.0,41.355316,64.673337,74.205833,73.798465,40.984155
75%,1.0,1.0,1.0,1.0,52.803015,73.629338,79.243662,81.194365,52.875185
max,1.0,1.0,1.0,1.0,95.165435,99.0,99.0,97.464683,94.24259


In [20]:
# preparing training
features = df_red.iloc[:, 0:3]
labels = df_red.iloc[:, 4:8]
print('feature shape :', features.shape)
print('labels shape :', labels.shape)

feature shape : (1261, 3)
labels shape : (1261, 4)


In [21]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(features, labels, test_size=0.2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1008, 3)
(253, 3)
(1008, 4)
(253, 4)


In [22]:
# multioutput regression model
model = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
model.fit(X_train, y_train)
scores = model.score(X_test, y_test)
print('scores:', scores)

scores: -0.00284157159805


In [None]:
# sklearn helepr method for stacking ensemble
# stacking uses predictions of base classifiers as input for training to a second-level model.
ntrain = X_train.shape[0]
ntest = X_test.shape[0]
SEED = 0
NFOLDS = 5
# cross validation, make k folds of data
kf = KFold(n_splits=NFOLDS, random_state=SEED)

class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
    def predict(self, x):
        return self.clf.predict(x)
    def fit(self, x, y):
        return self.clf.fit(x,y)
    def feature_importances(self, x, y):
        print(self.clf.fit(x,y).feature_importances_)
    def score(self,x,y):
        return self.clf.score(x,y)
    
def get_oof(clf, x_train, y_train, x_test):
        oof_train = np.zeros((ntrain,))
        oof_test = np.zeros((ntest,))
        oof_test_skf = np.empty((NFOLDS, ntest))
        
        for i, (train_index, test_index) in kf.split(x_train):
            x_tr = x_train[train_index]
            y_tr = y_train[train_index]
            x_te = x_train[test_index]
            
            clf.train(x_tr, y_tr)
            
            oof_train[test_index] = clf.predict(x_te)
            oof_test_skf[i,:] = clf.predict(x_test)
            
        oof_test[:] = oof_test_skf.mean(axis=0)
        return oof_train.reshape(-1,1), oof_test.reshape(-1,1)

In [None]:
# preprocess the data
# 1. delete unrelated samples
# 2. transfer mbti types to integers
# Visualization of data
# model set up
# 1. train/test/cross-validation
# 
# training
# test

#most neuroticism score 90%+
#since the dataset is unnecessaryly huge, delete some of 90+, and remain some of work