In [1]:
import numpy as np
import pandas as pd
import pickle
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#import dataset and see how it looks like
df = pd.read_csv("../dataset/mbti_big5_clean.csv")
print(df.shape)
df.head()


(8672, 6)


Unnamed: 0,type,conscieniousness,agreeable,extraversion,neuroticism,openness
0,INFJ,43.52417819,58.68915806,an,95.56853657,68.2717645
1,INFJ,50.22031973,57.71273046,20.2788891,85.88187978,75.60278174
2,ENTP,37.61441377,53.04986179,19.2923702,97.24779521,76.6872109
3,INTP,55.88302211,62.36175274,33.68039782,77.35532718,73.16256345
4,INTJ,21.39582754,70.408671,56.05826003,89.56679917,72.92137874


In [3]:
# some sample has texts data
mask = df['extraversion'].str.contains('^\d*\.?\d*$', na = False)
df = df[mask]
df.head()

Unnamed: 0,type,conscieniousness,agreeable,extraversion,neuroticism,openness
1,INFJ,50.22031973,57.71273046,20.2788891,85.88187978,75.60278174
2,ENTP,37.61441377,53.04986179,19.2923702,97.24779521,76.6872109
3,INTP,55.88302211,62.36175274,33.68039782,77.35532718,73.16256345
4,INTJ,21.39582754,70.408671,56.05826003,89.56679917,72.92137874
5,ENTJ,15.7368388,47.37946064,55.7772194,96.28591317,69.24075453


In [4]:
# remove samples has text data on other traits
df = df[df['conscieniousness'].str.contains('^\d*\.?\d*$', na = False)]
df = df[df['agreeable'].str.contains('^\d*\.?\d*$', na = False)]
df = df[df['neuroticism'].str.contains('^\d*\.?\d*$', na = False)]
df = df[df['openness'].str.contains('^\d*\.?\d*$', na = False)]

In [5]:
# transfer mbti to integers
mbti_dict = {'ISTJ':1, 'ISTP':2, 'ISFJ':3, 'ISFP':4, 'INTJ':5, 'INTP':6, 'INFJ':7, 'INFP':8,
             'ESTJ':9, 'ESTP':10, 'ESFJ':11, 'ESFP':12, 'ENTJ':13, 'ENTP':14, 'ENFJ':15, 'ENFP':16}
mbti_dict_back = {v: k for k, v in mbti_dict.items()}

df['type'] = df['type'].apply(lambda x: mbti_dict[x])
df.head()

Unnamed: 0,type,conscieniousness,agreeable,extraversion,neuroticism,openness
1,7,50.22031973,57.71273046,20.2788891,85.88187978,75.60278174
2,14,37.61441377,53.04986179,19.2923702,97.24779521,76.6872109
3,6,55.88302211,62.36175274,33.68039782,77.35532718,73.16256345
4,5,21.39582754,70.408671,56.05826003,89.56679917,72.92137874
5,13,15.7368388,47.37946064,55.7772194,96.28591317,69.24075453


In [6]:
# unify datatypes
df = df.apply(pd.to_numeric)
df.dtypes

type                  int64
conscieniousness    float64
agreeable           float64
extraversion        float64
neuroticism         float64
openness            float64
dtype: object

In [7]:
# see how data look like
print(df.groupby('type').agg('mean'))
print(df.groupby('type').agg('std'))

      conscieniousness  agreeable  extraversion  neuroticism   openness
type                                                                   
1            41.730087  50.860979     34.038727    89.025383  65.077677
2            27.808481  48.100595     36.118409    91.901227  71.095501
3            40.801885  51.077550     32.889770    90.464744  66.273595
4            34.897305  53.886507     34.763556    89.801703  72.359709
5            36.332027  51.326485     35.286357    90.458846  67.900300
6            32.158812  50.642141     34.474338    90.423509  69.767881
7            37.674599  50.917706     33.883986    91.106848  69.182739
8            35.507266  48.731156     31.368062    92.426896  71.521649
9            42.122198  56.118490     36.993981    90.735260  64.018426
10           24.792950  51.452587     39.613595    92.377139  71.204205
11           41.952450  58.713643     36.821596    91.048323  64.395185
12           29.729929  57.112214     38.753065    90.655316  76

In [8]:
# the text of mbti dataset were collected from posts in 'personality cafe' forum, therefore,
# content in this type of psychology forum is highly possible relating to words with neuroticism.
# for a less noise distribution, i decided to delete some of samples that has neuroticism 90+
df_red = df.head(8000)
df_red = df_red[df_red['neuroticism']<80]
df_red.info()
df_temp = df.iloc[8001:]
df_red = df_red.append(df_temp)
df_red.info()
# 961/1448 around 0.7

<class 'pandas.core.frame.DataFrame'>
Int64Index: 961 entries, 3 to 8175
Data columns (total 6 columns):
type                961 non-null int64
conscieniousness    961 non-null float64
agreeable           961 non-null float64
extraversion        961 non-null float64
neuroticism         961 non-null float64
openness            961 non-null float64
dtypes: float64(5), int64(1)
memory usage: 52.6 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1448 entries, 3 to 8671
Data columns (total 6 columns):
type                1448 non-null int64
conscieniousness    1448 non-null float64
agreeable           1448 non-null float64
extraversion        1448 non-null float64
neuroticism         1448 non-null float64
openness            1448 non-null float64
dtypes: float64(5), int64(1)
memory usage: 79.2 KB


In [9]:
print(df_red.groupby('type').agg('mean'))
print(df_red.groupby('type').agg('std'))

      conscieniousness  agreeable  extraversion  neuroticism   openness
type                                                                   
1            42.463145  62.022803     41.726775    75.035623  69.419456
2            33.256361  61.949451     40.174606    77.039217  71.484635
3            39.684852  65.028596     41.350999    71.130012  71.596869
4            37.097229  63.507303     40.402421    73.894163  74.220937
5            41.865362  60.366819     39.867396    75.469925  69.116419
6            39.935593  61.762293     40.911610    74.660269  70.225174
7            40.928856  62.808752     40.508283    76.115244  72.239287
8            40.318154  59.568035     37.645850    77.771208  73.672982
9            47.221480  62.147666     36.643012    79.929114  70.141423
10           28.121199  62.827843     39.534078    77.239977  71.580613
11           49.951791  67.038756     42.362289    84.682389  67.317374
12           35.627939  64.018578     47.842211    76.487859  74

In [None]:
# preparing training


In [10]:
# preprocess the data
# 1. delete unrelated samples
# 2. transfer mbti types to integers
# Visualization of data
# model set up
# 1. train/test/cross-validation
# 
# training
# test

#most neuroticism score 90%+
#since the dataset is unnecessaryly huge, delete some of 90+, and remain some of work