# EDA

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## There are three potential datasets for this project:

### ~9k Myers-Briggs Personality Type labeled comments from PersonalityCafe

In [3]:
cafe_df = pd.read_csv('data/mbti_1.csv')

### ~100k Myers-Briggs Personality Type labeled comments from PersonalityCafe and Google Big Query Reddit users. 
Posts are preprocessed texts:

- No punctuations, stopwords, URLs
- Lemmatization
- Reconstruct samples to be equal-sized chunks (500 words per sample)

In [5]:
both_df = pd.read_csv('data/MBTI 500.csv')

### ~1.7M Google Big Query of Reddit comments and their Myers-Briggs Personality Type

In [7]:
gbq_df = pd.read_csv('data/mbti_full_pull.csv')

In [8]:
gbq_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1794016 entries, 0 to 1794015
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   author_flair_text  object
 1   body               object
 2   subreddit          object
dtypes: object(3)
memory usage: 41.1+ MB


In [9]:
gbq_df.head()

Unnamed: 0,author_flair_text,body,subreddit
0,INTJ,Knowing you're in INTJ is a tool for you to us...,intj
1,INTJ,You are truly an enlightened mastermind.,intj
2,"INFJ, 26F",You should :) it will help if you have a down ...,infj
3,INTP,I watch a bit of everything (including hentai)...,INTP
4,INTJ,I don't know if I would count this as a pet pe...,intj


In [17]:
gbq_df['subreddit'].value_counts()

INTP              419700
intj              296101
mbti              253602
entp              178379
infj              164662
                   ...  
Instagram              1
BF_Hardline            1
tall                   1
RAofLittleness         1
Maplestory             1
Name: subreddit, Length: 520, dtype: int64

In [18]:
print(gbq_df['author_flair_text'].value_counts())

INTP                  365646
INTJ                  323224
ENFP                   88334
ENTP                   73481
INFJ                   69730
                       ...  
ENTP: The Droideka         1
[F] [INTP]                 1
INTP-A/F/17                1
INFJ - Male                1
14M INFJ                   1
Name: author_flair_text, Length: 8702, dtype: int64


In [19]:
gbq_clean = gbq_df.copy()

In [21]:
gbq_clean['author_flair_text'] = gbq_clean['author_flair_text'].str.lower()

In [22]:
gbq_clean['subreddit'] = gbq_clean['subreddit'].str.lower()

In [28]:
gbq_clean.author_flair_text.value_counts()

intp                   366510
intj                   323732
enfp                    88438
entp                    80837
infj                    71681
                        ...  
infj / 19 / f / uk          1
wannabe entpreneur          1
infj | m | 37               1
infj / f / 26               1
infp: just visiting         1
Name: author_flair_text, Length: 8337, dtype: int64

In [25]:
whitelist = ['intp', 'intj', 'entp', 'entj', 'infj', 'infp', 'enfj', 'enfp', 'istj', 'isfj', 'estj', 'esfj', 'istp', 'isfp', 'estp', 'esfp']

In [42]:
set(''.join(whitelist))

{'e', 'f', 'i', 'j', 'n', 'p', 's', 't'}

In [50]:
gbq_clean.loc[0:30, 'author_flair_text']

0                                                  intj
1                                                  intj
2                                             infj, 26f
3                                                  intp
4                                                  intj
5                                                  intp
6                                                  intj
7                                             infj 27 f
8                                                  intj
9                                                  intj
10                                                 intp
11                                              35mentp
12                                                 enfp
13                                                 entj
14                                             infj/4w3
15                                                 enfp
16                                             infj 28m
17    atheistpuppy 4270-1878-6970 [rock:nosepass

In [49]:
gbq_clean['author_flair_text'].str.extract(r"([efijnpst]+)").value_counts()[0:20]

intp    441827
intj    356245
infj    189338
entp    186172
infp    175145
enfp     97093
istp     48477
entj     43456
f        31214
n        25948
s        24059
estp     21142
enfj     20659
istj     16587
ttp      15041
e        11884
isfp     11399
i         8140
t         7987
esfp      7397
dtype: int64

In [40]:
gbq_clean['author_flair_text'].map(lambda x: x.strip(' ') if x in whitelist else 'n/a').value_counts()

n/a     710382
intp    366510
intj    323732
enfp     88438
entp     80837
infj     71681
infp     43329
istp     35316
entj     18705
istj     13958
isfp      9167
estp      9107
enfj      8253
esfp      4925
isfj      4266
estj      3557
esfj      1853
Name: author_flair_text, dtype: int64