# EDA

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import plot_confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

SEED = 53188535

## There are three potential datasets for this project:

### ~9k Myers-Briggs Personality Type labeled comments from PersonalityCafe

In [4]:
cafe_df = pd.read_csv('data/mbti_1.csv')

In [None]:
cafe_df

### ~100k Myers-Briggs Personality Type labeled comments from PersonalityCafe and Google Big Query Reddit users. 
Posts are preprocessed texts:

- No punctuation, stopwords, or URLs
- Lemmatization
- Samples reconstructed into equal-sized chunks (500 words per sample)

In [None]:
both_df = pd.read_csv('data/MBTI 500.csv')

### ~1.7M Google Big Query of Reddit comments and their Myers-Briggs Personality Type

In [None]:
gbq_df = pd.read_csv('data/mbti_full_pull.csv')

## Personality Cafe
For the baseline, we will use the Personality Cafe data.

In [None]:
# All sixteen Myers-Briggs type codes (lowercase), kept for later use.
pred_lst = [
    'intp', 'intj', 'entp', 'entj', 'infj', 'infp', 'enfj', 'enfp',
    'istj', 'isfj', 'estj', 'esfj', 'istp', 'isfp', 'estp', 'esfp',
]

# Distinct letters that occur anywhere in the type codes.
pred_st = set(''.join(pred_lst))

In [5]:
cafe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [6]:
cafe_df.type.value_counts(normalize=True)

INFP    0.211182
INFJ    0.169452
INTP    0.150317
INTJ    0.125764
ENTP    0.078963
ENFP    0.077810
ISTP    0.038847
ISFP    0.031239
ENTJ    0.026628
ISTJ    0.023631
ENFJ    0.021902
ISFJ    0.019135
ESTP    0.010259
ESFP    0.005533
ESFJ    0.004841
ESTJ    0.004496
Name: type, dtype: float64

### Clean & Preprocess

In [7]:
cafe_clean = cafe_df.copy()

In [8]:
import re
import string

In [9]:
punc = string.punctuation

In [10]:
cafe_clean['type'] = cafe_clean.type.str.lower()

In [11]:
cafe_clean['clean_posts'] = cafe_clean.posts.str.lower()

In [12]:
def url_remove(post):
    """Return `post` with every `http...` run (up to the next whitespace) deleted."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', post)

def pipe_remove(post):
    """Return `post` with each `|` character swapped for a single space."""
    return re.sub(pattern=r'[|]', repl=' ', string=post)

def punc_remove(post):
    """Return `post` with apostrophes, underscores and colons stripped out."""
    stripped = re.sub(r'[\'_:]', '', post)
    return stripped

def remove_dig_token(post):
    """Keep only the items of `post` that are purely alphabetic (per `str.isalpha`)."""
    return [token for token in post if token.isalpha()]

def remove_stopwords(post, sw=None):
    """Return the tokens of `post` that are not stopwords, in their original order.

    Parameters
    ----------
    post : sequence of str
        Tokens of a single document.
    sw : collection of str, optional
        Stopwords to drop. When omitted, the NLTK English stopword list is used.

    Returns
    -------
    list of str
        The tokens of `post` that are not in the stopword collection.
    """
    if sw is None:
        # The function is applied once per row, so load the NLTK list a single time
        # and cache it as a frozenset: O(1) membership tests instead of re-reading the
        # corpus and scanning a list for every token.
        sw = getattr(remove_stopwords, '_english_sw', None)
        if sw is None:
            sw = frozenset(stopwords.words('english'))
            remove_stopwords._english_sw = sw
    return [token for token in post if token not in sw]

In [13]:
cafe_clean['clean_posts'] = cafe_clean['clean_posts'].apply(pipe_remove)

In [14]:
cafe_clean['clean_posts'] = cafe_clean['clean_posts'].apply(url_remove)

In [15]:
cafe_clean['clean_posts'] = cafe_clean['clean_posts'].apply(punc_remove)

In [16]:
pattern = r"(?u)\b\w\w+\b"

In [17]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(pattern)

In [18]:
cafe_clean['post_token'] = cafe_clean['clean_posts'].apply(tokenizer.tokenize)

In [19]:
cafe_clean['post_token'] = cafe_clean['post_token'].apply(remove_dig_token)

In [20]:
cafe_clean['post_token'] = cafe_clean['post_token'].apply(remove_stopwords)

In [21]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(tokens):
    """Lemmatize each token with the notebook-level `lemmatizer`, preserving order."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [22]:
cafe_clean['post_token'] = cafe_clean['post_token'].apply(lemmatize_text)

In [23]:
def join_tokens(tokens):
    """Concatenate `tokens` into one string, separated by single spaces."""
    return ' '.join(tokens)

In [24]:
cafe_clean['token_joined'] = cafe_clean['post_token'].apply(join_tokens)

In [25]:
cafe_clean['clean_posts'] = cafe_clean['post_token'].str.join(' ')

In [26]:
cafe_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8675 non-null   object
 1   posts         8675 non-null   object
 2   clean_posts   8675 non-null   object
 3   post_token    8675 non-null   object
 4   token_joined  8675 non-null   object
dtypes: object(5)
memory usage: 339.0+ KB


In [27]:
cafe_clean.head(5)

Unnamed: 0,type,posts,clean_posts,post_token,token_joined
0,infj,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,enfp intj moment sportscenter top ten play pra...,"[enfp, intj, moment, sportscenter, top, ten, p...",enfp intj moment sportscenter top ten play pra...
1,entp,'I'm finding the lack of me in these posts ver...,im finding lack post alarming sex boring posit...,"[im, finding, lack, post, alarming, sex, borin...",im finding lack post alarming sex boring posit...
2,intp,'Good one _____ https://www.youtube.com/wat...,good one course say know thats blessing curse ...,"[good, one, course, say, know, thats, blessing...",good one course say know thats blessing curse ...
3,intj,"'Dear INTP, I enjoyed our conversation the o...",dear intp enjoyed conversation day esoteric ga...,"[dear, intp, enjoyed, conversation, day, esote...",dear intp enjoyed conversation day esoteric ga...
4,entj,'You're fired.|||That's another silly misconce...,youre fired thats another silly misconception ...,"[youre, fired, thats, another, silly, misconce...",youre fired thats another silly misconception ...


### Adding binary targets

In [28]:
# Split the four-letter type code into one single-letter target column per position.
for position, column in enumerate(['i/e', 'n/s', 't/f', 'p/j']):
    cafe_clean[column] = cafe_clean['type'].map(lambda x, idx=position: x[idx])


In [29]:
print(cafe_clean['i/e'].value_counts())
print(cafe_clean['n/s'].value_counts())
print(cafe_clean['t/f'].value_counts())
print(cafe_clean['p/j'].value_counts())

i    6676
e    1999
Name: i/e, dtype: int64
n    7478
s    1197
Name: n/s, dtype: int64
f    4694
t    3981
Name: t/f, dtype: int64
p    5241
j    3434
Name: p/j, dtype: int64


In [30]:
cafe_clean

Unnamed: 0,type,posts,clean_posts,post_token,token_joined,i/e,n/s,t/f,p/j
0,infj,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,enfp intj moment sportscenter top ten play pra...,"[enfp, intj, moment, sportscenter, top, ten, p...",enfp intj moment sportscenter top ten play pra...,i,n,f,j
1,entp,'I'm finding the lack of me in these posts ver...,im finding lack post alarming sex boring posit...,"[im, finding, lack, post, alarming, sex, borin...",im finding lack post alarming sex boring posit...,e,n,t,p
2,intp,'Good one _____ https://www.youtube.com/wat...,good one course say know thats blessing curse ...,"[good, one, course, say, know, thats, blessing...",good one course say know thats blessing curse ...,i,n,t,p
3,intj,"'Dear INTP, I enjoyed our conversation the o...",dear intp enjoyed conversation day esoteric ga...,"[dear, intp, enjoyed, conversation, day, esote...",dear intp enjoyed conversation day esoteric ga...,i,n,t,j
4,entj,'You're fired.|||That's another silly misconce...,youre fired thats another silly misconception ...,"[youre, fired, thats, another, silly, misconce...",youre fired thats another silly misconception ...,e,n,t,j
...,...,...,...,...,...,...,...,...,...
8670,isfp,'https://www.youtube.com/watch?v=t8edHB_h908||...,ixfp always think cat fi doms reason especiall...,"[ixfp, always, think, cat, fi, doms, reason, e...",ixfp always think cat fi doms reason especiall...,i,s,f,p
8671,enfp,'So...if this thread already exists someplace ...,thread already exists someplace else heck dele...,"[thread, already, exists, someplace, else, hec...",thread already exists someplace else heck dele...,e,n,f,p
8672,intp,'So many questions when i do these things. I ...,many question thing would take purple pill pic...,"[many, question, thing, would, take, purple, p...",many question thing would take purple pill pic...,i,n,t,p
8673,infp,'I am very conflicted right now when it comes ...,conflicted right come wanting child honestly m...,"[conflicted, right, come, wanting, child, hone...",conflicted right come wanting child honestly m...,i,n,f,p


### Baseline Models

In [31]:
X = cafe_clean['token_joined']
y = cafe_clean['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)

In [32]:
count_vc = CountVectorizer(ngram_range=(1,2))

In [33]:
X_train_count = count_vc.fit_transform(X_train)
X_test_count = count_vc.transform(X_test)

In [34]:
# Linear classifier trained with SGD; 'balanced' class weights offset the skewed type distribution.
sgd = SGDClassifier(class_weight='balanced', random_state=SEED)

sgd.fit(X_train_count, y_train)

# 5-fold cross-validated accuracy on the training split.
# NOTE(review): the vectorizer was fitted on the whole training split before CV,
# so every fold shares the same vocabulary.
cv_score = cross_val_score(sgd, X_train_count, y_train, cv=5)
cv_score_mean = round(np.mean(cv_score), 4)

# Hold-out accuracy; scikit-learn's signature is accuracy_score(y_true, y_pred).
y_pred = sgd.predict(X_test_count)
acc_score = accuracy_score(y_test, y_pred)

print(f"CV: {cv_score_mean}, A: {acc_score}")

CV: 0.6156, A: 0.5984324573536192


In [None]:
y_test.value_counts(normalize=True)

In [None]:
# Confusion matrix of the fitted SGD baseline on the held-out test features;
# normalize='true' scales each true-label row to sum to 1.
# NOTE(review): plot_confusion_matrix was deprecated in later scikit-learn releases —
# confirm it exists in the installed version.
fig, ax = plt.subplots(figsize=(8,8))
plot_confusion_matrix(sgd, X_test_count, y_test, ax=ax, normalize='true')

In [None]:
# BASELINE
# ---------
# All classes ~60%
# i/e ~85%
# n/s ~90%
# p/j ~77%
# t/f ~85%

### Undersample training set for binary targets

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
train_set, test_set = train_test_split(cafe_clean, random_state=SEED)

# The text column is reshaped to a single-feature 2-D array before resampling.
X_train = train_set['token_joined']
X_train = np.array(X_train).reshape(-1, 1)

y_train = train_set['t/f']
y_train = np.array(y_train).reshape(-1, 1)


# instantiating the random undersampler; seeded so the same rows are kept on every run
rus = RandomUnderSampler(random_state=SEED)

# resampling training set X & y
X_rus, y_rus = rus.fit_resample(X_train, y_train)

# class distribution before and after resampling
print(np.unique(y_train, return_counts=True))
print(np.unique(y_rus, return_counts=True))



In [None]:
X_rus.reshape(-1).shape

In [None]:
X_rus = pd.Series(X_rus.reshape(-1))
y_rus = pd.Series(y_rus.reshape(-1))

In [None]:
X_train_count = count_vc.fit_transform(X_rus)
X_test_count = count_vc.transform(test_set['token_joined'])

In [None]:
# Same SGD baseline, now fitted on the undersampled training data for the t/f target.
sgd = SGDClassifier(class_weight='balanced', random_state=SEED)

sgd.fit(X_train_count, y_rus)

# 5-fold cross-validated accuracy on the resampled training data.
cv_score = cross_val_score(sgd, X_train_count, y_rus, cv=5)
cv_score_mean = round(np.mean(cv_score), 4)

# Hold-out accuracy against the untouched test split; signature is accuracy_score(y_true, y_pred).
y_pred = sgd.predict(X_test_count)
acc_score = accuracy_score(test_set['t/f'], y_pred)

print(f"CV: {cv_score_mean}, A: {acc_score}")

In [None]:
# Confusion matrix (raw counts) of the undersampled-training SGD model on the t/f test labels.
# NOTE(review): plot_confusion_matrix was deprecated in later scikit-learn releases —
# confirm it exists in the installed version.
fig, ax = plt.subplots(figsize=(8,8))
plot_confusion_matrix(sgd, X_test_count, test_set['t/f'], ax=ax)

In [None]:
# Undersampling
# ---------
# i/e ~81%
# n/s ~83%
# p/j ~76%
# t/f ~83%

## Both DF

Comparing the baseline models on this second dataset.

In [None]:
both_clean = both_df.copy()

In [None]:
both_clean['type'] = both_clean['type'].str.lower()

In [None]:
both_clean['type'].value_counts()

In [None]:
# Split the four-letter type code into one single-letter target column per position.
for position, column in enumerate(['i/e', 'n/s', 't/f', 'p/j']):
    both_clean[column] = both_clean['type'].map(lambda x, idx=position: x[idx])


In [None]:
print(both_clean['i/e'].value_counts())
print(both_clean['n/s'].value_counts())
print(both_clean['t/f'].value_counts())
print(both_clean['p/j'].value_counts())

In [None]:
both_clean.columns

In [None]:
X = both_clean['posts']
y = both_clean['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)

In [None]:
count_vc = CountVectorizer(ngram_range=(1,2))

In [None]:
X_train_count = count_vc.fit_transform(X_train)
X_test_count = count_vc.transform(X_test)

In [None]:
# SGD baseline with balanced class weights, fitted on the count features of this dataset.
sgd = SGDClassifier(class_weight='balanced', random_state=SEED)

sgd.fit(X_train_count, y_train)

# 5-fold cross-validated accuracy on the training split.
cv_score = cross_val_score(sgd, X_train_count, y_train, cv=5)
cv_score_mean = round(np.mean(cv_score), 4)

# Hold-out accuracy; scikit-learn's signature is accuracy_score(y_true, y_pred).
y_pred = sgd.predict(X_test_count)
acc_score = accuracy_score(y_test, y_pred)

print(f"CV: {cv_score_mean}, A: {round(acc_score, 4)}")

In [None]:
y_test.value_counts(normalize=True)

In [None]:
# Set the font sizes before the figure is created so the new Axes is built with them;
# previously they were changed after plt.subplots(), which is too late to guarantee
# they reach this figure's tick labels on a first run.
plt.rcParams.update({'font.size': 22})
plt.rc('xtick', labelsize=22) 
plt.rc('ytick', labelsize=22) 
fig, ax = plt.subplots(figsize=(15,15))
# normalize='true' scales each true-label row to sum to 1.
plot_confusion_matrix(sgd, X_test_count, y_test, ax=ax, normalize='true')

In [None]:
# BASELINE (100k set)
# ---------
# All ~82%
# i/e ~85%
# n/s ~96%
# p/j ~86%
# t/f ~92%

# BASELINE (Cafe 9k set)
# ---------
# All classes ~60%
# i/e ~85%
# n/s ~90%
# p/j ~77%
# t/f ~85%

# Undersampling (Cafe)
# ---------
# i/e ~81%
# n/s ~83%
# p/j ~76%
# t/f ~83%



## Google Big Query

In [None]:
gbq_clean = gbq_df.copy()

In [None]:
gbq_clean['author_flair_text'] = gbq_clean['author_flair_text'].str.lower()
gbq_clean['subreddit'] = gbq_clean['subreddit'].str.lower()

In [None]:
# `whitelist` was never assigned anywhere in this notebook, so this cell raised
# NameError on a fresh kernel.
# NOTE(review): assuming the intended list is the sixteen type codes in `pred_lst` — confirm.
whitelist = pred_lst
# Alternation of one capture group per type code, e.g. "(intp)|(intj)|...".
regex_mbti = '|'.join(["(" + i + ")" for i in whitelist])

In [None]:
regex_mbti

In [None]:
# Collapse every flair that begins with a type code down to just that code.
for mbti in whitelist:
    # na=False: a missing flair counts as "no match", so the mask stays strictly boolean.
    starts_with_type = gbq_clean['author_flair_text'].str.match("(?:" + mbti + ")", na=False)
    gbq_clean['author_flair_text'] = gbq_clean['author_flair_text'].mask(starts_with_type, mbti)

In [None]:
# gbq_clean['author_flair_text'] = gbq_clean['author_flair_text'].mask(gbq_clean['author_flair_text'].str.match(r"\b-infj", case=False), 'infj')

In [None]:
# Keep only rows whose flair starts with a type code. na=False drops missing flairs
# instead of producing a non-boolean mask; .copy() makes the result an independent
# frame so the column assignments in later cells do not write to a slice.
gbq_clean = gbq_clean[gbq_clean['author_flair_text'].str.match(regex_mbti, na=False)].copy()

In [None]:
gbq_clean['author_flair_text'].value_counts().to_frame('counts')

In [None]:
# Split the four-letter flair code into one single-letter target column per position.
for position, column in enumerate(['i/e', 'n/s', 't/f', 'p/j']):
    gbq_clean[column] = gbq_clean['author_flair_text'].map(lambda x, idx=position: x[idx])

In [None]:
gbq_clean.columns[-4:]

In [None]:
# One panel per binary target. Drawing all four on a single Axes (as before) overlaid
# the bars of different targets on one another.
binary_cols = gbq_clean.columns[-4:]
fig, axes = plt.subplots(1, len(binary_cols), figsize=(16, 4), sharey=True)
for ax, column in zip(axes, binary_cols):
    # Compute the class shares once per column instead of twice.
    shares = gbq_clean[column].value_counts(normalize=True)
    ax.bar(shares.index, shares.values)
    ax.set_title(column)
axes[0].set_ylabel('proportion of rows')

In [None]:
gbq_clean

In [None]:
from text_clean import preprocess

In [None]:
tc = preprocess()

In [None]:
gbq_clean = gbq_clean.dropna()

In [None]:
gbq_clean['body'] = gbq_clean['body'].str.lower()

In [None]:
gbq_clean

In [None]:
# Chain the three cleaners so each one works on the previous result. Before, every
# line re-read `body`, so only punc_remove took effect and pipes/URLs were left in.
# Order matches the Personality Cafe cleaning above: pipes, then URLs, then punctuation.
gbq_clean['clean_posts'] = (
    gbq_clean['body']
    .apply(pipe_remove)
    .apply(url_remove)
    .apply(punc_remove)
)

In [None]:
pattern = r"(?u)\b\w\w+\b"

In [None]:
tokenizer = RegexpTokenizer(pattern)

In [None]:
gbq_clean['post_token'] = gbq_clean['clean_posts'].apply(tokenizer.tokenize)

In [None]:
gbq_clean['post_token'] = gbq_clean['post_token'].apply(remove_dig_token)

In [None]:
gbq_clean

In [None]:
gbq_clean['post_token'] = gbq_clean['post_token'].apply(remove_stopwords)

In [None]:
gbq_clean['post_token'] = gbq_clean['post_token'].apply(lemmatize_text)

In [None]:
# Keep posts with more than 5 remaining tokens; .copy() makes the result an independent
# frame so the `token_joined` assignment in the next cell does not write to a slice.
gbq_clean = gbq_clean[gbq_clean['post_token'].apply(lambda x: len(x) > 5)].copy()

In [None]:
gbq_clean['token_joined'] = gbq_clean['post_token'].apply(join_tokens)

In [None]:
gbq_clean

In [None]:
gbq_clean.to_pickle("./clean_df.pkl")  

## Modeling

In [None]:
df_modeling = pd.read_pickle('./clean_df.pkl')

In [None]:
df_modeling

In [None]:
cafe_clean

In [None]:
df_modeling

In [None]:
df_modeling = df_modeling.rename(columns={'author_flair_text': 'type', 'body': 'posts'})

In [None]:
df_modeling = pd.concat([df_modeling, cafe_clean])

In [None]:
df_modeling['length'] = df_modeling['post_token'].apply(lambda x: len(x))

In [None]:
df_modeling.describe()

In [None]:
df_modeling_two = df_modeling[df_modeling['post_token'].apply(lambda x: len(x) > 50)]

In [None]:
df_modeling_two.to_pickle("./finalmodeling_df.pkl")  

In [None]:
df_modeling_two['type'].value_counts(normalize=False)

In [None]:
df_modeling.isna().sum()

In [None]:
X = df_modeling_two['token_joined']
y = df_modeling_two['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)

In [None]:
count_vc = CountVectorizer(ngram_range=(1,2))


In [None]:
tfidf_vc = TfidfVectorizer(ngram_range=(1,2))

In [None]:
X_train.shape

In [None]:
X_train_count = tfidf_vc.fit_transform(X_train)
X_test_count = tfidf_vc.transform(X_test)

In [None]:
tfidf_vc.vocabulary_

In [None]:
# SGD classifier (default class weights) fitted on the TF-IDF features built above;
# note that `X_train_count` / `X_test_count` hold TF-IDF values here despite their names.
sgd = SGDClassifier(random_state=SEED)

sgd.fit(X_train_count, y_train)

# 5-fold cross-validated accuracy on the training split.
cv_score = cross_val_score(sgd, X_train_count, y_train, cv=5)
cv_score_mean = round(np.mean(cv_score), 4)

# Hold-out accuracy; scikit-learn's signature is accuracy_score(y_true, y_pred).
y_pred = sgd.predict(X_test_count)
acc_score = accuracy_score(y_test, y_pred)

print(f"CV: {cv_score_mean}, A: {round(acc_score, 4)}")

In [None]:
# import pickle

In [None]:
# pickle.dump(sgd, open('baseline_sgd.pkl', 'wb'))

In [None]:
y_test.value_counts(normalize=True)

In [None]:
# Set the font sizes before the figure is created so the new Axes is built with them;
# previously they were changed after plt.subplots(), which is too late to guarantee
# they reach this figure's tick labels on a first run.
plt.rcParams.update({'font.size': 10})
plt.rc('xtick', labelsize=22) 
plt.rc('ytick', labelsize=22) 
fig, ax = plt.subplots(figsize=(15,15))
# normalize='true' scales each true-label row to sum to 1.
plot_confusion_matrix(sgd, X_test_count, y_test, ax=ax, normalize='true')