In [1]:
import pandas as pd
# Load the raw dataset; later cells read its 'posts' and 'type' columns.
data = pd.read_csv("mbti_1.csv")
# Sanity check: show the (rows, columns) size of the loaded frame.
print(data.shape)

(8675, 2)


In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
post_texts, mbti_types = data['posts'], data['type']
X_train, X_test, y_train, y_test = train_test_split(
    post_texts,
    mbti_types,
    test_size=0.2,
    random_state=42,
)
X_train

4080    'I loved All the Light We Cannot See by Anthon...
2614    'It depends. If I care about it, I fight and g...
5414    'Welcome home, sonny :laughing:|||Just because...
1039    That's really cool of you. I like it when anyo...
8294    'The duck is named Zeus.|||http://www.youtube....
                              ...                        
5734    'I have 2 cats and a chihuahua/pug mix. It's r...
5191    Ever since I can remember I have suffered/live...
5390    'I've known a couple of INFJ guys and they see...
860     'Even the loner gets lonely. I feel like it's ...
7270    Before reading the responses to this thread, I...
Name: posts, Length: 6940, dtype: object

In [3]:
import re
import spacy
from spacy.language import Language


# Load the spaCy 'en_core_web_sm' pipeline; the custom "ng20" component is added to it further down.
pipeline = spacy.load('en_core_web_sm')

# E-mail pattern taken from http://emailregex.com/ (lowercase-only character
# classes; it is applied case-insensitively in the rule list below).
email_re = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

# replace = [ (pattern-to-replace, replacement),  ...]
#
# The rules are applied one after another with re.sub, so ORDER MATTERS:
#   * the "|||" post separator is turned into a space first, so that a URL can
#     simply be matched as a run of non-whitespace and so that "|" (a legal
#     character in the e-mail local part) cannot glue two posts together;
#   * anchors are unwrapped before URLs are removed, otherwise the URL inside
#     href="..." would swallow the link text;
#   * URLs and e-mails are handled before the punctuation rule, which would
#     otherwise break them apart into ordinary-looking word tokens;
#   * whitespace is collapsed last, after every rule that can insert spaces.
replace = [
    (r"\|\|\|", " "),                   # Separator between posts
    (r"<a[^>]*>(.*?)</a>", r"\1"),      # Unwrap HTML anchors, keeping the link text
    (r"https?:\S*", " "),               # Drop http/https URLs (only the URL itself)
    ("(?i)" + email_re, "email"),       # Map e-mail addresses to a special token
    (r"(?<=\d),(?=\d)", ""),            # Remove commas in numbers
    (r"\d+", "number"),                 # Map digit runs to a special token
    (r"[\t\n\r\*\.\@\,\-\/]", " "),     # Punctuation and other junk
    (r"\s+", " "),                      # Collapse runs of whitespace
]

def _apply_replacements(text):
    """Run every (pattern, replacement) pair of `replace` over `text`, in list order."""
    for pattern, substitute in replace:
        text = re.sub(pattern, substitute, text)
    return text


# Clean both splits with exactly the same ordered substitutions.
train_sentences = [_apply_replacements(post) for post in X_train]
test_sentences = [_apply_replacements(post) for post in X_test]

@Language.component("ng20")
def ng20_preprocess(doc):
    """Reduce a processed document to a space-joined string of lowercase lemmas.

    Tokens flagged as stop words or punctuation are dropped, and lemmas that
    are empty after lowercasing and stripping are skipped.

    NOTE(review): this returns a plain string rather than the Doc it received.
    spaCy's documentation describes components as returning the Doc, so confirm
    this still works with the installed spaCy version.
    """
    lemmas = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_.lower().strip()
        if lemma:
            lemmas.append(lemma)
    return " ".join(lemmas)
# Add the custom "ng20" step to the pipeline and print a summary of its components.
pipeline.add_pipe("ng20");
pipeline.analyze_pipes(pretty=True)

# Push every cleaned post through the pipeline, one text at a time.
docs = list(map(pipeline, train_sentences))
test_docs = list(map(pipeline, test_sentences))

  from .autonotebook import tqdm as notebook_tqdm


[1m

#   Component         Assigns               Requires   Scores             Retokenizes
-   ---------------   -------------------   --------   ----------------   -----------
0   tok2vec           doc.tensor                                          False      
                                                                                     
1   tagger            token.tag                        tag_acc            False      
                                                                                     
2   parser            token.dep                        dep_uas            False      
                      token.head                       dep_las                       
                      token.is_sent_start              dep_las_per_type              
                      doc.sents                        sents_p                       
                                                       sents_r                       
                                                

In [27]:
# Persist the processed posts and their labels, one CSV per split, without the index column.
for csv_name, values in (
    ("train_posts.csv", docs),
    ("test_posts.csv", test_docs),
    ("train_type.csv", y_train),
    ("test_type.csv", y_test),
):
    pd.DataFrame(values).to_csv(csv_name, index=None)

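Since the data is highly imbalanced across the sixteen MBTI types, we break the problem down into four binary classifiers, one per letter of the type, which together distinguish the eight personality preferences (I/E, N/S, T/F, J/P).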

In [10]:
def _letter_at(position):
    """Build a function that returns the character at `position` of a type string."""
    return lambda mbti_type: mbti_type[position]


# One label series per character position (0-3) of the type string, for each split.
train_label1, train_label2, train_label3, train_label4 = (
    y_train.apply(_letter_at(position)) for position in range(4)
)

test_label1, test_label2, test_label3, test_label4 = (
    y_test.apply(_letter_at(position)) for position in range(4)
)

P    4175
J    2765
Name: type, dtype: int64

In [13]:
# Number the third-letter classes in the order they first appear in the training labels.
possible_label3 = train_label3.unique()
label_dict3 = {label: code for code, label in enumerate(possible_label3)}

# Encode both splits with the mapping learned from the training labels.
# NOTE(review): newer pandas releases may warn about dtype downcasting in
# Series.replace - confirm against the installed version.
train_label3, test_label3 = (
    labels.replace(label_dict3) for labels in (train_label3, test_label3)
)