In [2]:
import pandas as pd
import numpy as np

data = pd.read_csv("mbti_1.csv")
posts = data['posts']
type = data['type']


In [3]:
import re
import spacy
from spacy.language import Language


pipeline = spacy.load('en_core_web_sm')

# http://emailregex.com/
email_re = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

# replace = [ (pattern-to-replace, replacement),  ...]
replace = [
    (r"(http:).*?(\|\|\|)",r"\1"),
    (r"]*>(.*?)", r"\1"),  # Matches most URLs
    (email_re, "email"),            # Matches emails
    (r"(?<=\d),(?=\d)", ""),        # Remove commas in numbers
    (r"\d+", "number"),              # Map digits to special token 
    (r"[\t\n\r\*\.\@\,\-\/]", " "), # Punctuation and other junk
    (r"\s+", " "),                   # Stips extra whitespace
    (r"http:",r" "),
    (r"\|\|\|"," "),
    (r"(https:).*?(\|\|\|)",r"\1")
]

sentences = []
for i, d in enumerate(posts):
    for repl in replace:
        d = re.sub(repl[0], repl[1], d)
    sentences.append(d)

@Language.component("ng20")
def ng20_preprocess(doc):
    tokens = [token for token in doc 
                if not any((token.is_stop, token.is_punct))]
    tokens = [token.lemma_.lower().strip() for token in tokens]
    tokens = [token for token in tokens if token]
    return " ".join(tokens)
pipeline.add_pipe("ng20");
pipeline.analyze_pipes(pretty=True)

docs = [pipeline(d) for d in sentences]

  from .autonotebook import tqdm as notebook_tqdm


[1m

#   Component         Assigns               Requires   Scores             Retokenizes
-   ---------------   -------------------   --------   ----------------   -----------
0   tok2vec           doc.tensor                                          False      
                                                                                     
1   tagger            token.tag                        tag_acc            False      
                                                                                     
2   parser            token.dep                        dep_uas            False      
                      token.head                       dep_las                       
                      token.is_sent_start              dep_las_per_type              
                      doc.sents                        sents_p                       
                                                       sents_r                       
                                                

In [9]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
def compound(x):
    xx = sia.polarity_scores(x)
    return xx['compound']
data['value'] = np.array(docs)
data['value'] = data['value'].apply(lambda x:compound(x))
data['value']

0       0.9857
1       0.9991
2       0.9985
3       0.9987
4       0.9847
         ...  
8670    0.9932
8671    0.9998
8672    0.9823
8673    0.9995
8674    0.9991
Name: value, Length: 8675, dtype: float64

In [14]:
def sia_type(x):
    xx = sia.polarity_scores(x)
    xxx = {k: v for k, v in sorted(xx.items(), key=lambda item: item[1])}
    xxx.pop("compound")
    return list(xxx.keys())[0]
data['sia'] = np.array(docs)
data['sia'] = data['sia'].apply(lambda x: sia_type(x))

In [16]:
data['posts'] = np.array(docs)
data.to_csv("mbti_new.csv",index=None)

In [18]:
data

Unnamed: 0,type,posts,value,sia
0,INFJ,enfp intj moments https www youtube com watch?...,0.9857,neg
1,ENTP,find lack post alarming sex boring position ex...,0.9991,neg
2,INTP,good https www youtube com watch?v = fhigbolff...,0.9985,neg
3,INTJ,dear intp enjoy conversation day esoteric gabb...,0.9987,neg
4,ENTJ,fire silly misconception approach logically go...,0.9847,neg
...,...,...,...,...
8670,ISFP,https www youtube com watch?v = tnumberedhb_hn...,0.9932,neg
8671,ENFP,thread exist someplace ooop guess look hard st...,0.9998,neg
8672,INTP,question thing purple pill pick win lottery nu...,0.9823,neg
8673,INFP,conflicted right come want child honestly mate...,0.9995,neg


In [20]:
data['label1'] = data['type'].apply(lambda x:x[0])
data['label2'] = data['type'].apply(lambda x:x[1])
data['label3'] = data['type'].apply(lambda x:x[2])
data['label4'] = data['type'].apply(lambda x:x[3])

In [23]:
def rep_label(label):
    possible_label = label.unique()
    label_dict = {}
    for index, possible_label in enumerate(possible_label):
        label_dict[possible_label] = index
    return label.replace(label_dict)
data['label1_new'] = rep_label(data['label1'])
data['label2_new'] = rep_label(data['label2'])
data['label3_new'] = rep_label(data['label3'])
data['label4_new'] = rep_label(data['label4'])

In [25]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
X = data[['label1_new','label2_new','label3_new','label4_new']]
y = data['value']
model.fit(X,y)

In [26]:
model.coef_

array([ 0.01804573,  0.00986198, -0.04796216, -0.01779035])

In [27]:
model.intercept_

0.9740338283374098

In [29]:
model.score(X,y)

0.00796617694663626

In [28]:
data

Unnamed: 0,type,posts,value,sia,label1,label2,label3,label4,label1_new,label2_new,label3_new,label4_new
0,INFJ,enfp intj moments https www youtube com watch?...,0.9857,neg,I,N,F,J,0,0,0,0
1,ENTP,find lack post alarming sex boring position ex...,0.9991,neg,E,N,T,P,1,0,1,1
2,INTP,good https www youtube com watch?v = fhigbolff...,0.9985,neg,I,N,T,P,0,0,1,1
3,INTJ,dear intp enjoy conversation day esoteric gabb...,0.9987,neg,I,N,T,J,0,0,1,0
4,ENTJ,fire silly misconception approach logically go...,0.9847,neg,E,N,T,J,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8670,ISFP,https www youtube com watch?v = tnumberedhb_hn...,0.9932,neg,I,S,F,P,0,1,0,1
8671,ENFP,thread exist someplace ooop guess look hard st...,0.9998,neg,E,N,F,P,1,0,0,1
8672,INTP,question thing purple pill pick win lottery nu...,0.9823,neg,I,N,T,P,0,0,1,1
8673,INFP,conflicted right come want child honestly mate...,0.9995,neg,I,N,F,P,0,0,0,1
