In [4]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import string
import torch
import pickle

In [5]:
# load dataset
df_text=pd.read_csv("../dataset/mbti_1.csv",index_col='type', encoding='utf-8')
print(df_text.shape)
print(df_text[0:5])

(8675, 1)
                                                  posts
type                                                   
INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
ENTP  'I'm finding the lack of me in these posts ver...
INTP  'Good one  _____   https://www.youtube.com/wat...
INTJ  'Dear INTP,   I enjoyed our conversation the o...
ENTJ  'You're fired.|||That's another silly misconce...


In [6]:
# preprocess text
text_ls = df_text.posts.tolist()
        
def cleaner(text):
    # remove URL
    text = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', text, flags=re.MULTILINE) 
    #replace |||
    text = text.replace('|||',"")
    # remove punctuation
    text = re.sub(r'[^\w\s]',' ',text)
    #lowercase every character
    text = text.lower()
    # remove white space
    text=re.sub('\s+', ' ', text).strip()
                
    return text
    
texts = [cleaner(text) for text in text_ls]
# validate preprocessing
print(texts[0])

and intj moments sportscenter not top ten plays prankswhat has been the most life changing experience in your life on repeat for most of today may the perc experience immerse you the last thing my infj friend posted on his facebook before committing suicide the next day rest in peace enfj7 sorry to hear of your distress it s only natural for a relationship to not be perfection all the time in every moment of existence try to figure the hard times as times of growth as 84389 84390 welcome and stuff game set match prozac wellbrutin at least thirty minutes of moving your legs and i don t mean moving them while sitting in your same desk chair weed in moderation maybe try edibles as a healthier alternative basically come up with three items you ve determined that each type or whichever types you want to do would more than likely use given each types cognitive functions and whatnot when left by all things in moderation sims is indeed a video game and a good one at that note a good one at tha

In [7]:
from collections import Counter

word_count=Counter()
for text in texts:
    word_count.update(text.split(" "))
    
print(len(word_count))

130463


In [8]:
# create a vocabulary dictionary and transfer words to word representation
vocab = sorted(word_count, key=word_count.get, reverse=True)
vocab_to_int = {word: num for num, word in enumerate(vocab, 1)}

text_rep = []
for text in texts:
    text_rep.append([vocab_to_int[word] for word in text.split()])
    
print(text_rep[0])

[5, 131, 1320, 54993, 24, 697, 1866, 2114, 54994, 92, 74, 2, 87, 103, 1459, 289, 11, 41, 103, 26, 2218, 16, 87, 7, 366, 199, 2, 729, 289, 12334, 6, 2, 242, 114, 12, 128, 145, 576, 26, 136, 1045, 186, 5945, 2156, 2, 467, 183, 778, 11, 1341, 54995, 279, 3, 418, 7, 41, 7214, 8, 14, 89, 825, 16, 4, 240, 3, 24, 22, 3975, 45, 2, 62, 11, 221, 478, 7, 1531, 155, 3, 527, 2, 203, 246, 28, 246, 7, 2434, 28, 54996, 54997, 300, 5, 275, 492, 711, 1224, 12622, 54998, 44, 259, 6747, 1003, 7, 1085, 41, 3315, 5, 1, 37, 13, 162, 1085, 66, 171, 1114, 11, 41, 132, 2821, 3878, 1992, 11, 6971, 152, 155, 27506, 28, 4, 5946, 2224, 519, 215, 63, 18, 556, 3462, 6, 53, 2800, 9, 315, 94, 35, 6572, 223, 6, 84, 3, 36, 51, 49, 95, 409, 225, 635, 315, 223, 640, 348, 5, 4362, 42, 508, 75, 45, 80, 11, 6971, 5245, 10, 1037, 4, 463, 492, 5, 4, 79, 47, 44, 9, 937, 4, 79, 47, 44, 9, 10, 842, 1484, 11, 9, 1, 60, 24, 356, 10108, 2, 677, 7, 111, 635, 11293, 607, 172, 33, 133, 41, 420, 463, 564, 1142, 63, 5, 33, 23, 41, 96, 951

In [9]:
# since each data has different length of words, need to uniform all feature length
lengths = [len(rep) for rep in text_rep]
print(sum(lengths)/len(lengths))

1311.4900288184438


In [17]:
# let's set the length as 500
sample_len = 500
features =np.zeros((len(text_rep),sample_len),dtype=int)
for i, sample in enumerate(text_rep):
    # fill 0 for samples has length < 1200
    features[i, -len(sample):] = np.array(sample)[:sample_len]
print(features[1])
print(features.shape)

[    1    20   761     2   636     7    19    11   180   520    64 17963
   494    34    22   766    32     8    14    11     2   132  1214   237
    16   396    19     5    12   979    23   603    11    27  1119   139
    46    17     3  7604   225 30472     5 16064    55   257    13   250
   721   212   713     3   492   582   510   210  5178     9    14    45
     8   652    95    46  5145     5    43    36    87     7     2  1730
   171     1  2989   104  2403     5  1906   104   364    18  4017 12335
     5    49  9467 15023    21   636     7  1492     5   634   767  8134
   301  1412   306     1  1096 16065   611  1412   724    23   389     1
  1096 41401    35  1245    96    25     2  2328  1115     7    21   153
     1    88   992     9     1    37    13   227    11     2  1412   306
   186     6 21974     6    48     6    65    27   210    42     6 11790
    59     4   848    16     4   268     5     4   584  1906     5   113
    40    23   134  5762    26    41   520     5  1

In [18]:
#one-hot code labels
labels=df_text.index.tolist()
encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
labels=encoder.fit_transform(labels)
labels=np.array(labels)
print(labels[0:5])

[[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [19]:
# Split Dataset
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state = 1)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state = 1)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
print(x_val.shape)
print(y_val.shape)

(6940, 500)
(6940, 16)
(867, 500)
(867, 16)
(868, 500)
(868, 16)


In [20]:
# sklearn sklearn MLP classifier
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(512,512,512,256,16), verbose=True, random_state=1)
#mlp.fit(x_train, y_train)
#acc = mlp.score(x_test, y_test)
#print(acc)

In [21]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [30]:
import torch.nn as nn
from torch.autograd import Variable
# reference: http://deeplearningathome.com/2017/06/PyTorch-vs-Tensorflow-lstm-language-model.html 
class LSTM(nn.Module):
    def __init__(self, embedding_dim, batch_size, vocab_size, num_layers, dp_keep_prob, out_dim, sample_len):
        super(LSTM, self).__init__()
        self.embedding_dim = embedding_dim
        #self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.out_dim = out_dim
        self.dp_keep_prob = dp_keep_prob
        self.num_layers = num_layers
        self.sample_len = sample_len

        self.dropout = nn.Dropout(1 - dp_keep_prob)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=embedding_dim,
                            num_layers=num_layers,
                            dropout=1 - dp_keep_prob)
        self.sm_fc = nn.Linear(in_features=embedding_dim*sample_len,
                               out_features=out_dim)
        self.softmax = nn.Softmax()
        self.init_weights()

    def init_weights(self):
        init_range = 0.1
        self.word_embeddings.weight.data.uniform_(-init_range, init_range)
        self.sm_fc.bias.data.fill_(0.0)
        self.sm_fc.weight.data.uniform_(-init_range, init_range)

    def init_hidden(self):
        weight = next(self.parameters()).data
        return (Variable(weight.new(self.num_layers, self.batch_size, self.embedding_dim).zero_()),
                Variable(weight.new(self.num_layers, self.batch_size, self.embedding_dim).zero_()))

    def forward(self, inputs, hidden):
        embeds = self.word_embeddings(inputs)
        embeds = self.dropout(embeds)
        embeds = embeds.view(self.sample_len,self.batch_size, -1)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = self.dropout(lstm_out)
        logits = self.sm_fc(lstm_out.transpose(0,1).contiguous().view(self.batch_size, -1))
        out = self.softmax(logits.view(self.batch_size, self.out_dim))
        return out, hidden

def repackage_hidden(h):
    # use for multiple hidden layer
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)

In [None]:
# LSTM RNN
from torch import optim

n_epochs = 1
hidden_size = 256
batch_size = 100
iteration = 1
embedding_dim = 250
vocab_size = 130463
num_layers = 1
dp_keep_prob = 0.35
out_dim = 16
sample_len = 500

model = LSTM(embedding_dim, batch_size, vocab_size, num_layers, dp_keep_prob, out_dim, sample_len)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(),lr=0.01, weight_decay=0.04)
stop_loss = 10

for epoch in range(n_epochs):        
    hidden = model.init_hidden()
    hidden = repackage_hidden(hidden)
    
    for i, (x_batch, y_batch) in enumerate(get_batches(x_train, y_train, batch_size),1):
        # set training mode
        model.train()

        inputs = Variable(torch.from_numpy(x_batch.astype(np.int64)))
        targets = Variable(torch.from_numpy(y_batch).float())
        
        outputs,hidden = model(inputs, hidden)
        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        loss.backward(retain_variables = True)
        optimizer.step()
        
        if (iteration%1)==0:
            print("Epoch: {}/{}".format(epoch, n_epochs),
                  "Iteration: {}".format(iteration),
                  "Train loss: {:.3f}".format(loss.data[0]))
            
            if loss.data[0] < stop_loss:
                stop_loss = loss.data[0]
            elif (stop_loss - loss.data[0]) > 0.01:
                print("loss larger than previous iteration, stop at loss:{}, best loss is:{}"
                      .format(loss.data[0], stop_loss)) 
                break
            
        if (iteration%5)==0:
            val_acc = []
            for val_x, val_y in get_batches(x_val, y_val, batch_size):
                
                inputs_val = Variable(torch.from_numpy(val_x.astype(np.int64)))
                targets_val = val_y
                
                # set eval mode
                model.eval()
                outputs_val, hidden = model(inputs_val, hidden)
                # transform output to one-hot
                outputs_val = outputs_val.data.numpy()
                outputs_val = (outputs_val == outputs_val.max(axis=1, keepdims=True)).astype(int)
                batch_acc = (outputs_val == targets_val).sum()/batch_size
                val_acc.append(batch_acc)    
            print("Val acc: {:.3f}".format(np.mean(val_acc)))
            if np.mean(val_acc) > 90:
                break

        iteration+=1
        
    if epoch > 0:
        print(epoch, loss.data[0])



Epoch: 0/1 Iteration: 1 Train loss: 0.062
Epoch: 0/1 Iteration: 2 Train loss: 0.059
Epoch: 0/1 Iteration: 3 Train loss: 0.057
Epoch: 0/1 Iteration: 4 Train loss: 0.056
Epoch: 0/1 Iteration: 5 Train loss: 0.057
Val acc: 14.398


In [47]:
# testing
test_acc = []
for test_x, test_y in get_batches(x_test, y_test, batch_size):
                
    inputs_test = Variable(torch.from_numpy(test_x))
    targets_test = test_y
    
    model.eval()
    outputs_test, hidden = model(inputs_test, hidden)
    outputs_test = outputs_test.round()
    outputs_test = outputs_test.data.numpy().astype(np.int64)
        
    batch_acc = (outputs_test == targets_test).sum()/batch_size
    test_acc.append(batch_acc)
print("Test acc: {:.3f}".format(np.mean(test_acc)))

Test acc: 15.000
