In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import string
import torch
import pickle

In [2]:
# load dataset
df_text=pd.read_csv("../dataset/mbti_1.csv",index_col='type', encoding='utf-8')
print(df_text.shape)
print(df_text[0:5])

(8675, 1)
                                                  posts
type                                                   
INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
ENTP  'I'm finding the lack of me in these posts ver...
INTP  'Good one  _____   https://www.youtube.com/wat...
INTJ  'Dear INTP,   I enjoyed our conversation the o...
ENTJ  'You're fired.|||That's another silly misconce...


In [3]:
# preprocess text
text_ls = df_text.posts.tolist()
        
def cleaner(text):
    # remove URL
    text = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', text, flags=re.MULTILINE) 
    #replace |||
    text = text.replace('|||',"")
    # remove punctuation
    text = re.sub(r'[^\w\s]',' ',text)
    #lowercase every character
    text = text.lower()
    # remove white space
    text=re.sub('\s+', ' ', text).strip()
                
    return text
    
texts = [cleaner(text) for text in text_ls]
# validate preprocessing
print(texts[0])

and intj moments sportscenter not top ten plays prankswhat has been the most life changing experience in your life on repeat for most of today may the perc experience immerse you the last thing my infj friend posted on his facebook before committing suicide the next day rest in peace enfj7 sorry to hear of your distress it s only natural for a relationship to not be perfection all the time in every moment of existence try to figure the hard times as times of growth as 84389 84390 welcome and stuff game set match prozac wellbrutin at least thirty minutes of moving your legs and i don t mean moving them while sitting in your same desk chair weed in moderation maybe try edibles as a healthier alternative basically come up with three items you ve determined that each type or whichever types you want to do would more than likely use given each types cognitive functions and whatnot when left by all things in moderation sims is indeed a video game and a good one at that note a good one at tha

In [4]:
from collections import Counter

word_count=Counter()
for text in texts:
    word_count.update(text.split(" "))
    
print(len(word_count))

130463


In [5]:
# create a vocabulary dictionary and transfer words to word representation
vocab = sorted(word_count, key=word_count.get, reverse=True)
vocab_to_int = {word: num for num, word in enumerate(vocab, 1)}

text_rep = []
for text in texts:
    text_rep.append([vocab_to_int[word] for word in text.split()])
    
print(text_rep[0])

[5, 131, 1320, 54993, 24, 697, 1866, 2114, 54994, 92, 74, 2, 87, 103, 1459, 289, 11, 41, 103, 26, 2218, 16, 87, 7, 366, 199, 2, 729, 289, 12334, 6, 2, 242, 114, 12, 128, 145, 576, 26, 136, 1045, 186, 5945, 2156, 2, 467, 183, 778, 11, 1341, 54995, 279, 3, 418, 7, 41, 7214, 8, 14, 89, 825, 16, 4, 240, 3, 24, 22, 3975, 45, 2, 62, 11, 221, 478, 7, 1531, 155, 3, 527, 2, 203, 246, 28, 246, 7, 2434, 28, 54996, 54997, 300, 5, 275, 492, 711, 1224, 12622, 54998, 44, 259, 6747, 1003, 7, 1085, 41, 3315, 5, 1, 37, 13, 162, 1085, 66, 171, 1114, 11, 41, 132, 2821, 3878, 1992, 11, 6971, 152, 155, 27506, 28, 4, 5946, 2224, 519, 215, 63, 18, 556, 3462, 6, 53, 2800, 9, 315, 94, 35, 6572, 223, 6, 84, 3, 36, 51, 49, 95, 409, 225, 635, 315, 223, 640, 348, 5, 4362, 42, 508, 75, 45, 80, 11, 6971, 5245, 10, 1037, 4, 463, 492, 5, 4, 79, 47, 44, 9, 937, 4, 79, 47, 44, 9, 10, 842, 1484, 11, 9, 1, 60, 24, 356, 10108, 2, 677, 7, 111, 635, 11293, 607, 172, 33, 133, 41, 420, 463, 564, 1142, 63, 5, 33, 23, 41, 96, 951

In [6]:
# since each data has different length of words, need to uniform all feature length
lengths = [len(rep) for rep in text_rep]
print(sum(lengths)/len(lengths))

1311.4900288184438


In [7]:
# let's set the length as 1200
sample_len = 1200
features =np.zeros((len(text_rep),sample_len),dtype=int)
for i, sample in enumerate(text_rep):
    # fill 0 for samples has length < 1200
    features[i, -len(sample):] = np.array(sample)[:sample_len]
print(features[1])
print(features.shape)

[  1  20 761 ...,   1 117  17]
(8675, 1200)


In [8]:
#one-hot code labels
labels=df_text.index.tolist()
encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
labels=encoder.fit_transform(labels)
labels=np.array(labels)
print(labels[0:5])

[[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [9]:
# Split Dataset
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state = 1)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state = 1)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
print(x_val.shape)
print(y_val.shape)

(6940, 1200)
(6940, 16)
(867, 1200)
(867, 16)
(868, 1200)
(868, 16)


In [10]:
# sklearn sklearn MLP classifier
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(512,512,512,256,16), verbose=True, random_state=1)
mlp.fit(x_train, y_train)
acc = mlp.score(x_test, y_test)
print(acc)

Iteration 1, loss = 22.92849719
Iteration 2, loss = 10.74641705
Iteration 3, loss = 10.51665288
Iteration 4, loss = 10.29333402
Iteration 5, loss = 10.07656054
Iteration 6, loss = 9.86627384
0.0818915801615




In [13]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [31]:
import torch.nn as nn
from torch.autograd import Variable
# reference: http://deeplearningathome.com/2017/06/PyTorch-vs-Tensorflow-lstm-language-model.html 
class LSTM(nn.Module):
    def __init__(self, embedding_dim, num_steps, batch_size, vocab_size, num_layers, dp_keep_prob):
        super(LM_LSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.dp_keep_prob = dp_keep_prob
        self.num_layers = num_layers
        self.dropout = nn.Dropout(1 - dp_keep_prob)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=embedding_dim,
                            num_layers=num_layers,
                            dropout=1 - dp_keep_prob)
        self.sm_fc = nn.Linear(in_features=embedding_dim,
                               out_features=vocab_size)
        self.init_weights()

    def init_weights(self):
        init_range = 0.1
        self.word_embeddings.weight.data.uniform_(-init_range, init_range)
        self.sm_fc.bias.data.fill_(0.0)
        self.sm_fc.weight.data.uniform_(-init_range, init_range)

    def init_hidden(self):
        weight = next(self.parameters()).data
        return (Variable(weight.new(self.num_layers, self.batch_size, self.embedding_dim).zero_()),
                Variable(weight.new(self.num_layers, self.batch_size, self.embedding_dim).zero_()))

    def forward(self, inputs, hidden):
        embeds = self.dropout(self.word_embeddings(inputs))
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = self.dropout(lstm_out)
        logits = self.sm_fc(lstm_out.view(-1, self.embedding_dim))
        return logits.view(self.num_steps, self.batch_size, self.vocab_size), hidden

    def repackage_hidden(h):
        if type(h) == Variable:
            return Variable(h.data)
        else:
            return tuple(repackage_hidden(v) for v in h)

In [33]:
# LSTM RNN
from torch import optim

n_epochs = 3
hidden_size = 256
batch_size = 50
iteration = 50
embedding_dim = 1200
num_steps = 35
vocab_size = 130463
num_layers = 1
dp_keep_prob = 0.35


model = LSTM(embedding_dim, num_steps, batch_size, vocab_size, num_layers, dp_keep_prob)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(n_epochs):

    #for iter in range(n_iters):
    for i, (x_batch, y_batch) in enumerate(get_batches(x_train, y_train, batch_size),1):
        
        inputs = Variable(torch.from_numpy(x_batch).float())
        targets = Variable(torch.from_numpy(y_batch).float())

        outputs = model(inputs)

        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        if iteration%5==0:
            print("Epoch: {}/{}".format(e, epochs),
                  "Iteration: {}".format(iteration),
                  "Train loss: {:.3f}".format(loss))
            
        if iteration %25 == 0:
            val_acc = []
            for val_x, val_y in get_batches(x_val, y_val, batch_size):
                
                inputs_val = Variable(torch.from_numpy(val_x))
                targets_val = Variable(torch.from_numpy(val_y))
    
                outputs_val, hidden = model(inputs_val, batch_size)
        
                batch_acc = (outputs_val == targets_val).sum()/batch_size
                val_acc.append(batch_acc)
            print("Val acc: {:.3f}".format(np.mean(val_acc)))

        iteration+=1
        
    if epoch > 0:
        print(epoch, loss.data[0])

AttributeError: 'tuple' object has no attribute 'dim'

In [None]:
# testing
test_acc = []
for test_x, test_y in get_batches(x_test, y_test, batch_size):
                
    inputs_test = Variable(torch.from_numpy(test_x))
    targets_test = Variable(torch.from_numpy(test_y))
    
    outputs_test, hidden = model(inputs_test, batch_size)
        
    test_acc = (outputs_test == targets_test).sum()/batch_size
    test_acc.append(batch_acc)
print("Test acc: {:.3f}".format(np.mean(test_acc))

In [None]:
# 1. pure word count
# 2. if 1. doesn't work, try glovec
