## Balance data

https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/



## SJ's Code

https://github.com/declare-lab/conv-emotion/blob/0c9dcb9cc5234a7ca8cf6af81aabe28ef3814d0e/DialogueRNN/train_E2E.py#L81

### load processed data

In [308]:
import pandas as pd

df = pd.read_csv('mbti_rm_stop_lemmatized.csv', index_col=None) 
df.head()

Unnamed: 0,type,posts_length,lemmatized
0,INFJ,304,enfp intj moments sportscenter top ten play pr...
1,ENTP,554,find lack post alarm sex bore position often e...
2,INTP,427,good one course say know bless curse absolutel...
3,INTJ,521,dear intp enjoy conversation day esoteric gabb...
4,ENTJ,477,fire another silly misconception approach logi...


### encode labels

In [309]:
int2mbti={0:'ENFJ',1:'ENFP',2:'ENTJ',3:'ENTP',4:'ESFJ',5:'ESFP',6:'ESTJ',7:'ESTP',8:'INFJ',9:'INFP',10:'INTJ',11:'INTP',12:'ISFJ',13:'ISFP',14:'ISTJ',15:'ISTP'}
mbti2int={'ENFJ':0,'ENFP':1,'ENTJ':2,'ENTP':3,'ESFJ':4,'ESFP':5,'ESTJ':6,'ESTP':7,'INFJ':8,'INFP':9,'INTJ':10,'INTP':11,'ISFJ':12,'ISFP':13,'ISTJ':14,'ISTP':15}

In [310]:
labels = df.type.tolist()
labels = [mbti2int.get(label) for label in labels]
print(labels[-50:])

[11, 10, 1, 15, 11, 13, 9, 14, 3, 9, 8, 9, 10, 9, 3, 11, 9, 8, 3, 8, 8, 9, 11, 9, 9, 8, 15, 12, 9, 14, 8, 8, 10, 4, 1, 9, 3, 10, 11, 11, 3, 10, 3, 10, 8, 13, 1, 11, 9, 9]


### clean posts

In [311]:
!pip install autocorrect



In [312]:
import nltk
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
from autocorrect import Speller 

nltk.download('wordnet')
nltk.download('stopwords')

class data_preprocessing():
    
    def remove_links(text):
        remove_https = re.sub(r'http\S+', '', text)
        remove_com = re.sub(r"\ [A-Za-z]*\.com", " ", remove_https)
        return remove_com
    
    def remove_digits(text):
        return re.sub(r'\d+', ' ', text)
    
    def remove_symbols(text):
        REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
        BAD_SYMBOLS_RE = re.compile('(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)')
        t = REPLACE_BY_SPACE_RE.sub(' ', text)
        t = BAD_SYMBOLS_RE.sub(' ', t)
        return t
    
    def deduce_repeated_characters(text):
        Pattern_alpha = re.compile(r"([A-Za-z])\1{1,}", re.DOTALL)
        Formatted_text = Pattern_alpha.sub(r"\1\1", text) 
        Pattern_Punct = re.compile(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}')
        Combined_Formatted = Pattern_Punct.sub(r'\1', Formatted_text)
        Final_Formatted = re.sub(' {2,}',' ', Combined_Formatted)
        return Final_Formatted
    
    def remove_special_characters(text):
        return re.sub(r"[^a-zA-Z0-9:$-,%.?!]+", ' ', text)
    
    def spelling_correction(text):
        spell = Speller(lang='en')
        Corrected_text = spell(text)
        return Corrected_text
    
    def lemmatization(text):
        w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
        lemmatizer = nltk.stem.WordNetLemmatizer()
        lemma = [lemmatizer.lemmatize(w,'v') for w in w_tokenizer.tokenize(text)]
        return lemma

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [313]:
STOPWORDS = set(stopwords.words('english'))

def clean_text(t):
    t = t.lower()
    t = t.replace("|||"," ")   
    t = data_preprocessing.remove_links(t)
    t = data_preprocessing.remove_digits(t)
    t = data_preprocessing.remove_symbols(t)
    t = data_preprocessing.deduce_repeated_characters(t)
    t = data_preprocessing.remove_special_characters(t)
#         t = data_preprocessing.spelling_correction(t)
    t = data_preprocessing.lemmatization(t)
    t = ' '.join(word for word in t.split() if word not in STOPWORDS) 
    return t

In [314]:
posts = df.lemmatized.tolist()
# posts = [clean_text(post) for post in posts]
print(posts[0])

enfp intj moments sportscenter top ten play prank life change experience life repeat today may perc experience immerse last thing infj friend post facebook commit suicide next day rest peace hello enfj sorry hear distress natural relationship perfection time every moment existence try figure hard time time growth welcome stuff game set match prozac wellbrutin least thirty minutes move legs mean move sit desk chair weed moderation maybe try edibles healthy alternative basically come three items determine type whichever type want would likely use give type cognitive function whatnot leave things moderation sims indeed video game good one note good one somewhat subjective completely promote death give sim dear enfp favorite video game grow current favorite video game cool appear late sad someone everyone wait think confidence good thing cherish time solitude revel within inner world whereas time workin enjoy time worry people always around yo entp ladies complimentary personality well hey

### create vocab

In [315]:
# Count total words
from collections import Counter

word_count=Counter()
for post in posts:
    if isinstance(post, str):
        word_count.update(post.split(" "))

In [316]:
# Size of the vocabulary available to the RNN
vocab_len=len(word_count)
print(vocab_len)

print(len(posts[0]))

84822
1946


### encode posts

In [317]:
# Create a look up table 
vocab = sorted(word_count, key=word_count.get, reverse=True)
# Create your dictionary that maps vocab words to integers here
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

posts_ints=[]
for post in posts:
    if isinstance(post, str):
        posts_ints.append([vocab_to_int[word] for word in post.split()])

# print(posts_ints[0])
print(len(posts_ints[0]))

304


In [318]:
import torchtext as text

# load glove embeddings
vec = text.vocab.GloVe(name='6B', dim=50)
# create the embedding matrix, a torch tensor in the shape (num_words+1, embedding_dim)
word_emb = vec.get_vecs_by_tokens(vocab)

### padding

In [319]:
import numpy as np

posts_lens = Counter([len(x) for x in posts])
print("Zero-length reviews: {}".format(posts_lens[0]))
print("Maximum review length: {}".format(max(posts_lens)))
print("Minimum review length: {}".format(min(posts_lens)))


seq_len = 500
features=np.zeros((len(posts_ints),seq_len),dtype=int)
for i, row in enumerate(posts_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]
print(features[:10])

Zero-length reviews: 0
Maximum review length: 6216
Minimum review length: 33
[[    0     0     0 ...    66   119    38]
 [   21   424    45 ...   602   377  1030]
 [    0     0     0 ...  1622  1140   200]
 ...
 [  147   563  2826 ...   119     1   432]
 [    0     0     0 ... 13468  6006    92]
 [  873    51   112 ...  1509  8225    33]]


In [320]:
print(len(labels))
print(len(features))

8674
8674


### create datasets

In [321]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

In [322]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = 40000
seq_len = 500
num_labels = 16
EMBEDDING_DIM=50

# device = torch.device('cpu')
print("deive type: ",device)

deive type:  cuda


In [323]:
# train test split
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(features, labels, random_state=50, 
                                                    test_size=0.2, stratify = labels )
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, random_state=0, 
                                                    test_size=0.5, stratify = y_test )

print(len(x_train),len(y_train))
print(len(x_test),len(y_test))
print(len(x_val),len(y_val))

6939 6939
867 867
868 868


In [324]:
def collate_batch(batch):
    label_list, text_list = [], []
    for (_post, _label) in batch:
        label_list.append(_label)
        text_list.append(_post)
    # label must be in the same size as target
    label_list = torch.tensor(label_list)
    text_list = torch.stack(text_list)
    return text_list.to(device), label_list.to(device)

In [325]:
from torch.utils.data import TensorDataset, DataLoader
# # create Tensor datasets
train_data = TensorDataset(torch.from_numpy(x_train).to(device), torch.tensor(y_train).to(device))
test_data = TensorDataset(torch.from_numpy(x_test).to(device),torch.tensor(y_test).to(device))
val_data = TensorDataset(torch.from_numpy(x_val).to(device),torch.tensor(y_val).to(device))

# # dataloaders
batch_size = 256

# # make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, collate_fn=collate_batch)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size, collate_fn=collate_batch)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size, collate_fn=collate_batch)

### LSTM model

In [326]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

In [327]:
class SimpleAttention(nn.Module):

    def __init__(self, input_dim):
        super(SimpleAttention, self).__init__()
        self.input_dim = input_dim
        self.scalar = nn.Linear(self.input_dim,num_labels,bias=False)

    def forward(self, M, x=None):
        """
        M -> (seq_len, batch, vector)
        x -> dummy argument for the compatibility with MatchingAttention
        """
        scale = self.scalar(M) # seq_len, batch, 1
        alpha = F.softmax(scale, dim=0).permute(1,2,0) # batch, 1, seq_len
        attn_pool = torch.bmm(alpha, M.transpose(0,1))[:,0,:] # batch, vector

        return attn_pool, alpha

In [328]:
# logistic model
class LogisticRegression(nn.Module):
    def __init__(self, word_vec, embed_dim):
        super().__init__()
        # embeddingbag outputs the average of all the words in a sentence
        self.embedding = nn.Embedding(*(word_vec.size())).from_pretrained(word_vec, freeze=False)
        self.fc = nn.Linear(embed_dim, num_labels)
        self._init_weights()

    def _init_weights(self):
        """Initialize network parameters 
        """
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):
        embedded = self.embedding(text) # (batch_size, sent_len, emb_size)
        embedded = embedded.sum(dim = 1) / lengths[:, None] # (add one axis)
        return torch.sigmoid(self.fc(embedded))

class LSTMcustom(nn.Module):
    def __init__(self, word_vec, embed_dim):
        super().__init__()
        # embeddingbag outputs the average of all the words in a sentence
        self.embedding = nn.Embedding(*(word_vec.size())).from_pretrained(word_vec, freeze=False)
        self.cnn = torch.nn.Conv1d(embed_dim,20,2)
        self.lstm = nn.LSTM(20, 200, 1, bidirectional=False, batch_first = True)      
        self.attention = SimpleAttention(200)  
        self.linears = nn.Sequential(
            nn.Linear(200, 128),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(128, num_labels),
#             nn.Softmax()
        )
        
                
    def forward(self, text):
        embedded = self.embedding(text) # (batch_size, sent_len, emb_size)
        # print("embedded: ", embedded.size())
        
        embedded = embedded.permute(0,2,1)
        cnn_out = self.cnn(embedded)
        # print("cnn_out: ", cnn_out.size())
        
        cnn_out = cnn_out.permute(0,2,1)
        lstm_out,_ = self.lstm(cnn_out) # lstm_out is a 3d tensor (batch_size, seq_len, output_size). If you have a bidirectional LSTM, the outputsize will be 2*output_size
        # print("lstm_out: ", lstm_out.size())
        lstm_out = lstm_out.permute(1,0,2)

        atten_out, alpha = self.attention(lstm_out)       
        # print("atten_out: ", atten_out.size()) 
        # print("atten_out (fc): ",self.fc(atten_out).size())
        
        return torch.log_softmax(torch.tanh(self.linears(atten_out)),1)

In [329]:
import time

def train(dataloader):
#     model.train()
    total_acc, total_count = 0, 0
    log_interval = 20
    start_time = time.time()

    for idx, (text, label) in enumerate(dataloader):
        model.train()
        optimizer.zero_grad()
        # forward propagation
        predicted_label = model(text)
        # label = torch.reshape(label,(len(label),1))
        # calculate loss and backpropagate to model paramters
#         print("predicted label size: ",predicted_label.size())
#         print("label size: ",label.size())
        loss = criterion(predicted_label, label)
        # print(loss)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        # update parameters by stepping the optimizer
        optimizer.step()
        predicted_label = torch.argmax(predicted_label,1)
        # print(predicted_label)
        total_acc += (predicted_label == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}| loss {:8f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count,loss.item()))
            total_acc, total_count = 0, 0
            start_time = time.time()
#             evaluate(val_loader)

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    print('val accuracy {:8.2f} | val loss {:8f}'.format(total_acc/total_count,loss.item()))
    return total_acc/total_count

In [332]:
# Hyperparameters
EPOCHS = 120 # epoch

model = LSTMcustom(word_vec=word_emb, embed_dim=EMBEDDING_DIM).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss().to(device)
total_accu = None


for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_loader)

| epoch   1 |    20/   28 batches | accuracy    0.195| loss 2.336855
| epoch   2 |    20/   28 batches | accuracy    0.211| loss 2.345699
| epoch   3 |    20/   28 batches | accuracy    0.211| loss 2.311393
| epoch   4 |    20/   28 batches | accuracy    0.214| loss 2.245397
| epoch   5 |    20/   28 batches | accuracy    0.223| loss 2.073625
| epoch   6 |    20/   28 batches | accuracy    0.231| loss 2.075429
| epoch   7 |    20/   28 batches | accuracy    0.232| loss 2.077781
| epoch   8 |    20/   28 batches | accuracy    0.240| loss 1.947394
| epoch   9 |    20/   28 batches | accuracy    0.254| loss 1.938933
| epoch  10 |    20/   28 batches | accuracy    0.246| loss 1.912148
| epoch  11 |    20/   28 batches | accuracy    0.250| loss 1.856289
| epoch  12 |    20/   28 batches | accuracy    0.275| loss 1.871767
| epoch  13 |    20/   28 batches | accuracy    0.241| loss 1.935206
| epoch  14 |    20/   28 batches | accuracy    0.270| loss 1.855201
| epoch  15 |    20/   28 batches 

In [333]:
accu_test = evaluate(test_loader)
print('test accuracy {:8.2f}%'.format(accu_test))

val accuracy     0.32 | val loss 2.176192
test accuracy     0.32%
