In [30]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import clear_output

import torch
import torch.nn as nn
import transformers
from transformers import (
    AutoModel, AutoConfig, 
    AutoTokenizer, logging
)

from parameters import *
from autoencoder import *
from utils import *

In [31]:
# Read the three essay tables whose paths are looked up in TRAIN_DIR
# (TRAIN_DIR is not defined in this notebook; presumably it comes from
# one of the star-imports above -- TODO confirm).
current_ds = pd.read_csv(TRAIN_DIR['current'])
old_ds = pd.read_csv(TRAIN_DIR['old'])
hewlett = pd.read_csv(TRAIN_DIR['hewlett'])

# Essay texts from the current dataset; `y` below is taken from the same frame.
label_X = current_ds['full_text'].to_list()

# Texts without targets: the old dataset's texts followed by the Hewlett essays.
unlabled_X = [*old_ds['texts'].to_list(), *hewlett['essay'].to_list()]

# Only the column at position 2 is kept here.
# NOTE(review): a later cell rebuilds `y` from `iloc[:, 2:]` (all columns
# from position 2 on) -- confirm which of the two is intended.
y = current_ds.iloc[:, 2].values

In [32]:
class LSTMPooling(nn.Module):
    """Pool the per-layer first-token vectors of a transformer with an LSTM.

    For every example, the first-token ([CLS]) hidden state of layers
    ``1..num_layers`` is collected into a sequence of length ``num_layers``
    (one step per layer), run through a single-layer LSTM, and the LSTM
    output at the last step is returned after dropout.

    Args:
        num_layers: Number of hidden layers to read; entries ``1..num_layers``
            of ``all_hidden_states`` are used (entry 0 is skipped).
        hidden_size: Width of each hidden-state vector (LSTM input size).
        hiddendim_lstm: Width of the LSTM hidden state (output size).
    """

    def __init__(self, num_layers, hidden_size, hiddendim_lstm):
        super(LSTMPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm
        self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        self.dropout = nn.Dropout(0.1)

    def forward(self, all_hidden_states):
        """Return one pooled vector per example.

        Args:
            all_hidden_states: Indexable by layer; each entry is indexed as
                ``[:, 0]`` to take the first token, so each entry is expected
                to be ``(batch, seq_len, hidden_size)``.

        Returns:
            Tensor of shape ``(batch, hiddendim_lstm)``.
        """
        # First-token vector of each layer: a list of (batch, hidden_size).
        # No .squeeze() here: [:, 0] already removes the sequence axis, and
        # squeezing would also drop the batch axis when batch == 1.
        cls_per_layer = [
            all_hidden_states[layer_i][:, 0]
            for layer_i in range(1, self.num_hidden_layers + 1)
        ]
        # Stack along a new axis 1 so the result is directly
        # (batch, num_layers, hidden_size), the layout a batch_first LSTM
        # expects. Stacking on the last axis and then calling .view() would
        # reinterpret memory without transposing and scramble the features.
        hidden_states = torch.stack(cls_per_layer, dim=1)
        out, _ = self.lstm(hidden_states, None)
        # Keep only the output at the final step (i.e. after the last layer).
        out = self.dropout(out[:, -1, :])
        return out

In [33]:
# Use the first CUDA device when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [34]:
# --- Configuration for feature extraction ---
max_seq_length = 256
_pretrained_model = 'roberta-base'
hiddendim_lstm = 256
_embed_batch_size = 16
_pooler_seed = 42

# Load the encoder with per-layer hidden states enabled; the pooler below
# consumes the hidden states of every layer.
config = AutoConfig.from_pretrained(_pretrained_model)
config.update({'output_hidden_states':True})
model = AutoModel.from_pretrained(_pretrained_model, config=config)
model = model.to(device)
model.eval()  # inference only: make sure dropout inside the encoder is off
tokenizer = AutoTokenizer.from_pretrained(_pretrained_model)

clear_output()

# Build ONE pooler for the whole pass. Creating a fresh LSTMPooling inside
# the loop would give every batch its own random LSTM weights (and, in train
# mode, active dropout), so rows from different batches would not live in the
# same feature space. The pooler is never trained in this notebook, so it
# acts as a fixed random projection; seeding right before construction makes
# that projection reproducible, and using the same seed wherever embeddings
# are extracted keeps them comparable.
torch.manual_seed(_pooler_seed)
pooler = LSTMPooling(config.num_hidden_layers, config.hidden_size, hiddendim_lstm)
pooler = pooler.to(device)
pooler.eval()  # disable the pooler's dropout for deterministic features

embeddings = []

for i in tqdm(range(0, len(label_X), _embed_batch_size)):
    # Clamp the slice end so the final, shorter batch is handled.
    tail = min(i + _embed_batch_size, len(label_X))

    features = tokenizer.batch_encode_plus(
        label_X[i:tail],
        add_special_tokens=True,
        padding='max_length',
        max_length=max_seq_length,
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True
    )
    features = features.to(device)

    # Nothing here is trained, so run encoder AND pooler without autograd.
    with torch.no_grad():
        outputs = model(features['input_ids'], features['attention_mask'])
        # outputs[2] is expected to be the tuple of per-layer hidden states
        # (available because output_hidden_states=True was set on the config).
        all_hidden_states = torch.stack(outputs[2])
        pooled = pooler(all_hidden_states)

    embeddings.append(pooled.cpu().numpy())


100%|██████████| 245/245 [00:22<00:00, 11.02it/s]


In [35]:
# Join the per-batch arrays along the row axis (axis 0 is np.concatenate's
# default) and show the resulting shape as the cell output.
embeddings_np = np.concatenate(embeddings)
embeddings_np.shape

(3911, 256)

In [36]:
# Persist the labeled-essay feature matrix next to the notebook.
np.save(file='labeled_data.npy', arr=embeddings_np)

In [37]:
hiddendim_lstm = 256
_embed_batch_size = 16
_pooler_seed = 42

# Build ONE pooler for the whole pass. Creating a fresh LSTMPooling inside
# the loop would give every batch its own random LSTM weights (and, in train
# mode, active dropout), so rows from different batches would not live in the
# same feature space. The pooler is never trained in this notebook, so it
# acts as a fixed random projection; seeding right before construction makes
# that projection reproducible, and using the same seed wherever embeddings
# are extracted keeps them comparable.
torch.manual_seed(_pooler_seed)
pooler = LSTMPooling(config.num_hidden_layers, config.hidden_size, hiddendim_lstm)
pooler = pooler.to(device)
pooler.eval()  # disable the pooler's dropout for deterministic features
model.eval()   # inference only: keep the encoder's dropout off as well

embeddings_2 = []

for i in tqdm(range(0, len(unlabled_X), _embed_batch_size)):
    # Clamp the slice end so the final, shorter batch is handled.
    tail = min(i + _embed_batch_size, len(unlabled_X))

    features = tokenizer.batch_encode_plus(
        unlabled_X[i:tail],
        add_special_tokens=True,
        padding='max_length',
        max_length=max_seq_length,
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True
    )
    features = features.to(device)

    # Nothing here is trained, so run encoder AND pooler without autograd.
    with torch.no_grad():
        outputs = model(features['input_ids'], features['attention_mask'])
        # outputs[2] is expected to be the tuple of per-layer hidden states
        # (available because output_hidden_states=True was set on the config).
        all_hidden_states = torch.stack(outputs[2])
        pooled = pooler(all_hidden_states)

    embeddings_2.append(pooled.cpu().numpy())


100%|██████████| 1786/1786 [02:47<00:00, 10.68it/s]


In [38]:
# Join the per-batch arrays along the row axis (axis 0 is np.concatenate's
# default) and show the resulting shape as the cell output.
embeddings_2_np = np.concatenate(embeddings_2)
embeddings_2_np.shape

(28570, 256)

In [39]:
# Persist the unlabeled-essay feature matrix next to the notebook.
np.save(file='unlabeled_data.npy', arr=embeddings_2_np)

In [40]:
import pandas as pd

# Re-read the current training table from its hard-coded relative path
# (this rebinds `current_ds`).
current_ds = pd.read_csv('./dataset/current/train.csv')

# Every column from position 2 onward becomes the label array; this
# overwrites the earlier `y`, which held only the column at position 2.
y = current_ds.iloc[:, 2:].values

# Store the label array alongside the saved feature matrices.
np.save(file='labels.npy', arr=y)

In [4]:
import pandas as pd

# Load the raw Hewlett training table for inspection.
# NOTE(review): this rebinds `current_ds`, which earlier cells used for the
# *current* dataset -- anything run after this cell sees the Hewlett frame.
current_ds = pd.read_csv(filepath_or_buffer='./dataset/hewlett/train.csv')

# Bare last expression: rendered as the cell's output.
current_ds

Unnamed: 0.1,Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,...,,,,,,,,,,
1,1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,...,,,,,,,,,,
2,2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,...,,,,,,,,,,
3,3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,...,,,,,,,,,,
4,4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12971,12971,21626,8,In most stories mothers and daughters are eit...,17,18,,35,,,...,4.0,4.0,4.0,3.0,,,,,,
12972,12972,21628,8,I never understood the meaning laughter is th...,15,17,,32,,,...,4.0,4.0,4.0,3.0,,,,,,
12973,12973,21629,8,"When you laugh, is @CAPS5 out of habit, or is ...",20,26,40.0,40,,,...,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0
12974,12974,21630,8,Trippin' on fen...,20,20,,40,,,...,4.0,4.0,4.0,4.0,,,,,,
