In [62]:
# Mount Google Drive at /content/drive; the dataset, pretrained embeddings and
# saved artifacts used below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [63]:
# Install AllenNLP and the companion allennlp-models package into the runtime.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve to a
# different release than the one this notebook was written against.
!pip install allennlp-models
!pip install allennlp



In [64]:
from typing import Dict, Iterable, List

import torch
import pandas as pd
from allennlp.data import DatasetReader, Instance, Vocabulary, TextFieldTensors
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WhitespaceTokenizer, CharacterTokenizer
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2VecEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding, TokenCharactersEncoder
from allennlp.modules.seq2vec_encoders import LstmSeq2VecEncoder, CnnEncoder
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.data_loaders import MultiProcessDataLoader, DataLoader


In [65]:
class SentimentDataReader(DatasetReader):
    """Read a CSV file with ``text`` and ``topic`` columns into AllenNLP instances.

    Every instance holds a ``tokens`` TextField and, when a label is supplied,
    a ``label`` LabelField.

    Parameters
    ----------
    tokenizer:
        Word-level tokenizer; defaults to ``WhitespaceTokenizer``.
    char_tokenizer:
        Character tokenizer used by the default ``token_characters`` indexer;
        defaults to ``CharacterTokenizer``.
    token_indexers:
        Mapping of indexer name to ``TokenIndexer``. When omitted, a word-id
        indexer (``tokens``) and a character indexer (``token_characters``)
        are created.
    max_tokens:
        If set, each text is truncated to at most this many tokens.
    **kwargs:
        Forwarded unchanged to ``DatasetReader.__init__``.
    """

    # Minimum number of characters every token is padded to by the default
    # character indexer. It matches the width-3 n-gram filter of the character
    # CNN built in this notebook, so a batch of very short tokens cannot
    # produce an input narrower than the convolution kernel.
    DEFAULT_MIN_CHAR_PADDING = 3

    def __init__(
        self,
        tokenizer: Tokenizer = None,
        char_tokenizer: CharacterTokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        max_tokens: int = None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.char_tokenizer = char_tokenizer or CharacterTokenizer()
        if token_indexers is None:
            self.token_indexers = {
                "tokens": SingleIdTokenIndexer(namespace="tokens"),
                # Pass the configured character tokenizer through; previously
                # it was stored on the reader but never used by the indexer.
                "token_characters": TokenCharactersIndexer(
                    namespace="token_characters",
                    character_tokenizer=self.char_tokenizer,
                    min_padding_length=self.DEFAULT_MIN_CHAR_PADDING,
                ),
            }
        else:
            self.token_indexers = token_indexers
        self.max_tokens = max_tokens

    def text_to_instance(self, text, label=None):
        """Turn one raw text (and optionally its label) into an ``Instance``.

        ``label`` may be omitted or ``None`` for unlabeled (inference) inputs,
        in which case the instance has no ``label`` field.
        """
        tokens = self.tokenizer.tokenize(text)
        if self.max_tokens:
            tokens = tokens[: self.max_tokens]

        fields = {"tokens": TextField(tokens, self.token_indexers)}
        # Test against None explicitly so that only a missing label is skipped,
        # not every falsy value.
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields)

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one instance per CSV row, using ``text`` as input and ``topic`` as label."""
        df = pd.read_csv(file_path)
        # Iterating the two columns directly avoids building a Series per row.
        for text, topic in zip(df['text'], df['topic']):
            yield self.text_to_instance(text, topic)


In [66]:
# Sanity check: print the first six instances produced from the validation CSV.
instances = SentimentDataReader().read('/content/drive/MyDrive/BTL_NLP/data/val.csv')
for shown, instance in enumerate(instances, start=1):
    print(instance)
    if shown > 5:
        break

Instance with fields:
 	 tokens: TextField of length 6 with text: 
 		[giáo, trình, chưa, cụ, thể, .]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer', 'token_characters': 'TokenCharactersIndexer'} 
 	 label: LabelField with label: program in namespace: 'labels'. 

Instance with fields:
 	 tokens: TextField of length 4 with text: 
 		[giảng, buồn, ngủ, .]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer', 'token_characters': 'TokenCharactersIndexer'} 
 	 label: LabelField with label: lecturer in namespace: 'labels'. 

Instance with fields:
 	 tokens: TextField of length 8 with text: 
 		[giáo, viên, vui, tính, ,, tận, tâm, .]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer', 'token_characters': 'TokenCharactersIndexer'} 
 	 label: LabelField with label: lecturer in namespace: 'labels'. 

Instance with fields:
 	 tokens: TextField of length 37 with text: 
 		[giảng, viên, nên, giao, bài, tập, nhiều, hơn, ,, chia, nhóm, để, làm, bài, tập, ,, giảng, kỹ,
		những, vấn



In [67]:
from allennlp.training.metrics import CategoricalAccuracy, F1Measure

class SimpleClassifier(Model):
    """Text classifier: embed tokens, encode them to one vector, apply a linear layer.

    Parameters
    ----------
    vocab:
        Vocabulary; the size of its ``labels`` namespace sets the number of
        output classes.
    embedder:
        Maps the indexed ``tokens`` field to per-token embeddings.
    encoder:
        Seq2Vec encoder that pools the embedded sequence into a single vector.
    """

    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder):
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        num_labels = vocab.get_vocab_size("labels")
        self.classifier = torch.nn.Linear(encoder.get_output_dim(), num_labels)
        self.accuracy = CategoricalAccuracy()
        # NOTE(review): F1Measure(1) reports precision/recall/F1 with respect to
        # label index 1 only; for a multi-class task a macro/micro-averaged
        # metric may be more informative - confirm this is intended.
        self.f1_measure = F1Measure(1)

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Run the classifier on a batch.

        Returns a dict with ``probs`` and ``logits``; when ``label`` is given
        it also contains ``loss`` and the accuracy/F1 metrics are updated.
        ``label`` defaults to ``None`` so the model can be called on unlabeled
        inputs at prediction time.
        """
        # Shape: (batch_size, num_tokens, embedding_dim)
        embedded_text = self.embedder(tokens)
        # Shape: (batch_size, num_tokens)
        mask = util.get_text_field_mask(tokens)
        # Shape: (batch_size, encoding_dim)
        encoded_text = self.encoder(embedded_text, mask)
        # Shape: (batch_size, num_labels)
        logits = self.classifier(encoded_text)
        # Shape: (batch_size, num_labels)
        # Normalise over the label dimension explicitly; calling softmax without
        # `dim` is deprecated and relies on an implicit choice of dimension.
        probs = torch.nn.functional.softmax(logits, dim=-1)
        output = {
            'probs': probs,
            'logits': logits
        }
        if label is not None:
            loss = torch.nn.functional.cross_entropy(logits, label)
            output['loss'] = loss
            self.accuracy(logits, label)
            self.f1_measure(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the F1-measure metrics together with ``accuracy``."""
        metrics_result = self.f1_measure.get_metric(reset)
        metrics_result['accuracy'] = self.accuracy.get_metric(reset)

        return metrics_result

In [68]:
import tempfile
from typing import Dict, Iterable, List, Tuple
import shutil
import os

import allennlp
import torch
from allennlp.data import (
    DataLoader,
    DatasetReader,
    Instance,
    Vocabulary,
    TextFieldTensors,
)
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WhitespaceTokenizer
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2VecEncoder
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.nn import util
from allennlp.training.trainer import GradientDescentTrainer, Trainer
from allennlp.training.optimizers import AdamOptimizer
from allennlp.training.metrics import CategoricalAccuracy


# Locations of the train / test / validation CSV splits on the mounted Google Drive.
train_path = '/content/drive/MyDrive/BTL_NLP/data/train.csv'
test_path = '/content/drive/MyDrive/BTL_NLP/data/test.csv'
val_path = '/content/drive/MyDrive/BTL_NLP/data/val.csv'

def build_model(
    vocab,
    embedding_dim=100,
    hidden_size=128,
    char_embedding_dim=34,
    dropout=0.4,
    bidirectional=True,
    word_embedding_pretrain_file=None
):
    """Assemble the word+character LSTM classifier.

    Parameters
    ----------
    vocab: vocabulary providing the ``tokens``, ``token_characters`` and ``labels`` namespaces.
    embedding_dim: size of the word embeddings.
    hidden_size: hidden size of each LSTM direction.
    char_embedding_dim: size of the character embeddings and number of CNN filters.
    dropout: dropout passed to the 2-layer LSTM encoder.
    bidirectional: whether the LSTM encoder is bidirectional.
    word_embedding_pretrain_file: optional pretrained word-embedding file for the word embedding.

    Returns
    -------
    A ``SimpleClassifier`` placed on the GPU when CUDA is available, otherwise left on the CPU.
    """
    # Word-level embedding, optionally initialised from a pretrained file.
    word_embedding = Embedding(
        embedding_dim=embedding_dim,
        vocab_namespace='tokens',
        vocab=vocab,
        pretrained_file=word_embedding_pretrain_file,
    )

    # Character-level embedding pooled per token by a CNN with one width-3 filter bank.
    character_embedding = Embedding(
        embedding_dim=char_embedding_dim,
        vocab_namespace='token_characters',
        vocab=vocab,
    )
    cnn_encoder = CnnEncoder(
        embedding_dim=char_embedding_dim,
        num_filters=char_embedding_dim,
        ngram_filter_sizes=(3,),
    )
    token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)

    # Keys must match the token indexer names used by the dataset reader.
    embedder = BasicTextFieldEmbedder(
        {
            "tokens": word_embedding,
            "token_characters": token_encoder
        }
    )

    encoder = LstmSeq2VecEncoder(
        input_size=embedder.get_output_dim(),
        hidden_size=hidden_size,
        num_layers=2,
        bidirectional=bidirectional,
        dropout=dropout,
    )

    model = SimpleClassifier(
        vocab,
        embedder,
        encoder
    )
    # Only move to the GPU when one exists; an unconditional .to('cuda')
    # raises on a CPU-only runtime.
    if torch.cuda.is_available():
        model.to('cuda')
    return model

def build_dataset_reader(max_tokens=80) -> DatasetReader:
    """Create the CSV dataset reader, limiting each text to ``max_tokens`` tokens."""
    reader = SentimentDataReader(max_tokens=max_tokens)
    return reader


def read_data(reader: DatasetReader, train_path, val_path) -> Tuple[List[Instance], List[Instance]]:
    """Load the training and validation splits fully into memory.

    The training file is read first, then the validation file; each result
    of ``reader.read`` is materialised as a list.
    """
    print("Reading data")
    splits = [list(reader.read(path)) for path in (train_path, val_path)]
    return splits[0], splits[1]


def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    """Build a vocabulary from the given instances via ``Vocabulary.from_instances``."""
    print("Building the vocabulary")
    vocabulary = Vocabulary.from_instances(instances)
    return vocabulary

def build_data_loaders(
    train_data: List[Instance],
    dev_data: List[Instance],
    batch_size=64
) -> Tuple[DataLoader, DataLoader]:
    """Wrap both splits in ``SimpleDataLoader``s sharing one batch size.

    The training loader shuffles its instances; the validation loader does not.
    """
    def _make_loader(split, shuffle_flag):
        return SimpleDataLoader(split, batch_size, shuffle=shuffle_flag)

    return _make_loader(train_data, True), _make_loader(dev_data, False)

def build_trainer(
    model,
    serialization_dir,
    train_loader,
    dev_loader,
    num_epochs=7,
    grad_clipping=5,
    weight_decay=0.0001
):
    """Create a ``GradientDescentTrainer`` with an Adam optimizer (lr=0.001).

    Parameters
    ----------
    model: the model to train.
    serialization_dir: directory handed to the trainer as its serialization directory.
    train_loader / dev_loader: data loaders for training and validation.
    num_epochs: number of training epochs.
    grad_clipping: gradient clipping value passed to the trainer.
    weight_decay: weight decay passed to the Adam optimizer.
    """
    # Only parameters that require gradients are handed to the optimizer.
    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=0.001, weight_decay=weight_decay)  # type: ignore

    # Use GPU 0 when available and fall back to the CPU (-1) otherwise, instead
    # of hard-coding device 0, which fails on a CPU-only runtime.
    cuda_device = 0 if torch.cuda.is_available() else -1

    trainer = GradientDescentTrainer(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=num_epochs,
        optimizer=optimizer,
        grad_clipping=grad_clipping,
        cuda_device=cuda_device
    )
    return trainer

def run_training_loop(
    serialization_dir='models',
    checkpoint=False,
    embedding_dim=100,
    hidden_size=128,
    char_embedding_dim=34,
    dropout=0.4,
    bidirectional=True,
    word_embedding_pretrain_file=None,
    num_epochs=7,
    grad_clipping=5,
    weight_decay=0.0001,
    max_tokens=80,
    batch_size=128
):
    """Read the data, build vocabulary and model, then train.

    Parameters
    ----------
    serialization_dir: directory given to the trainer for its outputs.
    checkpoint: when ``False``, an existing ``serialization_dir`` is deleted
        before training; any other value leaves the directory in place.
    embedding_dim, hidden_size, char_embedding_dim, dropout, bidirectional,
    word_embedding_pretrain_file: forwarded to ``build_model``.
    num_epochs, grad_clipping, weight_decay: forwarded to ``build_trainer``.
    max_tokens: per-text token limit for the dataset reader (previously fixed at 80).
    batch_size: batch size of both data loaders (previously fixed at 128).

    Returns
    -------
    ``(model, dataset_reader, vocab)``
    """
    print('num_epochs', num_epochs)
    print('word_embedding_pretrain_file', word_embedding_pretrain_file)
    print('dropout', dropout)
    print('grad_clipping', grad_clipping)
    print('weight_decay', weight_decay)

    # Start from a clean serialization directory unless asked to keep it.
    if checkpoint is False:
        if os.path.exists(serialization_dir):
            shutil.rmtree(serialization_dir)

    dataset_reader = build_dataset_reader(max_tokens=max_tokens)

    train_data, dev_data = read_data(dataset_reader, train_path, val_path)

    # The vocabulary covers both the training and the validation instances.
    vocab = build_vocab(train_data + dev_data)

    print(vocab)
    model = build_model(
        vocab,
        embedding_dim=embedding_dim,
        hidden_size=hidden_size,
        char_embedding_dim=char_embedding_dim,
        dropout=dropout,
        bidirectional=bidirectional,
        word_embedding_pretrain_file=word_embedding_pretrain_file
    )

    print(model)

    train_loader, dev_loader = build_data_loaders(train_data, dev_data, batch_size=batch_size)
    # The loaders hold raw instances; they must be indexed with the vocabulary
    # before batches can be produced.
    train_loader.index_with(vocab)
    dev_loader.index_with(vocab)

    trainer = build_trainer(
        model,
        serialization_dir,
        train_loader,
        dev_loader,
        num_epochs=num_epochs,
        grad_clipping=grad_clipping,
        weight_decay=weight_decay
    )

    trainer.train()

    return model, dataset_reader, vocab

In [69]:
# Hyperparameters for this training run, gathered in one place.
training_config = {
    'serialization_dir': 'models',
    'checkpoint': False,
    'bidirectional': True,
    'embedding_dim': 100,
    'hidden_size': 128,
    'char_embedding_dim': 34,
    'dropout': 0.4,
    'word_embedding_pretrain_file': '/content/drive/MyDrive/BTL_NLP/pretrained/viki_adapt/viki_adapt_w2v.txt',
    'num_epochs': 5,
    'grad_clipping': 5,
    'weight_decay': 0.0001,
}
model, dataset_reader, vocab = run_training_loop(**training_config)

num_epochs 5
word_embedding_pretrain_file /content/drive/MyDrive/BTL_NLP/pretrained/viki_adapt/viki_adapt_w2v.txt
dropout 0.4
grad_clipping 5
weight_decay 0.0001
Reading data


building vocab:  11%|#         | 1376/13009 [00:00<00:00, 13752.83it/s]

Building the vocabulary


building vocab: 100%|##########| 13009/13009 [00:00<00:00, 13667.23it/s]
100%|##########| 20577/20577 [00:00<00:00, 145771.98it/s]


Vocabulary with namespaces:
 	Non Padded Namespaces: {'*tags', '*labels'}
 	Namespace: tokens, Size: 2620 
 	Namespace: token_characters, Size: 120 
 	Namespace: labels, Size: 4 

SimpleClassifier(
  (embedder): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
    (token_embedder_token_characters): TokenCharactersEncoder(
      (_embedding): TimeDistributed(
        (_module): Embedding()
      )
      (_encoder): TimeDistributed(
        (_module): CnnEncoder(
          (_activation): ReLU()
          (conv_layer_0): Conv1d(34, 34, kernel_size=(3,), stride=(1,))
        )
      )
    )
  )
  (encoder): LstmSeq2VecEncoder(
    (_module): LSTM(134, 128, num_layers=2, batch_first=True, dropout=0.4, bidirectional=True)
  )
  (classifier): Linear(in_features=256, out_features=4, bias=True)
)


You provided a validation dataset but patience was set to None, meaning that early stopping is disabled
precision: 0.6386, recall: 0.6206, f1: 0.6295, accuracy: 0.8145, batch_loss: 0.4253, loss: 0.4960 ||: 100%|##########| 90/90 [00:05<00:00, 16.80it/s]
precision: 0.6894, recall: 0.7566, f1: 0.7214, accuracy: 0.8629, batch_loss: 0.2760, loss: 0.3455 ||: 100%|##########| 13/13 [00:00<00:00, 52.28it/s]
precision: 0.7677, recall: 0.7974, f1: 0.7823, accuracy: 0.8879, batch_loss: 0.3297, loss: 0.3113 ||: 100%|##########| 90/90 [00:05<00:00, 17.10it/s]
precision: 0.6706, recall: 0.8464, f1: 0.7483, accuracy: 0.8718, batch_loss: 0.2953, loss: 0.3497 ||: 100%|##########| 13/13 [00:00<00:00, 53.75it/s]
precision: 0.8085, recall: 0.8305, f1: 0.8194, accuracy: 0.9075, batch_loss: 0.1998, loss: 0.2611 ||: 100%|##########| 90/90 [00:05<00:00, 17.19it/s]
precision: 0.7560, recall: 0.7079, f1: 0.7311, accuracy: 0.8749, batch_loss: 0.2414, loss: 0.3355 ||: 100%|##########| 13/13 [00:00<00:00, 53.83it

In [70]:
from allennlp.training.util import evaluate

# Evaluate the trained model on the test split.
test_data = list(dataset_reader.read(test_path))
data_loader = SimpleDataLoader(test_data, batch_size=64)
# Index the test instances with the same vocabulary the model was built with.
data_loader.index_with(model.vocab)
# Save the vocabulary to the Drive project folder (the best checkpoint is copied there as well).
vocab.save_to_files('/content/drive/MyDrive/BTL_NLP/topic/vocabulary')
# No cuda_device is passed to evaluate(), so the model is moved to the CPU first.
# Note that this leaves the global `model` on the CPU for later cells.
results = evaluate(model.to('cpu'), data_loader)
print(results)

vocabulary serialization directory /content/drive/MyDrive/BTL_NLP/topic/vocabulary is not empty
precision: 0.77, recall: 0.68, f1: 0.72, accuracy: 0.87, loss: 0.34 ||: : 50it [00:03, 16.48it/s]

{'precision': 0.7652859687805176, 'recall': 0.6783216595649719, 'f1': 0.719184398651123, 'accuracy': 0.8711307643714467, 'loss': 0.3364033916592598}





In [71]:
# Copy models/best.th (from the training serialization directory) to the Drive project folder.
!cp -rf models/best.th /content/drive/MyDrive/BTL_NLP/topic

In [None]:
def predict(text: str):
    """Predict the label of a single raw text with the trained global ``model``.

    Relies on the notebook globals ``model``, ``dataset_reader`` and ``vocab``
    returned by ``run_training_loop``.

    Parameters
    ----------
    text: the raw input sentence.

    Returns
    -------
    The predicted label string from the ``labels`` vocabulary namespace.
    """
    # Local import: Batch is not part of the notebook's import cells.
    from allennlp.data.batch import Batch

    # Go through the dataset reader so tokenisation, truncation and token
    # indexers are exactly the ones used for training.
    instance = dataset_reader.text_to_instance(text, None)
    batch = Batch([instance])
    batch.index_instances(vocab)

    # Put the input tensors on whichever device the model currently lives on.
    device = next(model.parameters()).device
    model_input = util.move_to_device(batch.as_tensor_dict(), device)

    # Eval mode disables dropout; no_grad avoids building an autograd graph.
    model.eval()
    with torch.no_grad():
        output = model(tokens=model_input['tokens'], label=None)

    # `probs` has shape (1, num_labels) for this single-instance batch.
    label_index = int(output['probs'][0].argmax(dim=-1))
    return vocab.get_token_from_index(label_index, namespace='labels')

In [None]:
# Error analysis: predict every test text and split the rows into wrong / right predictions.
# The frame is loaded here explicitly so the cell does not depend on a variable
# defined outside this notebook.
test_df = pd.read_csv(test_path)
texts = test_df['text'].tolist()
predictions = [predict(text) for text in texts]

# The model is trained on the `topic` column, so predictions are compared with
# `topic` (the previous code compared them with a `sentiment` column).
test_df['topic_pred'] = predictions
is_correct = test_df['topic'] == test_df['topic_pred']

# NOTE(review): the output file names still say "sent" although they now hold
# topic predictions; rename them if nothing else depends on these paths.
test_df_false = test_df[~is_correct]
test_df_false.to_csv('/content/drive/MyDrive/BTL_NLP/false_sent_pred.csv', index=False)

test_df_true = test_df[is_correct]
test_df_true.to_csv('/content/drive/MyDrive/BTL_NLP/true_sent_pred.csv', index=False)