<a href="https://colab.research.google.com/github/sungmino/TestGit/blob/master/SAV_smn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!nvidia-smi

In [None]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

In [None]:
!pip install fairseq

In [None]:
!pip install transformers

In [None]:
!pip install fastBPE

In [None]:
!pip install vncorenlp

In [None]:
# ---------- Train model ----------------------

from vncorenlp import VnCoreNLP
rdrsegmenter = VnCoreNLP("/content/drive/MyDrive/SAV_Project/vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes',
                        default="/content/drive/MyDrive/SAV_Project/PhoBERT_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE'
                        )
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)
    # Load the dictionary
vocab = Dictionary()
vocab.add_from_file("/content/drive/MyDrive/SAV_Project/PhoBERT_base_transformers/dict.txt")




In [None]:
bpe.encode("Hôm_nay trời nóng quá nên tôi ở nhà viết Viblo!")

'Hôm_nay trời nóng quá nên tôi ở nhà viết Vi@@ blo@@ !'

In [None]:
import re
import json
from tqdm import tqdm_notebook

train_path = '/content/drive/MyDrive/SAV_Project/data/train/train.json'
test_path = '/content/drive/MyDrive/SAV_Project/data/test/test.json'

def load_data_format(filename1):
    with open(filename1, 'r') as file:
        contents = json.load(file)
        file.close()
    return contents

train_id, train_text, train_label = [], [], []
test_id, test_text = [], []

train_data = load_data_format(train_path)
train_data = train_data["reviews"]
for k in tqdm_notebook(train_data):
    train_id.append(k["id"])
    text = k["comment"]
    text = rdrsegmenter.tokenize(text)
    text = ' '.join([' '.join(x) for x in text])
    train_text.append(text)
    train_label.append(k["label"])

test_data = load_data_format(test_path)
test_data = test_data["reviews"]
for n in tqdm_notebook(test_data):
    test_id.append(n["id"])
    texts = n["comment"]
    texts = rdrsegmenter.tokenize(texts)
    texts = ' '.join([' '.join(x) for x in texts])
    test_text.append(texts)



In [None]:
train_text

In [None]:
from sklearn.model_selection import train_test_split

train_sents, val_sents, train_labels, val_labels = train_test_split(train_text, train_label, test_size=0.1)

In [None]:
val_sents

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LEN = 150
train_ids = []
for tx in train_sents:
    subwords = '<s> ' + bpe.encode(tx) + ' </s>'
    encoded_tx = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    train_ids.append(encoded_tx)
val_ids = []
for tx in val_sents:
    subwords = '<s> ' + bpe.encode(tx) + ' </s>'
    encoded_tx = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    val_ids.append(encoded_tx)

# tao list id của các subword có trong từ điển
train_ids = pad_sequences(train_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
val_ids = pad_sequences(val_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

In [None]:
 # tao mask, mask cho biet cac value nao da duoc padding
train_masks = []
for tx in train_ids:
    mask = [int(token_id > 0) for token_id in tx]
    train_masks.append(mask)
val_masks = []
for tx in val_ids:
    mask = [int(token_id > 0) for token_id in tx]
    val_masks.append(mask)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

# tao dataloader
train_inputs = torch.tensor(train_ids)
val_inputs = torch.tensor(val_ids)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = SequentialSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=32)

In [None]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f3b352664d0>

In [None]:
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW

config = RobertaConfig.from_pretrained(
        "/content/drive/MyDrive/SAV_Project/PhoBERT_base_transformers/config.json", from_tf=False, num_labels=2, output_hidden_states=False,
    )

BERT_SA = RobertaForSequenceClassification.from_pretrained(
      "/content/drive/MyDrive/SAV_Project/PhoBERT_base_transformers/model.bin",
      config=config
    )

In [None]:
BERT_SA.cuda()
print('OK DONE...')

In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    F1_score = f1_score(pred_flat, labels_flat, average='macro')
    
    return accuracy_score(pred_flat, labels_flat), F1_score

In [None]:
import random
from tqdm import tqdm_notebook
device = 'cuda'
epochs = 4
from collections import defaultdict


param_optimizer = list(BERT_SA.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)

history = defaultdict(list)
best_accuracy = 0

for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_loss = 0
    BERT_SA.train()
    train_accuracy = 0
    nb_train_steps = 0
    train_f1 = 0
    if torch.cuda.is_available():
          BERT_SA.to(device)
    for step, batch in tqdm_notebook(enumerate(train_dataloader)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        BERT_SA.zero_grad()
        outputs = BERT_SA(b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask, 
            labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_train_accuracy, tmp_train_f1 = flat_accuracy(logits, label_ids)
        train_accuracy += tmp_train_accuracy
        train_f1 += tmp_train_f1
        nb_train_steps += 1
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(BERT_SA.parameters(), 1.0)
        optimizer.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
    print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
    print(" Average training loss: {0:.4f}".format(avg_train_loss))
    train_acc = "{0:.4f}".format(train_accuracy/nb_train_steps)
    train_f1c = "{0:.4f}".format(train_f1/nb_train_steps)
    train_loss = "{0:.4f}".format(avg_train_loss)

    print("Running Validation...")
    BERT_SA.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_f1 = 0
    for batch in tqdm_notebook(val_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = BERT_SA(b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_eval_accuracy, tmp_eval_f1 = flat_accuracy(logits, label_ids)

            eval_accuracy += tmp_eval_accuracy
            eval_f1 += tmp_eval_f1
            nb_eval_steps += 1
    print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
    print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))

    val_acc = "{0:.4f}".format(eval_accuracy/nb_eval_steps)
    val_f1c = "{0:.4f}".format(eval_f1/nb_eval_steps)

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['train_f1'].append(train_f1c)
    history['val_acc'].append(val_acc)
    history['val_f1'].append(val_f1c)

    if (float(val_acc) > best_accuracy):
      torch.save(BERT_SA.state_dict(), '/content/drive/MyDrive/SAV_Project/best_model_state.bin')
      best_accuracy = val_acc
print("Training complete!") 

# ------------------ End train model ------------------------

In [8]:
# ---------------------- Predict text------------------------

from vncorenlp import VnCoreNLP
rdrsegmenter = VnCoreNLP("/content/drive/MyDrive/SAV_Project/vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse
import re
import json
import time
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW
import random
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from collections import defaultdict
from string import punctuation

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes',
                        default="/content/drive/MyDrive/SAV_Project/PhoBERT_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE'
                        )
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)
    # Load the dictionary
vocab = Dictionary()
vocab.add_from_file("/content/drive/MyDrive/SAV_Project/PhoBERT_base_transformers/dict.txt")
config = RobertaConfig.from_pretrained(
        "/content/drive/MyDrive/SAV_Project/PhoBERT_base_transformers/config.json", from_tf=False, num_labels=2, output_hidden_states=False,
    )

model = RobertaForSequenceClassification.from_pretrained(
      "/content/drive/MyDrive/SAV_Project/best_model_state.bin",
      config=config
    )
device = 'cuda'

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


In [10]:
def remove_punctuation(list_string):
    """xóa các punctuation ra khỏi từ và câu"""
    # nếu là ký tự đặc biệt thì xóa
    for token in list_string.split(' '):
        if token in punctuation:
            list_string = list_string.replace(token, '')
        return list_string

def removie_Special_characters(list_string):
    string_check = re.compile('[@_!#$%^&*()<>?/\|¥+}{~:;]')
    list_string = re.sub(string_check, ' ', list_string)
    return list_string

def normalizeString(list_strings):
    """Tách dấu ra khỏi từ"""
    str = list_strings.lower()
    # Tách dấu câu nếu kí tự liền nhau
    marks = '[.!?,-${}()]'
    r = "([" + "\\".join(marks) + "])"
    str = re.sub(r, r" \1 ", str)
    # Thay thế nhiều spaces bằng 1 space.
    str = re.sub(r"\s+", r" ", str).strip()
    return str

def strip_emoji(text):
    """Xóa icons"""
    RE_EMOJI = re.compile(
        u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])|([\U0001F1E0-\U0001F1FF])|([\U0001F300-\U0001F5FF])|([\U0001F600-\U0001F64F])|([\U0001F680-\U0001F6FF])|([\U0001F700-\U0001F77F])|([\U0001F780-\U0001F7FF])|([\U0001F800-\U0001F8FF])|([\U0001F900-\U0001F9FF])|([\U0001FA00-\U0001FA6F])|([\U0001FA70-\U0001FAFF])|([\U00002702-\U000027B0])')
    return RE_EMOJI.sub(r'', text)

def remove_duplicate_characters(list_string):
    """Xóa các ký tự kéo dài: vd: đẹppppppp"""
    list_string = re.sub(r'([A-Z])\1+', lambda m: m.group(1), list_string, flags=re.IGNORECASE)
    return list_string

def remove_char_spams(list_string):
        for token in list_string.split(' '):
            if (len(token) > 8):
                list_string = list_string.replace(token, '')
        return list_string

def replace_abbreviations(text):
        """Chuẩn hóa từ"""
        text = remove_duplicate_characters(text)
        replace_list = {
            ' ô kêi ': ' ok ', ' okie ': ' ok ', ' o kê ': ' ok ',
            ' okey ': ' ok ', ' ôkê ': ' ok ', ' oki ': ' ok ', ' oke ': ' ok ', ' okay ': ' ok ', ' okê ': ' ok ',
            ' tks ': u' cám ơn ', ' thks ': u' cám ơn ', ' thanks ': u' cám ơn ', ' ths ': u' cám ơn ',
            ' thank ': u' cám ơn ',
            ' not ': u' không ', u' kg ': u' không ', ' kh ': u' không ', ' kp ': u' không phải ', u' kô ': u' không ',
            '"ko ': u' không ', u' ko ': u' không ', u' k ': u' không ', ' khong ': u' không ', u' hok ': u' không ',
            ' he he ': ' positive ', ' hehe ': ' positive ', ' hihi ': ' positive ', ' haha ': ' positive ',
            ' hjhj ': ' positive ',
            ' lol ': ' nagative ', ' cc ': ' nagative ', ' cute ': u' dễ thương ', ' huhu ': ' nagative ', ' vs ': u' với ',
            ' wa ': ' quá ', ' wá ': u' quá ', ' j ': u' gì ', '“': ' ',
            ' sz ': u' cỡ ', ' size ': u' cỡ ', u' đx ': u' được ', ' dc ': u' được ', ' đk ': u' được ',
            ' đc ': u' được ', ' authentic ': u' chuẩn chính hãng ', u' aut ': u' chuẩn chính hãng ',
            u' auth ': u' chuẩn chính hãng ', ' thick ': u' positive ', ' store ': u' cửa hàng ',
            ' shop ': u' cửa hàng ', 'sp': u' sản phẩm ', 'gud': u' tốt ', ' god ': u' tốt ', ' wel done ': ' tốt ',
            ' good ': u' tốt ', ' gút ': u' tốt ',
            ' sấu ': u' xấu ', ' gut ': u' tốt ', u' tot ': u' tốt ', u' nice ': u' tốt ', ' perfect ': ' rất tốt ',
            ' bt ': u' bình thường ', ' bthg ': u' bình thường ', ' thg ': u' thường ',
            ' time ': u' thời gian ', 'qá': u' quá ', u' ship ': u' giao ', u' m ': u' mình ', u' mik ': u' mình ',
            ' ê ̉': 'ể', 'product': 'sản phẩm', ' quality ': ' chất lượng ', ' chat ': ' chất ', ' excelent ': ' hoàn hảo ',
            ' bad ': ' tệ ', ' fresh ': ' tươi ', ' sad ': ' tệ ',
            ' date ': u' hạn sử dụng ', ' hsd ': u' hạn sử dụng ', ' quickly ': u' nhanh ', ' quick ': u' nhanh ',
            ' fast ': u' nhanh ', ' delivery ': u' giao hàng ', u' síp ': u' giao hàng ', ' shiper ': u' người giao hàng ',
            ' beautiful ': u' đẹp tuyệt vời ', u' tl ': u' trả lời ', u' r ': u' rồi ', u' shopE ': u' cửa hàng ',
            u' order ': u' đặt hàng ',
            ' chất lg ': u' chất lượng ', u' sd ': u' sử dụng ', u' dt ': u' điện thoại ', u' nt ': u' nhắn tin ',
            u' sài ': u' xài ', u' bjo ': u' bao giờ ',
            ' thik ': u' thích ', u' sop ': u' cửa hàng ', ' fb ': ' facebook ', ' face ': ' facebook ', ' very ': u' rất ',
            u' quả ng ': u' quảng ', ' tc ': u' tính chất ',
            ' dep ': u' đẹp ', u' xau ': u' xấu ', ' delicious ': u' ngon ', u' hàg ': u' hàng ', u' qủa ': u' quả ',
            ' iu ': u' yêu ', ' fake ': u' giả mạo ', ' trl ': ' trả lời ', ' >< ': u' positive ',
            ' por ': u' tệ ', ' poor ': u' tệ ', ' ib ': u' nhắn tin ', ' rep ': u' trả lời ', u' fback ': ' feedback ',
            ' fedback ': ' feedback ', ' tg ': u' thời gian ', ' sp ': u' sản phẩm ', ' cg ': u' cũng ',
            ' mk ': u' mình ', ' vs ': u' với ', ' qc ': u' quảng cáo ', ' mng ': u' mọi người ', ' mn ': u' mọi người ',
            ' kb ': u' không biết ', ' e ': u' em ', ' ak ': u' à ', ' bh ': u' bao giờ ', ' bn ': u' bao nhiêu ',
            ' cỡ mình ': u' cỡ M ', ' ntn ': u' như thế nào ', ' z ': u' vậy ', u' nhìu ': u' nhiều ', ' ah ': u' à ',
            ' vlin ': ' negative ', ' vl ': ' negative ', ' vch ': ' negative ', ' vcl ': ' negative ', ' cmt ': u' bình luận '
        }

        for k, v in replace_list.items():
            text = text.replace(k, v)
            # xóa số  ra khỏi chuỗi
        text = re.sub(r"\d+", "", text)

        return text



def normalize_text(list_string):
    list_string = list_string.lower().replace('\n', ' ')
    list_string = removie_Special_characters(list_string)
    list_string = strip_emoji(list_string)
    list_string = remove_duplicate_characters(list_string)
    list_string = remove_char_spams(list_string)
    list_string = replace_abbreviations(list_string)
    list_string = remove_punctuation(list_string)
    list_string = normalizeString(list_string)
    return list_string


def prepare_text(list_string = [], *args):
  MAX_LEN = 150

  text_ids = []
  for tx in list_string:
    tx = normalize_text(tx)
    text_tokenizer = rdrsegmenter.tokenize(tx)
    texts_tk = ' '.join([' '.join(x) for x in text_tokenizer])
    subwords = '<s> ' + bpe.encode(texts_tk) + ' </s>'
    encoded_tx = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()

    text_ids.append(encoded_tx)

  text_ids = pad_sequences(text_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

  text_masks = []
  for tx in text_ids:
    mask = [int(token_id > 0) for token_id in tx]
    text_masks.append(mask)

  if torch.cuda.is_available():
    model.to(device)
  input_ids = torch.tensor(text_ids).to(device)
  input_masks = torch.tensor(text_masks).to(device)

  return input_ids, input_masks

def predict_text(list_string = [], *args):
  start_time = time.time()
  input_ids, input_masks = prepare_text(list_string)
  output = model(input_ids, token_type_ids=None, attention_mask = input_masks)
  print(output)

  print(f'Review text: {review_text}')
  logits = output[0]
  probs = torch.softmax(logits, dim=1).detach().cpu().numpy()
  end_time = time.time()
  print(probs)
  time_pre = end_time - start_time
  print('Duration: {}'.format(end_time - start_time))
  return probs, time_pre


review_text = ["sản phẩm dùng tốt nha mọi người", "sản phẩm xấu và giao hàng chậm", "hàng đẹp nhưng dễ vỡ, giao hàng chậm"]
pre, time_pre = predict_text(review_text)


SequenceClassifierOutput(loss=None, logits=tensor([[ 2.3141, -2.5678],
        [-2.6853,  2.7849],
        [-2.0787,  2.1960]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
Review text: ['sản phẩm dùng tốt nha mọi người', 'sản phẩm xấu và giao hàng chậm', 'hàng đẹp nhưng dễ vỡ, giao hàng chậm']
[[0.992475   0.00752502]
 [0.0041927  0.9958073 ]
 [0.01372508 0.98627496]]
Duration: 0.05650162696838379


In [None]:
!pip install flask-ngrok

In [None]:
from flask import *
from flask_ngrok import run_with_ngrok
import itertools

app = Flask(__name__)
run_with_ngrok(app)
@app.route('/')
def home():
  return 'Hello World'

@app.route('/predict', methods=['POST', 'GET'])
def predict():
  if (request.method == 'POST'):
    output = []
    data = request.form['data']
    data_pred, time_pred = predict_text(data)
    for (score, tx) in zip(data_pred, data):
      if (score[0] > score[1]):
        pre_db = {
              "comment": tx,
              "label": 0
          }
        output.append(pre_db)
      else:
          pre_db = {
              "comment": tx,
              "label": 1
        }
      output.append(pre_db)
    db_pred = {"time_pred": time_pred, "reviews": output}
    return jsonify(db_pred)
  else:
      return jsonify({'message': 'Vui lòng chọn phương thức post...Amen....'})


app.run()