# IDIOM IDENTIFICATION

# IMPORT LIBRARY

In [None]:
!pip install -q -U watermark

In [None]:
!pip install -qq transformers

In [None]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

## Model Identifikasi

In [None]:
import re
import string
from torch import clamp
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

MODEL_NAME = 'cahya/bert-base-indonesian-522M'

class TokenSimilarity:
    """Cosine similarity between two texts from mean-pooled encoder embeddings.

    The tokenizer and encoder are loaded with ``AutoTokenizer`` /
    ``AutoModel`` from the checkpoint name given to the constructor.
    """

    def __init__(self, from_pretrained:str=MODEL_NAME):
        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained)
        self.model = AutoModel.from_pretrained(from_pretrained)

    def __process(self, first_token:str, second_token:str, max_length:int=40,
                  truncation:bool=True, padding:str="max_length"):
        """Return one mean-pooled embedding per input text as a NumPy array.

        The tokenizer options are passed in explicitly instead of being read
        from attributes that ``predict`` happened to set beforehand.
        """
        # Local import: the surrounding cell only imports `clamp` from torch.
        from torch import no_grad

        inputs = self.tokenizer([first_token, second_token],
                                max_length=max_length,
                                truncation=truncation,
                                padding=padding,
                                return_tensors='pt')

        # Inference only: skip building an autograd graph that would be
        # thrown away by the detach() below anyway.
        with no_grad():
            outputs = self.model(**inputs)

        embeddings = outputs[0]
        # Broadcast the attention mask over the embedding dimension so that
        # padded positions contribute nothing to the sum.
        mask = inputs.attention_mask.unsqueeze(-1).expand(embeddings.shape).float()
        summed = (embeddings * mask).sum(1)
        # Clamp guards against division by zero for an all-padding row.
        counts = clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / counts

        return mean_pooled.detach().numpy()

    def predict(self, first_token:str, second_token:str, max_length:int=40,
                truncation:bool=True, padding:str="max_length"):
        """Return the cosine similarity of the two texts.

        The result is whatever ``cosine_similarity`` returns for two single-row
        inputs; callers in this file read the score with ``[0][0]``.
        """
        # Still recorded on the instance so code that inspects these
        # attributes after a call keeps working; the computation itself no
        # longer depends on them.
        self.max_length = max_length
        self.truncation = truncation
        self.padding = padding

        mean_pooled_arr = self.__process(first_token, second_token,
                                         max_length=max_length,
                                         truncation=truncation,
                                         padding=padding)
        similarity = cosine_similarity([mean_pooled_arr[0]], [mean_pooled_arr[1]])

        return similarity

In [None]:
from torch._C import Stream
import torch
import dill
from nltk.tokenize import WordPunctTokenizer
import nltk
import math
import string
import pandas as pd
import re
import transformers
from transformers import BertModel, BertTokenizer
from torch import nn

MODEL_NAME = 'cahya/bert-base-indonesian-522M'

class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Attribute names (``bert``, ``drop``, ``out``) are part of the saved
    state_dict layout and must not be renamed.
    """

    def __init__(self, n_classes, dropout=0.3):
        """Build the encoder from ``MODEL_NAME`` and a head with ``n_classes`` outputs."""
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=dropout)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(logits, pred)`` where ``pred`` is the arg-max class per row."""
        # return_dict=False makes the encoder return a tuple; only the second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        logits = self.out(self.drop(pooled_output))
        # Softmax is monotonic, so the arg-max of the logits is the arg-max of
        # the class probabilities; no need to normalise first.
        _, pred = torch.max(logits, dim=1)
        return logits, pred


class IdiomIdentification():
    """End-to-end idiom identification pipeline for sentences.

    For each sentence the pipeline (see ``_predict``):
      1. strips punctuation and classifies the sentence as
         ``'kalimat_biasa'`` or ``'kalimat_idiom'``;
      2. for idiom sentences, POS-tags the tokens, chunks candidate
         two-word phrases, filters them by embedding similarity and
         validates them with the truth-discovery model.
    """

    # Characters removed by text_preprocessing(remove_punctuation=True).
    # '.' and ',' are deliberately not in this set (same as before).
    _REMOVABLE_PUNCTUATION = '!()-[]{};:\'"\\<>/?@#$%^&*_~'
    _PUNCTUATION_TABLE = str.maketrans('', '', _REMOVABLE_PUNCTUATION)

    # Single-character tokens that check_tag re-tags as 'Z'.
    _PUNCTUATION_TOKENS = frozenset(string.punctuation)

    # Month and day names that check_tag re-tags as 'DATE'.
    _DATE_WORDS = frozenset([
        'Januari', 'Februari', 'Maret', 'April', 'Mei', 'Juni', 'Juli',
        'Agustus', 'September', 'Oktober', 'November', 'Desember',
        'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Agt', 'Sep', 'Okt',
        'Nov', 'Des',
        'januari', 'februari', 'maret', 'april', 'mei', 'juni', 'juli',
        'agustus', 'september', 'oktober', 'november', 'desember',
        'Senin', 'Selasa', 'Rabu', 'Kamis', 'Jumat', 'Sabtu', 'Minggu',
    ])

    # One single-rule grammar per candidate tag pattern; each is parsed
    # separately in chunking().
    _CHUNK_GRAMMARS = (
        "CHUNK: {<NN>{2,}}", "CHUNK: {<NN><JJ>}",
        "CHUNK: {<NN><VB>}", "CHUNK: {<CD><NN>}",
        "CHUNK: {<VB><VB>}", "CHUNK: {<VB><NN>}",
        "CHUNK: {<VB><JJ>}", "CHUNK: {<VB><CD>}",
        "CHUNK: {<JJ><NN>}", "CHUNK: {<JJ><JJ>}",
    )

    def __init__(self, model_dir='/content/drive/MyDrive/model',
                 idiom_example_path="/content/idiom-example.csv"):
        """Load every model artifact and the idiom example table.

        Args:
            model_dir: directory holding ``classification.bin``,
                ``tagger_model.dill``, ``similarity.bin`` and
                ``truth_discovery.dill``.
            idiom_example_path: CSV file with an ``idiom_example`` column.

        SECURITY: ``torch.load`` and ``dill.load`` unpickle arbitrary
        objects; only point these paths at trusted files.
        """
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.class_names = ['kalimat_biasa', 'kalimat_idiom']
        self.tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

        self.classification_model = BertClassifier(len(self.class_names))
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        self.classification_model.load_state_dict(
            torch.load(f'{model_dir}/classification.bin', map_location=self.device))
        self.classification_model = self.classification_model.to(self.device)
        # Modules start in training mode; without eval() the dropout layer
        # stays active and predictions become non-deterministic.
        self.classification_model.eval()

        with open(f'{model_dir}/tagger_model.dill', 'rb') as tagger_file:
            self.hmm_tagger_model = dill.load(tagger_file)

        # The similarity model feeds CPU tensors from its tokenizer and
        # converts results with .numpy(), so it is loaded onto the CPU.
        # NOTE(review): this file holds a pickled object, not a state_dict;
        # confirm the installed torch.load still accepts that by default.
        self.similarity_model = torch.load(f'{model_dir}/similarity.bin',
                                           map_location='cpu')

        with open(f'{model_dir}/truth_discovery.dill', 'rb') as truth_file:
            self.truth_discovery_model = dill.load(truth_file)

        self.idiom_example_df = pd.read_csv(idiom_example_path)

    def text_preprocessing(self, kalimat, remove_punctuation=False, tokenization=False, lowercase=False):
        """Apply the requested clean-up steps to a sentence.

        Args:
            kalimat: input string.
            remove_punctuation: strip the characters in
                ``_REMOVABLE_PUNCTUATION``, collapse whitespace runs to a
                single space and trim the ends.
            tokenization: return a token list from ``WordPunctTokenizer``
                instead of a string.
            lowercase: lowercase the text.

        Returns:
            A string, or a list of tokens when ``tokenization`` is true.
        """
        if remove_punctuation:
            kalimat = kalimat.translate(self._PUNCTUATION_TABLE)
            kalimat = re.sub(r'\s+', ' ', kalimat).strip()

        # Lowercase while the value is still a string, so combining
        # lowercase=True with tokenization=True no longer raises.
        if lowercase:
            kalimat = kalimat.lower()

        if tokenization:
            kalimat = WordPunctTokenizer().tokenize(kalimat)

        return kalimat

    def idiom_sentence_classification(self, kalimat):
        """Return the class name predicted for a single sentence."""
        encoded_text = self.tokenizer.encode_plus(
            kalimat,
            max_length=40,
            add_special_tokens=True,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoded_text['input_ids'].to(self.device)
        attention_mask = encoded_text['attention_mask'].to(self.device)
        token_type_ids = encoded_text['token_type_ids'].to(self.device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            _, pred = self.classification_model(input_ids, attention_mask, token_type_ids)

        # One sentence in, so `pred` holds exactly one class index.
        return self.class_names[pred.item()]

    def hasNumbers(self, inputString):
        """Return True if any character of ``inputString`` is a digit."""
        return any(char.isdigit() for char in inputString)

    def check_tag(self, word, tag):
        """Post-correct a POS tag with rule-based heuristics.

        Rules are applied in order and later rules see the result of earlier
        ones. Returns the ``(word, tag)`` pair with the corrected tag.
        """
        if word in self._DATE_WORDS:
            tag = 'DATE'

        if word in self._PUNCTUATION_TOKENS:
            tag = 'Z'

        # Symbol/number/modal tags on a word that is not all-caps, has no
        # digits and does not end in a common clitic are treated as names.
        if (tag in ['SYM', 'Z', 'CD', 'MD'] and word.upper() != word
                and not self.hasNumbers(word)
                and word[-3:] not in ['nya', 'kah', 'lah']):
            tag = 'NNP'

        # NOTE(review): this compares the upper-cased first character with
        # the WHOLE word, so it can only fire for words of at most one
        # character, and the two NNP rules below then turn the tag back into
        # 'NN'. The intent was presumably "capitalised noun -> NNP"; left
        # unchanged because fixing it would alter chunking results. Confirm.
        if tag == 'NN' and word[:1].upper() == word:
            tag = 'NNP'

        if tag == 'NNP' and word.lower() == word:
            tag = 'NN'

        if tag == 'NNP' and len(word) == 1:
            tag = 'NN'

        if tag == 'FW' and word.lower() == word:
            tag = 'NN'

        return word, tag

    def pos_tagging(self, kalimat):
        """Tokenize and tag a sentence, returning corrected ``(word, tag)`` pairs."""
        kalimat_token = self.text_preprocessing(kalimat, tokenization=True)
        tagging = self.hmm_tagger_model.tag(kalimat_token)
        return [self.check_tag(pt[0], pt[1]) for pt in tagging]

    def chunking(self, kalimat_tagged):
        """Extract candidate two-word phrases from a tagged sentence.

        Each grammar in ``_CHUNK_GRAMMARS`` is parsed on its own; every
        adjacent word pair inside a matched chunk becomes one candidate, so
        the same phrase may appear more than once.
        """
        frasa_kandidat = []

        for grammar in self._CHUNK_GRAMMARS:
            parsed = nltk.RegexpParser(grammar).parse(kalimat_tagged)

            for chunk in parsed:
                if not (isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'CHUNK'):
                    continue
                words = [w for w, t in chunk.leaves()]
                frasa_kandidat.extend(' '.join(pair) for pair in nltk.bigrams(words))

        return frasa_kandidat

    def count_score_similarity(self, frasa):
        """Return the summed similarity score of a two-word phrase.

        The score is the similarity between the phrase's two words plus the
        similarity between the whole phrase and each of five randomly sampled
        idiom examples, so it varies from call to call.

        Raises:
            IndexError: if ``frasa`` has fewer than two whitespace-separated
                words (candidates from ``chunking`` always have two).
        """
        token = frasa.split()
        similarity_score = self.similarity_model.predict(token[0], token[1])[0][0]

        sampled = self.idiom_example_df.sample(n = 5)

        for idiom in sampled['idiom_example'].values:
            similarity_score = similarity_score + self.similarity_model.predict(frasa, idiom)[0][0]

        return similarity_score

    def similarity(self, frasa):
        """Keep the phrases whose similarity score exceeds 0.5.

        NOTE(review): the score is a sum of six similarities, not a mean;
        confirm the 0.5 threshold is meant for the sum.
        """
        return [f for f in frasa if self.count_score_similarity(f) > 0.5]

    def validasi(self, frasa):
        """Return the lowercased phrases the truth-discovery model labels as 1."""
        frasa_idiom = []
        for f in frasa:
            f = self.text_preprocessing(f, lowercase=True)
            if self.truth_discovery_model.predict([f])[0] == 1:
                frasa_idiom.append(f)

        return frasa_idiom

    def _predict(self, kalimat):
        """Run the pipeline on one sentence.

        Returns:
            ``(cleaned_sentence, class_name, phrase)`` where ``phrase`` is the
            first validated idiom phrase, or ``'none'`` when the sentence is
            classified as ordinary or no phrase survives validation.
        """
        kalimat = self.text_preprocessing(kalimat, remove_punctuation=True)
        klasifikasi = self.idiom_sentence_classification(kalimat)

        frasa = 'none'
        if klasifikasi != 'kalimat_biasa':
            postag = self.pos_tagging(kalimat)
            frasa_chunk = self.chunking(postag)
            frasa_pred = self.similarity(frasa_chunk)
            frasa_idiom = self.validasi(frasa_pred)
            if frasa_idiom:
                frasa = frasa_idiom[0]

        return kalimat, klasifikasi, frasa

    def predict(self, X):
        """Run ``_predict`` on every sentence in ``X`` and return the list of results."""
        return [self._predict(x) for x in X]

In [None]:
import pandas as pd
# Load the labelled sentence dataset used for evaluation below.
# NOTE(review): 'unicode_escape' is an unusual choice for a CSV file —
# confirm the file's real encoding.
df = pd.read_csv("/content/idiom-ta-kalimat-dataset.csv", encoding = 'unicode_escape')
# Notebook display: preview the first rows.
df.head()

Unnamed: 0,kalimat,kategori,frasa idiom,validasi,sumber
0,Orang tua itu rela membanting tulang demi meny...,kalimat_idiom,membanting tulang,idiom,https://tirto.id/pengertian-idiom-dalam-bahasa...
1,Rusmi jadi buah bibir setelah menjuarai lomba ...,kalimat_idiom,buah bibir,idiom,https://www.medcom.id/pendidikan/news-pendidik...
2,Gara-gara pandemi covid-19 usaha Doyok harus g...,kalimat_idiom,gulung tikar,idiom,https://www.medcom.id/pendidikan/news-pendidik...
3,Saat pandemi ini Esti harus banting tulang unt...,kalimat_idiom,banting tulang,idiom,https://www.medcom.id/pendidikan/news-pendidik...
4,"Karena pandemi Covid-19, restoran Pak Hilman a...",kalimat_idiom,gulung tikar,idiom,https://xerpihan.id/blog/770/apa-itu-idiom-pen...


In [None]:
# Build the pipeline; the constructor loads every model artifact from disk.
model = IdiomIdentification()

# PENGUJIAN

In [None]:
import warnings
# Suppress all Python warnings from here on, which keeps the evaluation
# output below free of warning text.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Evaluate the pipeline on random samples of increasing size and collect
# accuracy / precision / recall / F1 for each run.

# Row/column labels for the printed confusion matrix (class 0, class 1).
label = ['bukan_idiom', 'idiom']

# Sample sizes: 200, 400, ..., 2000.
list_n_data = [n * 200 for n in range(1, 11)]

acc = []
precission = []
recall = []
f1score = []

for n_data in list_n_data:

    # A fresh random sample per run, so results vary between executions.
    data = df.sample(n = n_data)
    X_kalimat = data['kalimat'].values.tolist()
    y_frasa = data['frasa idiom'].values.tolist()
    y_validasi = [1 if v == 'idiom' else 0 for v in data['validasi'].values.tolist()]

    predictions = model.predict(X_kalimat)

    # Third element of each prediction tuple is the extracted phrase.
    idiom_predicted = [prediction[2] for prediction in predictions]
    # A prediction counts as positive only when the extracted phrase is not
    # 'none' and equals the gold phrase exactly.
    y_predicted = [1 if truth == guess and guess != 'none' else 0
                   for truth, guess in zip(y_frasa, idiom_predicted)]

    val_acc = accuracy_score(y_validasi, y_predicted)
    acc.append(val_acc)

    val_precission = precision_score(y_validasi, y_predicted)
    precission.append(val_precission)

    val_recall = recall_score(y_validasi, y_predicted)
    recall.append(val_recall)

    val_f1score = f1_score(y_validasi, y_predicted)
    f1score.append(val_f1score)

    print('-' * 10)
    print(f'jumlah data {n_data}')
    print(f'accuracy {val_acc}')
    print(f'precission {val_precission}')
    print(f'recall {val_recall}')
    print(f'f1score {val_f1score}')
    # labels=[0, 1] keeps the matrix 2x2 even if a sample or the predictions
    # contain only one class, so it always matches `label`.
    print(pd.DataFrame(confusion_matrix(y_validasi, y_predicted, labels=[0, 1]),
                       index=label, columns=label))

mean_acc = sum(acc) / len(acc)
mean_precission = sum(precission) / len(precission)
mean_recall = sum(recall) / len(recall)
mean_f1score = sum(f1score) / len(f1score)

print('-' * 10)
print(f"Mean-Accuracy: {mean_acc}")
print(f"Mean-Precision: {mean_precission}")
print(f"Mean-Recall: {mean_recall}")
print(f"Mean-F1score: {mean_f1score}")

----------
jumlah data 200
accuracy 0.835
precission 1.0
recall 0.67
f1score 0.8023952095808384
             bukan_idiom  idiom
bukan_idiom          100      0
idiom                 33     67
----------
jumlah data 400
accuracy 0.8225
precission 1.0
recall 0.6243386243386243
f1score 0.7687296416938111
             bukan_idiom  idiom
bukan_idiom          211      0
idiom                 71    118
----------
jumlah data 600
accuracy 0.8266666666666667
precission 1.0
recall 0.6612377850162866
f1score 0.7960784313725491
             bukan_idiom  idiom
bukan_idiom          293      0
idiom                104    203
----------
jumlah data 800
accuracy 0.82375
precission 1.0
recall 0.6356589147286822
f1score 0.7772511848341233
             bukan_idiom  idiom
bukan_idiom          413      0
idiom                141    246
----------
jumlah data 1000
accuracy 0.822
precission 1.0
recall 0.6576923076923077
f1score 0.7935034802784223
             bukan_idiom  idiom
bukan_idiom          480      0

## Hasil Pengujian

In [None]:
# Assemble the per-run metrics and their means into one summary table.

# One row per evaluation run; derived from the runs actually performed
# instead of repeating the run count as a literal.
indeks = list(range(1, len(list_n_data) + 1))

jml_data_df = pd.DataFrame(list_n_data, index=indeks, columns=['Jumlah Data'])
acc_df = pd.DataFrame(acc, index=indeks, columns=['Accuracy'])
precission_df = pd.DataFrame(precission, index=indeks, columns=['Precission'])
recall_df = pd.DataFrame(recall, index=indeks, columns=['Recall'])
f1score_df = pd.DataFrame(f1score, index=indeks, columns=['F1-score'])

# The mean row has no sample size, hence the empty string in 'Jumlah Data'.
list_mean = [['', mean_acc, mean_precission, mean_recall, mean_f1score]]
mean_df = pd.DataFrame(list_mean,index =['Rata-Rata'], columns =['Jumlah Data','Accuracy','Precission','Recall', 'F1-score'])

# Side by side per metric, then the mean row appended at the bottom.
evaluasi = pd.concat([jml_data_df, acc_df, precission_df, recall_df, f1score_df], axis=1)
evaluasi_fix = pd.concat([evaluasi, mean_df], axis=0)
# Notebook display of the final table.
evaluasi_fix

Unnamed: 0,Jumlah Data,Accuracy,Precission,Recall,F1-score
1,200.0,0.835,1.0,0.67,0.802395
2,400.0,0.8225,1.0,0.624339,0.76873
3,600.0,0.826667,1.0,0.661238,0.796078
4,800.0,0.82375,1.0,0.635659,0.777251
5,1000.0,0.822,1.0,0.657692,0.793503
6,1200.0,0.824167,1.0,0.649502,0.787513
7,1400.0,0.83,1.0,0.661932,0.796581
8,1600.0,0.82375,1.0,0.64794,0.786364
9,1800.0,0.818889,1.0,0.642544,0.782377
10,2000.0,0.821,1.0,0.642,0.781973


## Predict Raw Text

In [None]:
# Run the full pipeline on two raw sentences; each result is a
# (cleaned sentence, class name, phrase) tuple.
hasil = model.predict(['Andi dikenal sebagai kutu buku.', 'Andi selalu menjadi anak bawang di kelasnya.'])

In [None]:
# Show the list of result tuples.
print(hasil)

[('Andi dikenal sebagai kutu buku.', 'kalimat_idiom', 'kutu buku'), ('Andi selalu menjadi anak bawang di kelasnya.', 'kalimat_idiom', 'anak bawang')]
