# IDIOM IDENTIFICATION

In [None]:
!pip install -q -U watermark

In [None]:
!pip install -qq transformers

In [11]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

Python implementation: CPython
Python version       : 3.7.12
IPython version      : 5.5.0

numpy       : 1.21.5
pandas      : 1.3.5
torch       : 1.10.0+cu111
transformers: 4.17.0



In [29]:
# Load the CSV of example idiom phrases and preview its first rows.
idiom_example_df = pd.read_csv("/content/idiom-example.csv")
idiom_example_df.head()

Unnamed: 0,idiom_example
0,membanting tulang
1,buah bibir
2,gulung tikar
3,banting tulang
4,jago merah


In [30]:
# Draw five example idioms at random; no seed is passed, so each run differs.
# NOTE(review): the name `random` would shadow the stdlib module if it were imported.
random = idiom_example_df.sample(n = 5)
idiom_example = random['idiom_example'].values
idiom_example

array(['lintah darat', 'bunga tidur', 'tunas bangsa', 'mental baja',
       'gaji buta'], dtype=object)

In [31]:
# Print each sampled idiom on its own line.
for idiom in idiom_example:
  print(idiom)

lintah darat
bunga tidur
tunas bangsa
mental baja
gaji buta


In [116]:
from torch._C import Stream
import torch
import dill
from nltk.tokenize import WordPunctTokenizer
import nltk
import math
import string
import pandas as pd
import re
import transformers
from transformers import BertModel, BertTokenizer
from torch import nn

class TextClassifier(nn.Module):
  """Sentence classifier: pretrained BERT encoder, dropout, then a linear head.

  The attribute names `bert`, `drop` and `out` are part of the saved
  state-dict key names, so they must not be renamed.
  """

  def __init__(self, n_classes, dropout=0.3, pretrained_name='cahya/bert-base-indonesian-522M'):
    """Build the network.

    Args:
      n_classes: number of output classes (size of the logits vector).
      dropout: dropout probability applied to the pooled BERT output.
      pretrained_name: Hugging Face checkpoint to load the encoder from;
        defaults to the checkpoint this notebook was trained with.
    """
    super().__init__()
    self.bert = BertModel.from_pretrained(pretrained_name)
    self.drop = nn.Dropout(p=dropout)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the unnormalised class scores (logits) for a batch.

    Args:
      input_ids: token-id tensor passed to the encoder as `input_ids`.
      attention_mask: mask tensor passed to the encoder as `attention_mask`.
    """
    # With return_dict=False the encoder returns a tuple; element 1 is the
    # pooled output that feeds the classification head.
    encoder_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False
    )
    pooled_output = encoder_outputs[1]
    return self.out(self.drop(pooled_output))



class IdiomIdentification():
  """Pipeline that extracts the idiom phrase, if any, from a sentence.

  `_predict` runs the stages in this order: sentence classification with the
  fine-tuned BERT classifier, POS tagging, chunking into two-word candidate
  phrases, similarity filtering, and validation with the truth-discovery
  model.
  """

  # Characters removed by `preprocessing(remove_punctuation=True)`. Note that
  # '.' and ',' are not in this set. Raw string so the backslash is literal.
  _PUNCTUATION = r'''!()-[]{};:'"\<>/?@#$%^&*_~'''
  _PUNCT_TABLE = str.maketrans('', '', _PUNCTUATION)

  # Single-character tokens that `check_tag` always tags as 'Z'.
  _PUNCT_TOKENS = frozenset(string.punctuation)

  # Month and day names that `check_tag` always tags as 'DATE'.
  _DATE_WORDS = frozenset([
    'Januari', 'Februari', 'Maret', 'April', 'Mei', 'Juni', 'Juli', 'Agustus',
    'September', 'Oktober', 'November', 'Desember',
    'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Agt', 'Sep', 'Okt', 'Nov', 'Des',
    'januari', 'februari', 'maret', 'april', 'mei', 'juni', 'juli', 'agustus',
    'september', 'oktober', 'november', 'desember',
    'Senin', 'Selasa', 'Rabu', 'Kamis', 'Jumat', 'Sabtu', 'Minggu',
  ])

  # One single-rule grammar per candidate tag pattern; `chunking` parses the
  # sentence once with each of them, in this order.
  _CHUNK_GRAMMARS = (
    "CHUNK: {<NN>{2,}}", "CHUNK: {<NN><CD>}", "CHUNK: {<CD><NN>}",
    "CHUNK: {<NNP><NN>}", "CHUNK: {<VB><NN>}", "CHUNK: {<VB><JJ>}",
    "CHUNK: {<VB><CD>}", "CHUNK: {<JJ><NN>}", "CHUNK: {<NN><JJ>}",
    "CHUNK: {<JJ><JJ>}",
  )

  def __init__(self):
    """Load the tokenizer, all four models and the idiom example list from disk.

    NOTE: `torch.load` and `dill.load` unpickle their input, which can execute
    arbitrary code; only point these paths at files you trust.
    """
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.class_names = ['kalimat_biasa', 'kalimat_idiom']
    self.tokenizer = BertTokenizer.from_pretrained('cahya/bert-base-indonesian-522M')

    self.classification_model = TextClassifier(len(self.class_names))
    # map_location lets weights saved on a GPU load on a CPU-only machine.
    state_dict = torch.load('/content/drive/MyDrive/model/classification.bin',
                            map_location=self.device)
    self.classification_model.load_state_dict(state_dict)
    self.classification_model = self.classification_model.to(self.device)
    # A freshly built nn.Module is in training mode; switch to eval so the
    # classifier's dropout layer is disabled and predictions are repeatable.
    self.classification_model.eval()

    self.hmm_tagger_model = self._load_dill('/content/tagger_model.dill')
    self.similarity_model = torch.load('/content/drive/MyDrive/model/word_sim.bin')
    self.truth_discovery_model = self._load_dill('/content/truth_discovery.dill')
    self.idiom_example_df = pd.read_csv("/content/idiom-example.csv")

  @staticmethod
  def _load_dill(path):
    """Deserialize one dill file, closing the file handle afterwards."""
    with open(path, 'rb') as handle:
      return dill.load(handle)

  def preprocessing(self, kalimat, remove_punctuation=False, tokenization=False, lowercase=False):
    """Apply the requested clean-up steps to a sentence.

    Steps run in the order punctuation removal, tokenization, lowercasing.

    Args:
      kalimat: the sentence as a string.
      remove_punctuation: strip the characters in `_PUNCTUATION`, collapse
        whitespace runs to one space and trim both ends.
      tokenization: split into tokens with NLTK's WordPunctTokenizer.
      lowercase: lowercase the text (or every token, if tokenized).

    Returns:
      A string, or a list of token strings when `tokenization` is true.
    """
    if remove_punctuation:
      kalimat = kalimat.translate(self._PUNCT_TABLE)
      # '\s+' (not '/s+') is the whitespace class; removing punctuation can
      # leave doubled spaces behind.
      kalimat = re.sub(r'\s+', ' ', kalimat).strip()

    if tokenization:
      kalimat = WordPunctTokenizer().tokenize(kalimat)

    if lowercase:
      if isinstance(kalimat, str):
        kalimat = kalimat.lower()
      else:
        # Tokenized input is a list, which str.lower cannot handle directly.
        kalimat = [token.lower() for token in kalimat]

    return kalimat

  def idiom_classification(self, kalimat):
    """Classify one sentence and return its label from `class_names`."""
    encoded_text = self.tokenizer.encode_plus(
      kalimat,
      max_length=40,
      add_special_tokens=True,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    input_ids = encoded_text['input_ids'].to(self.device)
    attention_mask = encoded_text['attention_mask'].to(self.device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
      output = self.classification_model(input_ids, attention_mask)
    prediction = torch.argmax(output, dim=1)

    return self.class_names[prediction.item()]

  def hasNumbers(self, inputString):
    """Return True if any character of `inputString` is a digit."""
    return any(char.isdigit() for char in inputString)

  def check_tag(self, word, tag):
    """Apply rule-based corrections to one (word, tag) pair from the tagger.

    The rules run in sequence, so a later rule can override an earlier one.

    Returns:
      The tuple `(word, corrected_tag)`.
    """
    if word in self._DATE_WORDS:
      tag = 'DATE'

    if word in self._PUNCT_TOKENS:
      tag = 'Z'

    # Symbol/number/modal tags on a word that is not all-uppercase, has no
    # digit and does not end in a common suffix are re-tagged as proper noun.
    if (tag in ('SYM', 'Z', 'CD', 'MD') and word.upper() != word
        and not self.hasNumbers(word)
        and word[-3:] not in ('nya', 'kah', 'lah')):
      tag = 'NNP'

    # NOTE(review): this compares the upper-cased first character with the
    # WHOLE word, so it only holds for words of at most one character, and
    # the len(word) == 1 rule below then reverts it. It was presumably meant
    # to be `word[:1].upper() == word[:1]` (capitalised noun -> NNP). Left
    # unchanged because fixing it alters the tagging the results were
    # measured with - confirm the intent before changing.
    if tag == 'NN' and word[:1].upper() == word:
      tag = 'NNP'

    if tag == 'NNP' and word.lower() == word:
      tag = 'NN'

    if tag == 'NNP' and len(word) == 1:
      tag = 'NN'

    if tag == 'FW' and word.lower() == word:
      tag = 'NN'

    return word, tag

  def pos_tagging(self, kalimat):
    """Tokenize a sentence, tag it, and return the corrected (word, tag) list."""
    kalimat_token = self.preprocessing(kalimat, tokenization=True)
    tagging = self.hmm_tagger_model.tag(kalimat_token)
    return [self.check_tag(pair[0], pair[1]) for pair in tagging]

  def chunking(self, kalimat_tagged):
    """Extract candidate two-word phrases from a tagged sentence.

    Every chunk matched by a grammar in `_CHUNK_GRAMMARS` contributes each
    pair of adjacent words as one space-joined phrase, so a chunk of three
    words yields two phrases.

    Returns:
      List of phrase strings, grouped by grammar in `_CHUNK_GRAMMARS` order.
    """
    extract = []
    for rule in self._CHUNK_GRAMMARS:
      tree = nltk.RegexpParser(rule).parse(kalimat_tagged)
      for subtree in tree:
        # Unchunked tokens appear as plain (word, tag) tuples, not as Trees.
        if isinstance(subtree, nltk.tree.Tree) and subtree.label() == 'CHUNK':
          words = [w for w, t in subtree.leaves()]
          extract.extend(' '.join(pair) for pair in nltk.bigrams(words))

    return extract

  def count_score_similarity(self, frasa):
    """Score a two-word phrase with the similarity model.

    The score is the similarity between the phrase's first two words plus
    the similarity between the whole phrase and each of five example idioms
    sampled at random from `idiom_example_df`.

    NOTE(review): the sample is unseeded, so the score varies between calls,
    and it is a sum of six similarities that `similarity` compares against
    0.5 - confirm that threshold is intended for a sum.
    """
    token = frasa.split()
    similarity_score = self.similarity_model.predict(token[0], token[1])[0][0]

    sampled = self.idiom_example_df.sample(n = 5)
    for idiom in sampled['idiom_example'].values:
      similarity_score = similarity_score + self.similarity_model.predict(frasa, idiom)[0][0]

    return similarity_score

  def similarity(self, frasa):
    """Keep the phrases whose similarity score is above 0.5."""
    return [f for f in frasa if self.count_score_similarity(f) > 0.5]

  def validasi(self, frasa):
    """Return the lowercased phrases the truth-discovery model labels 1."""
    frasa_idiom = []
    for f in frasa:
      f = self.preprocessing(f, lowercase=True)
      if self.truth_discovery_model.predict([f])[0] == 1:
        frasa_idiom.append(f)

    return frasa_idiom

  def _predict(self, kalimat):
    """Run the full pipeline on one sentence.

    Returns:
      Tuple `(cleaned_sentence, class_label, phrase)`; `phrase` is the
      string 'none' when the sentence is classified 'kalimat_biasa' or when
      no candidate survives validation, otherwise the first validated phrase.
    """
    kalimat = self.preprocessing(kalimat, remove_punctuation=True)
    klasifikasi = self.idiom_classification(kalimat)

    frasa = 'none'
    if klasifikasi != 'kalimat_biasa':
      postag = self.pos_tagging(kalimat)
      frasa_chunk = self.chunking(postag)
      frasa_pred = self.similarity(frasa_chunk)
      frasa_idiom = self.validasi(frasa_pred)
      if frasa_idiom:
        frasa = frasa_idiom[0]

    return kalimat, klasifikasi, frasa

  def predict(self, X):
    """Run `_predict` on every sentence in `X` and return the list of results."""
    return [self._predict(x) for x in X]

In [74]:
import pandas as pd
# Load the labelled sentence dataset, decoding the file with the
# 'unicode_escape' codec, and preview its first rows.
df = pd.read_csv("/content/idiom-ta-kalimat-dataset.csv", encoding = 'unicode_escape')
df.head()

Unnamed: 0,kalimat,kategori,frasa idiom,validasi,sumber
0,Orang tua itu rela membanting tulang demi meny...,kalimat_idiom,membanting tulang,idiom,https://tirto.id/pengertian-idiom-dalam-bahasa...
1,Rusmi jadi buah bibir setelah menjuarai lomba ...,kalimat_idiom,buah bibir,idiom,https://www.medcom.id/pendidikan/news-pendidik...
2,Gara-gara pandemi covid-19 usaha Doyok harus g...,kalimat_idiom,gulung tikar,idiom,https://www.medcom.id/pendidikan/news-pendidik...
3,Saat pandemi ini Esti harus banting tulang unt...,kalimat_idiom,banting tulang,idiom,https://www.medcom.id/pendidikan/news-pendidik...
4,"Karena pandemi Covid-19, restoran Pak Hilman a...",kalimat_idiom,gulung tikar,idiom,https://xerpihan.id/blog/770/apa-itu-idiom-pen...


In [None]:
# Build the pipeline; the constructor loads every model file from disk.
model = IdiomIdentification()

Evaluasi

In [126]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Evaluate on ten random subsets of increasing size: 200, 400, ..., 2000 rows.
list_n_data = [n * 200 for n in range(1, 11)]

# One entry per subset size, in the same order as list_n_data.
acc = []
precission = []
recall = []
f1score = []

for n_data in list_n_data:

  # Unseeded sample, so the subsets (and the scores) differ between runs.
  data = df.sample(n = n_data)
  X_kalimat = data['kalimat'].values.tolist()
  y_frasa = data['frasa idiom'].values.tolist()
  # Ground truth: 1 when the row is labelled 'idiom', otherwise 0.
  y_validasi = [1 if label == 'idiom' else 0 for label in data['validasi'].values.tolist()]

  predictions = model.predict(X_kalimat)

  # Third element of each prediction tuple is the extracted phrase.
  idiom_predicted = [phrase for _, _, phrase in predictions]
  # A prediction counts as positive only when a phrase was extracted AND it
  # equals the annotated idiom exactly.
  y_predicted = [
    1 if expected == predicted and predicted != 'none' else 0
    for expected, predicted in zip(y_frasa, idiom_predicted)
  ]

  val_acc = accuracy_score(y_validasi, y_predicted)
  acc.append(val_acc)

  val_precission = precision_score(y_validasi, y_predicted)
  precission.append(val_precission)

  val_recall = recall_score(y_validasi, y_predicted)
  recall.append(val_recall)

  val_f1score = f1_score(y_validasi, y_predicted)
  f1score.append(val_f1score)

  print('-' * 10)
  print(f'accuracy {val_acc}')
  print(f'precission {val_precission}')
  print(f'recall {val_recall}')
  print(f'f1score {val_f1score}')


# Unweighted mean of each metric over the ten subset sizes.
mean_acc = sum(acc) / len(acc)
mean_precission = sum(precission) / len(precission)
mean_recall = sum(recall) / len(recall)
mean_f1score = sum(f1score) / len(f1score)

print('-' * 10)
print(f"Mean-Accuracy: {mean_acc}")
print(f"Mean-Precision: {mean_precission}")
print(f"Mean-Recall: {mean_recall}")
print(f"Mean-F1score: {mean_f1score}")

----------
accuracy 0.81
precission 1.0
recall 0.62
f1score 0.7654320987654321
----------
accuracy 0.81
precission 1.0
recall 0.6161616161616161
f1score 0.7625
----------
accuracy 0.7983333333333333
precission 1.0
recall 0.6134185303514377
f1score 0.7603960396039604
----------
accuracy 0.805
precission 1.0
recall 0.6276849642004774
f1score 0.7712609970674487
----------
accuracy 0.806
precission 1.0
recall 0.6088709677419355
f1score 0.7568922305764411
----------
accuracy 0.8125
precission 1.0
recall 0.6179966044142614
f1score 0.763903462749213
----------
accuracy 0.8142857142857143
precission 1.0
recall 0.6198830409356725
f1score 0.7653429602888087
----------
accuracy 0.809375
precission 1.0
recall 0.6158690176322418
f1score 0.7622759158222914
----------
accuracy 0.8133333333333334
precission 1.0
recall 0.6254180602006689
f1score 0.7695473251028807
----------
accuracy 0.8115
precission 1.0
recall 0.623
f1score 0.7677141096734442
----------
Mean-Accuracy: 0.8090327380952381
Mean-Precisio

In [127]:
# Assemble the per-run metrics into one table, indexed by run number 1..10.
indeks = list(range(1, 11))

# One single-column frame per metric, all sharing the same index.
jml_data_df = pd.DataFrame(list_n_data, index=indeks, columns=['Jumlah Data'])
acc_df = pd.DataFrame(acc, index=indeks, columns=['Accuracy'])
precission_df = pd.DataFrame(precission, index=indeks, columns=['Precission'])
recall_df = pd.DataFrame(recall, index=indeks, columns=['Recall'])
f1score_df = pd.DataFrame(f1score, index=indeks, columns=['F1-score'])

# Summary row of means; 'Jumlah Data' gets an empty-string placeholder.
list_mean = [['', mean_acc, mean_precission, mean_recall, mean_f1score]]
mean_df = pd.DataFrame(list_mean,index =['Rata-Rata'], columns =['Jumlah Data','Accuracy','Precission','Recall', 'F1-score'])

# Join the metric columns side by side, then append the summary row.
evaluasi = pd.concat([jml_data_df, acc_df, precission_df, recall_df, f1score_df], axis=1)
evaluasi_fix = pd.concat([evaluasi, mean_df], axis=0)
evaluasi_fix

Unnamed: 0,Jumlah Data,Accuracy,Precission,Recall,F1-score
1,200.0,0.81,1.0,0.62,0.765432
2,400.0,0.81,1.0,0.616162,0.7625
3,600.0,0.798333,1.0,0.613419,0.760396
4,800.0,0.805,1.0,0.627685,0.771261
5,1000.0,0.806,1.0,0.608871,0.756892
6,1200.0,0.8125,1.0,0.617997,0.763903
7,1400.0,0.814286,1.0,0.619883,0.765343
8,1600.0,0.809375,1.0,0.615869,0.762276
9,1800.0,0.813333,1.0,0.625418,0.769547
10,2000.0,0.8115,1.0,0.623,0.767714


In [76]:
# Smoke-test the pipeline on two hand-written sentences.
hasil = model.predict(['Andi dikenal sebagai kutu buku.', 'Andi selalu menjadi anak bawang di kelasnya.'])

In [77]:
# Show the raw (sentence, class, phrase) tuples.
print(hasil)

[('Andi dikenal sebagai kutu buku.', 'kalimat_idiom', 'kutu buku'), ('Andi selalu menjadi anak bawang di kelasnya.', 'kalimat_idiom', 'anak bawang')]


In [None]:
# Keep only the extracted phrase (third tuple element) of each result.
frasa = [hasil[x][2] for x in range(len(hasil))]
frasa

['kutu buku', 'anak bawang']

In [118]:
# Take ten random rows (unseeded) for a step-by-step walk through the evaluation.
df1 = df.sample(n = 10)

In [119]:
# Split the sample into sentences, annotated idiom phrases and idiom labels.
X_kalimat = df1['kalimat'].values.tolist()
y_frasa = df1['frasa idiom'].values.tolist()
y_validasi = df1['validasi'].values.tolist()

In [120]:
# Preview the first five entries of each list.
X_kalimat[:5], y_frasa[:5], y_validasi[:5]

(['Kucing itu tengah melompati pagar.',
  'Pamanku datang dari Surabaya dan membawa buah tangan.',
  'Di zaman digitalisasi ini, banyak sekali kabar burung yang beredar di media sosial.',
  'Adik melihat kereta.',
  'Kaca mata tebalnya menunjukkan bahwa ia kutu buku.\xa0'],
 ['none', 'buah tangan', 'kabar burung', 'none', 'kutu buku'],
 ['bukan_idiom', 'idiom', 'idiom', 'bukan_idiom', 'idiom'])

In [121]:
# Run the pipeline on the sampled sentences.
predictions = model.predict(X_kalimat)

In [122]:
# Preview the first five prediction tuples.
predictions[:5]

[('Kucing itu tengah melompati pagar.', 'kalimat_biasa', 'none'),
 ('Pamanku datang dari Surabaya dan membawa buah tangan.',
  'kalimat_idiom',
  'buah tangan'),
 ('Di zaman digitalisasi ini, banyak sekali kabar burung yang beredar di media sosial.',
  'kalimat_idiom',
  'kabar burung'),
 ('Adik melihat kereta.', 'kalimat_biasa', 'none'),
 ('Kaca mata tebalnya menunjukkan bahwa ia kutu buku.',
  'kalimat_idiom',
  'none')]

In [123]:
# Extracted phrase per sentence ('none' when nothing was extracted).
idiom_predicted = [predictions[x][2] for x in range(len(predictions))]
idiom_predicted[:5]

['none', 'buah tangan', 'kabar burung', 'none', 'none']

In [124]:
# Label a prediction 'idiom' only when a phrase was extracted and it equals
# the annotated phrase exactly; everything else is 'bukan_idiom'.
y_predicted = ['idiom' if y_frasa[i]==idiom_predicted[i] and idiom_predicted[i]!='none'  else 'bukan_idiom' for i in range(len(idiom_predicted))]
y_predicted[:5]

['bukan_idiom', 'idiom', 'idiom', 'bukan_idiom', 'bukan_idiom']

In [None]:
# Convert both label lists to 1 ('idiom') / 0 (anything else) for sklearn.
# NOTE: re-running this cell maps every value to 0, because the lists then
# hold integers rather than the string 'idiom'.
y_validasi = [1 if i=='idiom' else 0 for i in y_validasi]
y_predicted = [1 if i=='idiom' else 0 for i in y_predicted]

y_validasi[:5], y_predicted[:5]

In [84]:
# Report the four metrics for the ten-row sample.
import warnings
# Silences every warning raised from here on.
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# NOTE(review): label_name is not used anywhere in this cell.
label_name = ['bukan_idiom', 'idiom']
print('Accuracy:', accuracy_score(y_validasi, y_predicted))
print('Precision:', precision_score(y_validasi, y_predicted))
print('Recall:', recall_score(y_validasi, y_predicted))
print('F1-score:', f1_score(y_validasi, y_predicted))

Accuracy: 0.8
Precision: 1.0
Recall: 0.8
F1-score: 0.888888888888889


## EXPERIMEN

In [19]:
# Collect the distinct values of the 'frasa idiom' column into a one-column
# frame. Whatever non-idiom marker the column holds is included as a value.
list_idiom_exmp = df['frasa idiom'].unique()
idiom_exmp_df = pd.DataFrame(list_idiom_exmp, columns =['idiom_example'])
idiom_exmp_df

Unnamed: 0,idiom_example
0,membanting tulang
1,buah bibir
2,gulung tikar
3,banting tulang
4,jago merah
...,...
164,tunas bangsa
165,menusuk hati
166,banting setir
167,bunga bangsa


In [20]:
# Write the idiom list to 'idiom-example.csv' without the index column.
idiom_exmp_df.to_csv('idiom-example.csv', index=False)

In [12]:
import torch
import re
import string
from torch import clamp
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity


class TokenSimilarity:
    """Cosine similarity between two texts from mean-pooled transformer embeddings."""

    def load_pretrained(self, from_pretrained: str = "indobenchmark/indobert-base-p1"):
        """Load the tokenizer and encoder for the given checkpoint name."""
        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained)
        self.model = AutoModel.from_pretrained(from_pretrained)

    def __cleaning(self, text: str):
        """Strip ASCII punctuation, collapse whitespace runs and trim the ends."""
        # clear punctuations
        text = text.translate(str.maketrans('', '', string.punctuation))

        # clear multiple spaces ('\s+' is the whitespace class; '/s+' is not)
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def __process(self, first_token: str, second_token: str):
        """Embed both texts and return their mean-pooled vectors as a NumPy array.

        Uses the `max_length`, `truncation` and `padding` values that
        `predict` stored on the instance.
        """
        inputs = self.tokenizer([first_token, second_token],
                                max_length=self.max_length,
                                truncation=self.truncation,
                                padding=self.padding,
                                return_tensors='pt')

        attention = inputs.attention_mask

        # Inference only: do not record operations for autograd.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # First output element is used as the token embeddings; index access
        # works whether the model returns a tuple or an output object.
        embeddings = outputs[0]

        # Expand the attention mask to the embeddings' shape so padded
        # positions contribute zero to the sum below.
        mask = attention.unsqueeze(-1).expand(embeddings.shape).float()
        masked_embeddings = embeddings * mask

        # Mean pooling over dimension 1: sum of unmasked embeddings divided
        # by the number of unmasked positions (clamped to avoid division by 0).
        summed = masked_embeddings.sum(1)
        counts = clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / counts

        # .cpu() keeps the NumPy conversion valid if the model is not on CPU.
        return mean_pooled.detach().cpu().numpy()

    def predict(self, first_token: str, second_token: str,
                return_as_embeddings: bool = False, max_length: int = 16,
                truncation: bool = True, padding: str = "max_length"):
        """Compare two texts.

        Args:
            first_token: first text (a word or a phrase).
            second_token: second text.
            return_as_embeddings: if true, return the two pooled embedding
                vectors instead of their similarity.
            max_length, truncation, padding: forwarded to the tokenizer.

        Returns:
            The pooled embeddings array, or the result of sklearn's
            `cosine_similarity` on the two vectors (callers index it with
            `[0][0]` to get the scalar).
        """
        # Stored on the instance because __process reads them from there.
        self.max_length = max_length
        self.truncation = truncation
        self.padding = padding

        first_token = self.__cleaning(first_token)
        second_token = self.__cleaning(second_token)

        mean_pooled_arr = self.__process(first_token, second_token)
        if return_as_embeddings:
            return mean_pooled_arr

        # calculate similarity
        similarity = cosine_similarity([mean_pooled_arr[0]], [mean_pooled_arr[1]])

        return similarity


# word_sim = torch.load('/content/content/model/word_sim.bin')

In [None]:
# Download the model archive from Google Drive to content/model.zip and unzip it.
from google_drive_downloader import GoogleDriveDownloader as gdd
gdd.download_file_from_google_drive(file_id='1VoS18t95hKd7XvbtddC8jvYmRHI7TXYG',
                                    dest_path='content/model.zip',
                                    unzip=True)

Downloading 1VoS18t95hKd7XvbtddC8jvYmRHI7TXYG into content/model.zip... Done.
Unzipping...



In [None]:
!pip install simpletransformers

In [None]:
!pip install transformers

In [None]:
from torch._C import Stream
import torch
import dill
from nltk.tokenize import WordPunctTokenizer
import nltk
import math
from transformers import BertTokenizer

class IdiomIdentification():
  """Experimental idiom pipeline built from whole-object model files.

  `_predict` classifies the sentence, POS-tags it, chunks it into two-word
  candidates, filters them by word-to-word similarity and validates the
  survivors with the truth-discovery model.
  """

  def __init__(self):
    """Load all four models from disk.

    NOTE: `torch.load` and `dill.load` unpickle their input, which can execute
    arbitrary code; only point these paths at files you trust.
    """
    self.classification_model = torch.load('/content/content/model/classification_model.bin')
    self.hmm_tagger_model = self._load_dill('/content/tagger_model.dill')
    self.similarity_model = torch.load('/content/content/model/word_sim.bin')
    self.truth_discovery_model = self._load_dill('/content/truth_discovery.dill')

  @staticmethod
  def _load_dill(path):
    """Deserialize one dill file, closing the file handle afterwards."""
    with open(path, 'rb') as handle:
      return dill.load(handle)

  def idiom_classification(self, kalimat):
    """Classify one sentence as 'kalimat_biasa' or 'kalimat_idiom'."""
    class_names = ['kalimat_biasa', 'kalimat_idiom']
    predictions = self.classification_model.predict([kalimat])
    return class_names[int(predictions[0])]

  @staticmethod
  def hasNumbers(inputString):
    """Return True if any character of `inputString` is a digit.

    Declared static so that both `self.hasNumbers(word)` (used by
    `check_tag`) and `IdiomIdentification.hasNumbers(word)` work.
    """
    return any(char.isdigit() for char in inputString)

  def check_tag(self, word, tag):
    """Apply rule-based corrections to one (word, tag) pair from the tagger.

    The rules run in sequence, so a later rule can override an earlier one.

    Returns:
      The tuple `(word, corrected_tag)`.
    """
    # Imported here because this cell's import list does not include it.
    import string

    punc = set(string.punctuation)

    dates = {
      'Januari', 'Februari', 'Maret', 'April', 'Mei', 'Juni', 'Juli', 'Agustus',
      'September', 'Oktober', 'November', 'Desember',
      'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Agt', 'Sep', 'Okt', 'Nov', 'Des',
      'januari', 'februari', 'maret', 'april', 'mei', 'juni', 'juli', 'agustus',
      'september', 'oktober', 'november', 'desember',
      'Senin', 'Selasa', 'Rabu', 'Kamis', 'Jumat', 'Sabtu', 'Minggu',
    }

    if word in dates:
      tag = 'DATE'

    if word in punc:
      tag = 'Z'

    # Symbol/number/modal tags on a word that is not all-uppercase, has no
    # digit and does not end in a common suffix are re-tagged as proper noun.
    if (tag in ('SYM', 'Z', 'CD', 'MD') and word.upper() != word
        and not self.hasNumbers(word)
        and word[-3:] not in ('nya', 'kah', 'lah')):
      tag = 'NNP'

    # NOTE(review): this compares the upper-cased first character with the
    # WHOLE word, so it only holds for words of at most one character and is
    # then reverted by the len(word) == 1 rule below. Presumably
    # `word[:1].upper() == word[:1]` was intended - confirm before changing,
    # since it would alter the tagging output.
    if tag == 'NN' and word[:1].upper() == word:
      tag = 'NNP'

    if tag == 'NNP' and word.lower() == word:
      tag = 'NN'

    if tag == 'NNP' and len(word) == 1:
      tag = 'NN'

    if tag == 'FW' and word.lower() == word:
      tag = 'NN'

    return word, tag

  def pos_tagging(self, kalimat):
    """Tokenize a sentence, tag it, and return the corrected (word, tag) list."""
    kalimat_token = WordPunctTokenizer().tokenize(kalimat)
    tagging = self.hmm_tagger_model.tag(kalimat_token)
    return [self.check_tag(pair[0], pair[1]) for pair in tagging]

  def chunking(self, kalimat_tagged):
    """Extract candidate two-word phrases from a tagged sentence.

    The sentence is parsed once per grammar; every matched chunk contributes
    each pair of adjacent words as one space-joined phrase.
    """
    grammar = ["CHUNK: {<NN>{2,}}", "CHUNK: {<NN><CD>}", "CHUNK: {<CD><NN>}", "CHUNK: {<NNP><NN>}", "CHUNK: {<VB><NN>}",
               "CHUNK: {<VB><JJ>}", "CHUNK: {<VB><CD>}", "CHUNK: {<JJ><NN>}", "CHUNK: {<NN><JJ>}", "CHUNK: {<JJ><JJ>}"]

    extract = []
    for rule in grammar:
      tree = nltk.RegexpParser(rule).parse(kalimat_tagged)
      for subtree in tree:
        # Unchunked tokens appear as plain (word, tag) tuples, not as Trees.
        if isinstance(subtree, nltk.tree.Tree) and subtree.label() == 'CHUNK':
          words = [w for w, t in subtree.leaves()]
          extract.extend(' '.join(pair) for pair in nltk.bigrams(words))

    return extract

  def count_score_similarity(self, frasa):
    """Return the similarity between the first two words of a phrase."""
    token = frasa.split()
    return self.similarity_model.predict(token[0], token[1])[0][0]

  def similarity(self, frasa):
    """Keep the phrases whose word-to-word similarity is above 0.5."""
    return [f for f in frasa if self.count_score_similarity(f) > 0.5]

  def validasi(self, frasa):
    """Return the phrases the truth-discovery model labels 1."""
    return [f for f in frasa if self.truth_discovery_model.predict([f])[0] == 1]

  def _predict(self, kalimat):
    """Run the full pipeline on one sentence.

    Returns:
      Tuple `(sentence, class_label, phrase)`; `phrase` is the string 'none'
      when the sentence is classified 'kalimat_biasa' or no candidate
      survives validation, otherwise the first validated phrase.
    """
    klasifikasi = self.idiom_classification(kalimat)

    frasa = 'none'
    if klasifikasi != 'kalimat_biasa':
      postag = self.pos_tagging(kalimat)
      frasa_chunk = self.chunking(postag)
      frasa_pred = self.similarity(frasa_chunk)
      frasa_idiom = self.validasi(frasa_pred)
      if frasa_idiom:
        frasa = frasa_idiom[0]

    return kalimat, klasifikasi, frasa

  def predict(self, X):
    """Run `_predict` on every sentence in `X` and return the list of results."""
    return [self._predict(x) for x in X]

In [None]:
# Experiment: similarity between an idiom phrase and a full sentence.
# NOTE(review): `word_sim` is not assigned in any visible cell (the
# torch.load line that would define it is commented out) - it must already
# exist in the session for this cell to run.
token1 = 'kutu buku'
token2 = 'Anak kutu buku itu menjuarai lomba matematika.'

# Tokenized sentence; only the commented-out per-word variant below uses it.
word_punct_tokenizer = WordPunctTokenizer()
kalimat_token = word_punct_tokenizer.tokenize(token2)

similarity_score1 = 0
# for w in kalimat_token:
#   print(word_sim.predict(token1, w)[0][0])
#   similarity_score1 = similarity_score1 + word_sim.predict(token1, w)[0][0]

similarity_score1 = similarity_score1 + word_sim.predict(token1, token2)[0][0]

# token = token1.split()
# similarity_score1 = similarity_score1 + word_sim.predict(token[0], token[1])[0][0]

similarity_score1


In [None]:
# Load the whole pickled classification model object (not just a state dict).
import torch
classification_model = torch.load('/content/content/model/classification_model.bin')

In [None]:
# Classify one example sentence and map the predicted index to its label.
class_names = ['kalimat_biasa', 'kalimat_idiom']
predictions= classification_model.predict(['Anak kutu buku itu menjuarai lomba matematika.'])
class_names[int(predictions[0])]

In [None]:
# Load the tagger model that was serialized with dill.
import dill
with open('/content/tagger_model.dill', 'rb') as f:
    hmm_tagger = dill.load(f)

In [None]:
# Tokenize an example sentence and tag it with the loaded tagger.
from nltk.tokenize import WordPunctTokenizer
kalimat = "Anak kutu buku itu menjuarai lomba matematika."
word_punct_tokenizer = WordPunctTokenizer()
kalimat_token = word_punct_tokenizer.tokenize(kalimat)
pos_tagging = hmm_tagger.tag(kalimat_token)
pos_tagging

In [None]:
def chunk(kalimat_tagged):
  """Extract candidate two-word phrases from a POS-tagged sentence.

  The sentence is parsed once per grammar below. Every subtree labelled
  'CHUNK' contributes each pair of adjacent words as one space-joined
  phrase, so a three-word chunk yields two phrases.

  Args:
    kalimat_tagged: list of (word, tag) tuples.

  Returns:
    List of phrase strings, grouped by grammar in the order listed.
  """
  grammar = ["CHUNK: {<NN>{2,}}", "CHUNK: {<NN><CD>}", "CHUNK: {<CD><NN>}", "CHUNK: {<NNP><NN>}", "CHUNK: {<VB><NN>}",
               "CHUNK: {<VB><JJ>}", "CHUNK: {<VB><CD>}", "CHUNK: {<JJ><NN>}", "CHUNK: {<NN><JJ>}", "CHUNK: {<JJ><JJ>}"]

  extract = []

  for rule in grammar:
    tree = nltk.RegexpParser(rule).parse(kalimat_tagged)
    for subtree in tree:
      # Unchunked tokens appear as plain (word, tag) tuples, not as Trees.
      if isinstance(subtree, nltk.tree.Tree) and subtree.label() == 'CHUNK':
        words = [w for w, t in subtree.leaves()]
        extract.extend(' '.join(pair) for pair in nltk.bigrams(words))

  return extract

In [None]:
# Chunk the tagged example sentence into candidate phrases.
frasa = chunk(pos_tagging)
frasa

In [None]:
# Apply the tag-correction rules to every (word, tag) pair.
# NOTE(review): `xcheck_tag` is not defined in any visible cell; this raises
# NameError unless it exists elsewhere in the session. It may be a typo for
# the `check_tag` method - confirm.
final_tagged = []
for pt in pos_tagging:
  w,t = xcheck_tag(pt[0], pt[1])
  final_tagged.append((w,t))

final_tagged

In [None]:
# Load the truth-discovery model that was serialized with dill.
import dill
with open('/content/truth_discovery.dill', 'rb') as f:
    td_model = dill.load(f)

In [None]:
# Load the truth-discovery model through a single handle that is closed as
# soon as the object has been read.
# NOTE: dill unpickling can execute arbitrary code; only load trusted files.
with open('/content/truth_discovery.dill', 'rb') as trainer_file:
    trainer_object = dill.load(trainer_file)

In [None]:
# Ask the truth-discovery model for its label of one phrase.
import math
pred = trainer_object.predict(['anak bawang'])[0]
pred

In [None]:
# Similarity between two single words.
token1 = 'meja'
token2 = 'hijau'
similarity_score1 = word_sim.predict(token1, token2)
similarity_score1

In [None]:
# Similarity between a phrase and a sentence that contains it.
token1 = 'anak bawang'
token2 = 'Andi selalu menjadi anak bawang di kelasnya.'
similarity_score1 = word_sim.predict(token1, token2)
similarity_score1

In [None]:
# Similarity between a phrase and one of its own words.
token1 = 'anak bawang'
token2 = 'bawang'
similarity_score1 = word_sim.predict(token1, token2)
similarity_score1
# "Anak kutu buku itu menjuarai lomba matematika."

# Split the phrase into words; as the last expression, this list is what the
# cell displays.
token = token1.split()
token