In [51]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import networkx as nx
import fasttext
import pycrfsuite

In [193]:
from spacy import displacy

# Data custom
data = {
    "words": [
        {"text": "John", "tag": "NNP"},
        {"text": "eats", "tag": "VBZ"},
        {"text": "an", "tag": "DT"},
        {"text": "apple", "tag": "NN"},
    ],
    "arcs": [
        {"start": 0, "end": 1, "label": "nsubj", "dir": "right"},
        {"start": 1, "end": 3, "label": "obj", "dir": "right"},
        {"start": 2, "end": 3, "label": "det", "dir": "left"},
    ]
}

# Visualisasi dependensi
displacy.render(data, style="dep", manual=True, jupyter=True)


In [2]:
class EdgeScorer(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(EdgeScorer, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # Hidden layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)         # Output layer
        self.sigmoid = nn.Sigmoid()                # Output: Probabilitas (0-1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [93]:
def word_embedding(word):
    """
    fungsi ini digunakan untuk mendapatkan vektor embedding dari suatu kata
    """
    return f'{list(emb_model.get_word_vector(word))}'

def word_embedding_ns(word):
    return emb_model.get_word_vector(word)

In [94]:
def find_score(sentence, model):
    sentence_split = sentence.split()
    edge = []
    scores = []

    for index_i, word_i in enumerate(sentence_split):
        for index_j, word_j in enumerate(sentence_split):
            if index_i != index_j:
                emb_i = word_embedding_ns(word_i)
                emb_j = word_embedding_ns(word_j)
                input_vec = torch.tensor(np.concatenate((emb_i, emb_j)), dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():
                    prob = model(input_vec).item()
                edge.append([word_i, word_j])
                scores.append(prob)
    return scores, edge

In [38]:
def edge_score_map(model, sentence):
    # Inisialisasi model
    input_x = []

    scores, edge = find_score(sentence, model)

    for i in range(len(scores)):
        input_x.append((edge[i][0], edge[i][1], scores[i]))

    return input_x

In [59]:
def define_input(sentence):
    input_dim = 200
    hidden_dim = 128
    model = EdgeScorer(input_dim, hidden_dim)
    model.load_state_dict(torch.load('edge_scorer.pth'))
    model.eval()

    input_x = edge_score_map(model, sentence)
    return input_x

In [60]:
def mst_parser(sentence):
    edges = define_input(sentence)

    # Membuat graf terarah
    G = nx.DiGraph()
    for u, v, weight in edges:
        G.add_edge(u, v, weight=weight)

    # Mencari MST menggunakan Chu-Liu/Edmonds
    mst = nx.minimum_spanning_arborescence(G)

    # Menampilkan hasil MST
    # for u, v, weight in mst.edges(data=True):
    #     print(f'{v} -> {u} dengan bobot {weight["weight"]}')
    return mst

In [284]:
[x[0] for x in edge_score]

['ngalih', 'gegaene', 'gegaene', 'gegaene', 'gegaene', 'gegaene']

In [282]:
list(mst_parser(sentence).edges)

  model.load_state_dict(torch.load('edge_scorer.pth'))


[('ngalih', 'gegaene'),
 ('gegaene', 'daya'),
 ('gegaene', 'apanga'),
 ('gegaene', 'aluhan'),
 ('gegaene', 'Nangingke'),
 ('gegaene', 'ia')]

In [285]:
construct_head(sentence, sentence.split())

  model.load_state_dict(torch.load('edge_scorer.pth'))


[['Nangingke', 5],
 ['ia', 5],
 ['ngalih', 0],
 ['daya', 5],
 ['apanga', 5],
 ['gegaene', 2],
 ['aluhan', 5]]

In [61]:
def construct_head(sentence, sentence_split):
    edge_score = list(mst_parser(sentence).edges)
    
    tail = [x[1] for x in edge_score]
    dependence_word = []

    for index, value in enumerate(sentence_split):
        if value in tail:
            head = edge_score[int(tail.index(value))]
            dependence_word.append([value, head[0]])
        else:
            dependence_word.append([value])
            
    head_output = []

    for dependence in dependence_word:
        if len(dependence) == 1:
            head_output.append([dependence[0], 0])
        else:
            head_output.append([dependence[0], sentence_split.index(dependence[1])])
    return head_output

In [62]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
        'index=' + str(i),
        'head=' + str(sent[i][2])
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [83]:
def word2features_pos(sent, i):
    """
    fungsi ini digunakan untuk membentuk feature input pada pos
    sent => list kata yang berisikan [word]
    i => index dari kata saat ini
    model => merupakan model word_embedding yang dipanggil untuk mendapatkan vektor kata
    """
    word = sent[i]
    features = {
        'bias':1.0,
        'word': word,
        'emb_word': word_embedding(word), 
        'is_first' : i == 0,
        'is_last' : i == len(sent)-1,
        'is_title' : word[0].upper() == word[0],
        'isupper' : word.upper() == word,
        'islower' : word.lower() == word,
        'prefix-1' : word[0],
        'prefix-2' : word[:2],
        'suffix-1' : word[-1],
        'suffix-2' : word[-2:],

        'prev_word-1': '' if i == 0 else (sent[i-1]),
        'prev_word-1_prefix-1' : '' if i == 0 else (sent[i-1][0]),
        'prev_word-1_prefix-2' : '' if i == 0 else (sent[i-1][:2]),
        'prev_word-1_suffix-1' : '' if i == 0 else (sent[i-1][-1]),
        'prev_word-1_suffix-2' : '' if i == 0 else (sent[i-1][-2:]),

        'prev_word-2' : (sent[i-2][0]) if i > 1 else '',
        'prev_word-2_prefix-1' : (sent[i-2][0]) if i > 1 else '',
        'prev_word-2_prefix-2' : (sent[i-2][:2]) if i > 1 else '',
        'prev_word-2_suffix-1' : (sent[i-2][-1]) if i > 1 else '',
        'prev_word-2_suffix-2' : (sent[i-2][-2:]) if i > 1 else '',

        'next_word-1' : '' if i == len(sent)-1 else (sent[i+1][0]),
        'next_word-1_prefix-1' : '' if i == len(sent)-1 else (sent[i+1][0]),
        'next_word-1_prefix-2' : '' if i == len(sent)-1 else (sent[i+1][:2]),
        'next_word-1_suffix-1' : '' if i == len(sent)-1 else (sent[i+1][-1]),
        'next_word-1_suffix-2' : '' if i == len(sent)-1 else (sent[i+1][-2:]),

        'next_word-2' : (sent[i+2][0]) if i < len(sent)-2 else '',
        'next_word-2_prefix-1' : (sent[i+2][0]) if i < len(sent)-2 else '',
        'next_word-2_prefix-2' : (sent[i+2][:2]) if i < len(sent)-2 else '',
        'next_word-2_suffix-1' : (sent[i+2][-1]) if i < len(sent)-2 else '',
        'next_word-2_suffix-2' : (sent[i+2][-2:]) if i < len(sent)-2 else '',

        'emb_prev_word-1': '' if i == 0 else word_embedding(sent[i-1][0]),
        'emb_prev_word-1_prefix-2' : '' if i == 0 else word_embedding(sent[i-1][:2]),
        'emb_prev_word-1_suffix-2' : '' if i == 0 else word_embedding(sent[i-1][-2:]),

        'emb_prev_word-2' : word_embedding(sent[i-2][0]) if i > 1 else '',
        'emb_prev_word-2_prefix-2' : word_embedding(sent[i-2][:2]) if i > 1 else '',
        'emb_prev_word-2_suffix-2' : word_embedding(sent[i-2][-2:]) if i > 1 else '',

        'emb_next_word-1' : '' if i == len(sent)-1 else word_embedding(sent[i+1][0]),

        'emb_next_word-1_prefix-2' : '' if i == len(sent)-1 else word_embedding(sent[i+1][:2]),
        'emb_next_word-1_suffix-2' : '' if i == len(sent)-1 else word_embedding(sent[i+1][-2:]),

        'emb_next_word-2' : word_embedding(sent[i+2][0]) if i < len(sent)-2 else '',
        'emb_next_word-2_prefix-2' : word_embedding(sent[i+2][:2]) if i < len(sent)-2 else '',
        'emb_next_word-2_suffix-2' : word_embedding(sent[i+2][-2:]) if i < len(sent)-2 else '',

    }
                
    return features

def sent2features_pos(sent):
    """
    fungsi ini merupakan fungsi untuk memanggil fungsi feature dan mengembalikan feature kata pada kalimat
    """
    return [word2features_pos(sent, i) for i in range(len(sent))]

In [232]:
def make_feature2predict(sentence, pos_sentence):
    feature = []
    pos = pos_sentence
    for index, word in enumerate(sentence.split()):
        feature.append([word, pos[index]])
    return feature

In [233]:
def feature_construct(sentence, pos_sentence):
    head_features = construct_head(sentence, sentence.split())
    features = make_feature2predict(sentence, pos_sentence)
    for index, value in enumerate(features):
        features[index].append(head_features[index][1])
    return features

In [67]:
tagger = pycrfsuite.Tagger()
tagger.open('label_scorer.crfsuite')

tagger_pos = pycrfsuite.Tagger()
tagger_pos.open('person_balinese_pos_2.crfsuite')
emb_model = fasttext.load_model('model_fasttext.bin')

In [180]:
sentence = "Nangingke ia ngalih daya apanga gegaene aluhan"
# output_label = tagger.tag(sent2features(feature_construct(sentence)))
test_feature = sent2features_pos(sentence.split())
pos_sentence = tagger_pos.tag(sent2features_pos(sentence.split()))

output_label = tagger.tag(sent2features(feature_construct(sentence)))
output_label

  model.load_state_dict(torch.load('edge_scorer.pth'))


['nmod', 'subj', 'root', 'obj', 'iobj', 'xcomp', 'obj']

In [272]:
edge_score = list(mst_parser(sentence).edges)
edge_score

  model.load_state_dict(torch.load('edge_scorer.pth'))


[('ngalih', 'gegaene'),
 ('gegaene', 'daya'),
 ('gegaene', 'apanga'),
 ('gegaene', 'aluhan'),
 ('gegaene', 'Nangingke'),
 ('gegaene', 'ia')]

In [240]:
def cek(sentence):
    pos_sentence = tagger_pos.tag(sent2features_pos(sentence.split()))
    output_label = tagger.tag(sent2features(feature_construct(sentence, pos_sentence)))
    head_features = construct_head(sentence, sentence.split())
    word, head = zip(*head_features)
    return list(zip(word, head, output_label))

In [235]:
cek(sents)

  model.load_state_dict(torch.load('edge_scorer.pth'))


[('Pan', 1, 'obj'),
 ('Karsa', 7, 'subj'),
 ('ajaka', 1, 'conj'),
 ('pianakne', 1, 'xcomp'),
 ('muani', 1, 'obj'),
 ('nanggap', 1, 'obj'),
 ('upah', 1, 'subj'),
 ('ngae', 0, 'root'),
 ('semer', 1, 'obj'),
 ('di', 1, 'case'),
 ('sisin', 1, 'obj'),
 ('rurunge', 1, 'nmod'),
 ('gede', 1, 'amod')]

In [213]:
import pandas as pd
data = pd.read_excel('dataset/data_new.xlsx')
df = pd.DataFrame(data)

In [214]:
raw_feature = [[]]

temp_index = 0

for index, value in df.iterrows():
    if index != len(df)-1 and df['sentence_id'][index] != df['sentence_id'][index+1]:
        raw_feature.append([])
        temp_index += 1
    # print(raw_feature)
    if value['word'] != '.':
        raw_feature[temp_index].append([value['word'], value['pos_tag'], value['head'], value['deprel']])


In [237]:
word, _, head, label = zip(*raw_feature[0])
sents = " ".join(word)
cek(sents), raw_feature[0]

  model.load_state_dict(torch.load('edge_scorer.pth'))


([('Pan', 1, 'obj'),
  ('Karsa', 7, 'subj'),
  ('ajaka', 1, 'conj'),
  ('pianakne', 1, 'xcomp'),
  ('muani', 1, 'obj'),
  ('nanggap', 1, 'obj'),
  ('upah', 1, 'subj'),
  ('ngae', 0, 'root'),
  ('semer', 1, 'obj'),
  ('di', 1, 'case'),
  ('sisin', 1, 'obj'),
  ('rurunge', 1, 'nmod'),
  ('gede', 1, 'amod')],
 [['Pan', 'NNP', 2, 'comp'],
  ['Karsa', 'NNP', 6, 'subj'],
  ['ajaka', 'IN', 4, 'case'],
  ['pianakne', 'NN', 6, 'subj'],
  ['muani', 'NN', 4, 'appos'],
  ['nanggap', 'VB', 0, 'root'],
  ['upah', 'NN', 6, 'obj'],
  ['ngae', 'VB', 6, 'xcomp'],
  ['semer', 'NN', 8, 'obj'],
  ['di', 'IN', 11, 'case'],
  ['sisin', 'NN', 12, 'nmod'],
  ['rurunge', 'NN', 8, 'nmod'],
  ['gede', 'JJ', 12, 'amod']])

In [254]:
def parseval(y):
    total = 0
    true_head = 0
    true_label = 0
    true_dependensi = 0

    for index in range(len(y)):
        total += len(y[index])
        word, _, head, label = zip(*y[index])
        sent = " ".join(word)
        y_pred = cek(sent)
        _, head_pred, label_pred = zip(*y_pred)
        for i in range(len(head_pred)):
            if head_pred[i] == head[i]:
                true_head += 1
            if label_pred[i] == label[i]:
                true_label += 1
            if head_pred[i] == head[i] and label_pred[i] == label[i]:
                true_dependensi += 1

    uas = true_head / total
    las = true_label / total
    exact = true_dependensi / total
    return uas, las, exact, true_head, true_label, true_dependensi, total


In [264]:
round(len(raw_feature)*0.8)

85.60000000000001

In [289]:
test_count = round(len(raw_feature)*0.8)
test_feature = raw_feature[test_count:]
uas, las, exact, true_head, true_label, true_dependensi, total = parseval(raw_feature)

  model.load_state_dict(torch.load('edge_scorer.pth'))


In [290]:
len(raw_feature)

107

In [291]:
uas, true_head, total

(0.049403747870528106, 87, 1761)

In [292]:
las, true_label, total

(0.4684838160136286, 825, 1761)

In [293]:
exact, true_dependensi, total

(0.03180011357183418, 56, 1761)

In [277]:
head_features = construct_head(sentence, sentence.split())
word, head = zip(*head_features)
head_features

  model.load_state_dict(torch.load('edge_scorer.pth'))


[['Nangingke', 5],
 ['ia', 5],
 ['ngalih', 0],
 ['daya', 5],
 ['apanga', 5],
 ['gegaene', 2],
 ['aluhan', 5]]

In [183]:
sentence_list = sentence.split()
map_output = list(zip(output_label, sentence_list))
map_output

[('nmod', 'Nangingke'),
 ('subj', 'ia'),
 ('root', 'ngalih'),
 ('obj', 'daya'),
 ('iobj', 'apanga'),
 ('xcomp', 'gegaene'),
 ('obj', 'aluhan')]

In [184]:
def construct_displacy(head_features, edge_label):
    displacy_words = []
    displacy_arcs = []

    words, heads = zip(*head_features)
    labels = edge_label
    for index in range(len(heads)):
        if heads[index] != 0:
            dir = "left" if index > heads[index] else "right"
            displacy_arcs.append({
                "start" : index,
                "end" : heads[index],
                "label" : str(labels[index]),
                "dir" : str(dir)
            })
        displacy_words.append({
                "text" : str(words[index]),
                "tag" : str(pos_sentence[index])
            })
    return {
        "words" : displacy_words,
        "arcs" : displacy_arcs
    }


In [185]:
data_1 = construct_displacy(head_features, output_label)
# displacy.render(data_1, style="dep", manual=True, jupyter=True)
data_1

{'words': [{'text': 'Nangingke', 'tag': 'NN'},
  {'text': 'ia', 'tag': 'PR'},
  {'text': 'ngalih', 'tag': 'VB'},
  {'text': 'daya', 'tag': 'NN'},
  {'text': 'apanga', 'tag': 'Z'},
  {'text': 'gegaene', 'tag': 'VB'},
  {'text': 'aluhan', 'tag': 'NN'}],
 'arcs': [{'start': 0, 'end': 5, 'label': 'nmod', 'dir': 'right'},
  {'start': 1, 'end': 5, 'label': 'subj', 'dir': 'right'},
  {'start': 3, 'end': 5, 'label': 'obj', 'dir': 'right'},
  {'start': 4, 'end': 5, 'label': 'iobj', 'dir': 'right'},
  {'start': 5, 'end': 2, 'label': 'xcomp', 'dir': 'left'},
  {'start': 6, 'end': 5, 'label': 'obj', 'dir': 'left'}]}

In [194]:
from spacy import displacy

data_1 = {
    'words': [
        {'text': 'Nangingke', 'tag': 'NN'},
        {'text': 'ia', 'tag': 'PR'},
        {'text': 'ngalih', 'tag': 'VB'},
        {'text': 'daya', 'tag': 'NN'},
        {'text': 'apanga', 'tag': 'Z'},
        {'text': 'gegaene', 'tag': 'VB'},
        {'text': 'aluhan', 'tag': 'NN'}
    ],
    'arcs': [
        {'start': 0, 'end': 5, 'label': 'nmod', 'dir': 'right'},
        {'start': 1, 'end': 5, 'label': 'subj', 'dir': 'right'},
        {'start': 3, 'end': 5, 'label': 'obj', 'dir': 'right'},
        {'start': 4, 'end': 5, 'label': 'iobj', 'dir': 'right'},
        {'start': 5, 'end': 2, 'label': 'xcomp', 'dir': 'left'},
        {'start': 6, 'end': 5, 'label': 'obj', 'dir': 'left'}
    ]
}

displacy.render(data_1, style="dep", manual=True, jupyter=True)


ValueError: max() arg is an empty sequence

In [192]:
from spacy import displacy

# Data custom
data = {
    "words": [
        {"text": "Apeteng", "tag": "NNP"},
        {"text": "eats", "tag": "VBZ"},
        {"text": "an", "tag": "DT"},
        {"text": "apple", "tag": "NN"},
    ],
    "arcs": [
        {"start": 0, "end": 1, "label": "nsubj", "dir": "right"},
        {"start": 1, "end": 3, "label": "obj", "dir": "right"},
        {"start": 2, "end": 3, "label": "det", "dir": "left"},
    ]
}

# Visualisasi dependensi
displacy.render(data, style="dep", manual=True, jupyter=True)


In [275]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
   ---------------------------------------- 1.1/1.1 MB 2.0 MB/s eta 0:00:00
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
   ---------------------------------------- 586.9/586.9 kB 2.6 MB/s eta 0:00:00
Installing collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.10.1


In [276]:
import stanza
from stanza.utils.visualize import visualize_parse

# Membuat data dalam format Stanza
words = ['Nangingke', 'ia', 'ngalih', 'daya', 'apanga', 'gegaene', 'aluhan']
dependencies = [
    (0, 5, 'nmod'), (1, 5, 'subj'), (3, 5, 'obj'), (4, 5, 'iobj'),
    (5, 2, 'xcomp'), (6, 5, 'obj')
]

# Membuat pipeline Stanza untuk bahasa Indonesia
nlp = stanza.Pipeline('id')

# Membuat doc kosong untuk diisi
doc = stanza.models.common.doc.Document()

# Menambahkan kata-kata ke dalam doc
for word in words:
    doc.sentences.append(stanza.models.common.doc.Sentence([stanza.models.common.doc.Word(word)]))

# Menambahkan dependensi ke dalam doc (menyesuaikan format)
for dep in dependencies:
    doc.sentences[dep[1]].words[0].head = doc.sentences[dep[0]].words[0]
    doc.sentences[dep[1]].words[0].deprel = dep[2]

# Visualisasi
visualize_parse(doc)


ModuleNotFoundError: No module named 'stanza.utils.visualize'