In [1]:
import pandas as pd
import numpy as np
import numpy.random as nr

In [2]:
document1 = {0: [['lawang-sewu/NN', 0.09],
                 ['lantai/NN', 0.07],
                 ['bawah/NN', 0.07],
                 ['tanah/NN', 0.07],
                 ['aula/NN', 0.04],
                 ['pendingin/NN', 0.02],
                 ['air/NN', 0.02],
                 ['memiliki/VBT', 0.02],
                 ['terdapat/VBT', 0.01]],
             1: [['lawang-sewu/NN', 0.08],
                 ['landmark/NN', 0.07],
                 ['semarang/NN', 0.07],
                 ['jalan/NN', 0.03],
                 ['pemuda/NN', 0.03],
                 ['pertempuran/NN', 0.01],
                 ['menjadi/VBT', 0.01],
                 ['lokasi/NN', 0.01],
                 ['hebat/JJ', 0.01]]}

In [3]:
document = document1

# Grammar

In [4]:
import re, random
from collections import OrderedDict, defaultdict

In [5]:
NP = ['_NN', '_NN _JJ', '_NN _SC _JJ', '_NNG', '_NNP', '_NP _PP']
VP = ['_VBT _NN', '_VBT _NN _NN', '_VBT _NN _CC _NN', '_VBT _NP', '_VBI', '_JJ']

In [6]:
def get_list_tag(dict_words_by_tag):
    result = []
    for key in dict_words_by_tag:
        result.append(key)
    return result

In [7]:
def generate_base_grammar(list_tag):
    result = {}
    
    if '_VBT' not in list_tag:
        list_tag.append('_VBT')
    
    S = {"_S": ["_NP _VP"]}
    PP = {"_PP": ["_IN _NP"]}
    
    if '_JJ' in list_tag:
        NP_RULES = generate_NP(list_tag)
        NP = {"_NP": NP_RULES}
        if check_VP(list_tag):
            VP_RULES = ['_PP', '_JJ'] + generate_VP(list_tag)
            VP = {"_VP": VP_RULES}
        else:
            VP = {"_VP": ['_PP', '_JJ']}

        for r in [S, NP, VP, PP]:
            result.update(r)
        return result
    else:
        NP_RULES = remove_JJ(generate_NP(list_tag))
        NP = {"_NP": NP_RULES}
        if check_VP(list_tag):
            VP_RULES = ['_PP'] + remove_JJ(generate_VP(list_tag))
            VP = {"_VP": VP_RULES}
        else:
            VP = {"_VP": ['_PP']}
            
        for r in [S, NP, VP, PP]:
            result.update(r)
        return result

def generate_NP(list_tag):
    result = []
    for tag in list_tag:
        for words in NP:
            if re.search(r'\b' + tag + r'\b', words):
                result.append(words)
    return list(OrderedDict.fromkeys(result))
    
def check_VP(list_tag):
    for tag in list_tag:
        if 'V' in tag:
            return True
    return False

def generate_VP(list_tag):
    result = []
    for tag in list_tag:
        for words in VP:
            if re.search(r'\b' + tag + r'\b', words):
                result.append(words)
    return list(OrderedDict.fromkeys(result))

def remove_JJ(list_tag):
    result = []
    for tag in list_tag:
        if '_JJ' in tag:
            continue
        else:
            result.append(tag)
    return result

In [8]:
def generate_words_grammar(dict_words_by_tag):
    result = {}
    IN = {"_IN": ['di', 'sebagai']}
    CC = {"_CC": ['dan']}
    SC = {"_SC": ['yang']}
    ADD_VBT = {"_VBT": ['memiliki', 'adalah', 'merupakan', 'terdapat', 'yaitu', 'sebagai', 'mempunyai']}
    
    WORDS = dict_words_by_tag
    if '_VBT' in WORDS:
        for word in ADD_VBT["_VBT"]:
            if word not in WORDS['_VBT']:
                WORDS['_VBT'].append(word)
    else:
        WORDS.update(ADD_VBT)
            
    for r in [IN, CC, SC, WORDS]:
        result.update(r)
    return result

In [9]:
def organize_words_by_tag(list_words):
    result = defaultdict(list)
    
    i = []
    for s in list_words:
        word, pwz = s[0], s[1]
        
        wrd = word.split('/')[0]
        tag = word.split('/')[1]
        result['_'+tag].append([wrd, pwz])
    return dict(result)

In [10]:
def split_word_pwz(dict_words_pwz_by_tag):
    dict_word_by_tag = defaultdict(list)
    dict_pwz_by_tag = defaultdict(list)
    
    for key, values in dict_words_pwz_by_tag.items():
        for data in values:
            word, pwz = data[0], data[1]
            dict_word_by_tag[key].append(word)
            dict_pwz_by_tag[key].append(pwz)
    return dict(dict_word_by_tag), dict(dict_pwz_by_tag)

In [11]:
def create_grammar(dict_words_by_topic):
    grammar = {}
    
    dict_words_pwz_by_tag = organize_words_by_tag(dict_words_by_topic)
    list_tag = get_list_tag(dict_words_pwz_by_tag)
    base_grammar = generate_base_grammar(list_tag)
    dict_word_by_tag, dict_pwz_by_tag = split_word_pwz(dict_words_pwz_by_tag)
    words_grammar = generate_words_grammar(dict_word_by_tag)
    
    for r in [base_grammar, words_grammar]:
        grammar.update(r)
    return grammar, dict_pwz_by_tag 

In [12]:
def merge_two_dicts(x, y):
    z = x.copy()
    z.update(y)
    return z

In [13]:
def is_terminal(token):
    return token[0] != "_"

In [14]:
sys_random = random.SystemRandom()

def expand(grammar, tokens, dict_pwz_by_tag):
    print(tokens)
    for i, token in enumerate(tokens):

        # skip over terminals
        if is_terminal(token): continue

        # if we get here, we found a non-terminal token
        # so we need to choose a replacement at random
        replacement = random.choice(grammar[token])
        
        if replacement == '_NN':
            weight = [x/sum(dict_pwz_by_tag['_NN']) for x in dict_pwz_by_tag['_NN']]
            print(weight)
            replacement = nr.choice(grammar['_NN'], p=weight)
            
        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
       
        # now call expand on the new list of tokens
        return expand(grammar, tokens, dict_pwz_by_tag)

    # if we get here we had all terminals and are done
    return tokens

In [15]:
def generate_sentence(grammar, dict_pwz_by_tag):
    return expand(grammar, ["_S"], dict_pwz_by_tag)

In [16]:
def create_sentences_from_data(dict_data):
    result = {}
    for topic, words in dict_data.items():
        sentence = []
        grammar, dict_pwz_by_tag = create_grammar(words)
        print(grammar)
        for s in range(100):
            sentence.append(' '.join(generate_sentence(grammar, dict_pwz_by_tag)))
        result = merge_two_dicts(result, {topic: sentence})
    return result

In [17]:
dict_story = create_sentences_from_data(document)

{'_S': ['_NP _VP'], '_NP': ['_NN'], '_VP': ['_PP', '_VBT _NN', '_VBT _NN _NN', '_VBT _NN _CC _NN', '_VBT _NP'], '_PP': ['_IN _NP'], '_IN': ['di', 'sebagai'], '_CC': ['dan'], '_SC': ['yang'], '_NN': ['lawang-sewu', 'lantai', 'bawah', 'tanah', 'aula', 'pendingin', 'air'], '_VBT': ['memiliki', 'terdapat', 'adalah', 'merupakan', 'yaitu', 'sebagai', 'mempunyai']}
['_S']
['_NP', '_VP']
[0.23684210526315785, 0.18421052631578946, 0.18421052631578946, 0.18421052631578946, 0.10526315789473682, 0.05263157894736841, 0.05263157894736841]
['bawah', '_VP']
['bawah', '_VBT', '_NN', '_NN']
['bawah', 'memiliki', '_NN', '_NN']
['bawah', 'memiliki', 'aula', '_NN']
['bawah', 'memiliki', 'aula', 'lantai']
['_S']
['_NP', '_VP']
[0.23684210526315785, 0.18421052631578946, 0.18421052631578946, 0.18421052631578946, 0.10526315789473682, 0.05263157894736841, 0.05263157894736841]
['lawang-sewu', '_VP']
['lawang-sewu', '_VBT', '_NP']
['lawang-sewu', 'yaitu', '_NP']
[0.23684210526315785, 0.18421052631578946, 0.184210

['jalan', 'yang', '_JJ', '_VP']
['jalan', 'yang', 'hebat', '_VP']
['jalan', 'yang', 'hebat', '_VBT', '_NN', '_NN']
['jalan', 'yang', 'hebat', 'mempunyai', '_NN', '_NN']
['jalan', 'yang', 'hebat', 'mempunyai', 'jalan', '_NN']
['jalan', 'yang', 'hebat', 'mempunyai', 'jalan', 'pertempuran']
['_S']
['_NP', '_VP']
[0.2666666666666666, 0.2333333333333333, 0.2333333333333333, 0.09999999999999998, 0.09999999999999998, 0.033333333333333326, 0.033333333333333326]
['lokasi', '_VP']
['lokasi', '_PP']
['lokasi', '_IN', '_NP']
['lokasi', 'di', '_NP']
['lokasi', 'di', '_NN', '_JJ']
['lokasi', 'di', 'pertempuran', '_JJ']
['lokasi', 'di', 'pertempuran', 'hebat']
['_S']
['_NP', '_VP']
['_NN', '_JJ', '_VP']
['pemuda', '_JJ', '_VP']
['pemuda', 'hebat', '_VP']
['pemuda', 'hebat', '_VBT', '_NN', '_NN']
['pemuda', 'hebat', 'sebagai', '_NN', '_NN']
['pemuda', 'hebat', 'sebagai', 'pemuda', '_NN']
['pemuda', 'hebat', 'sebagai', 'pemuda', 'semarang']
['_S']
['_NP', '_VP']
['_NN', '_SC', '_JJ', '_VP']
['semarang'

['pemuda', '_SC', '_JJ', '_VP']
['pemuda', 'yang', '_JJ', '_VP']
['pemuda', 'yang', 'hebat', '_VP']
['pemuda', 'yang', 'hebat', '_VBT', '_NN', '_NN']
['pemuda', 'yang', 'hebat', 'adalah', '_NN', '_NN']
['pemuda', 'yang', 'hebat', 'adalah', 'jalan', '_NN']
['pemuda', 'yang', 'hebat', 'adalah', 'jalan', 'lawang-sewu']
['_S']
['_NP', '_VP']
['_NN', '_SC', '_JJ', '_VP']
['landmark', '_SC', '_JJ', '_VP']
['landmark', 'yang', '_JJ', '_VP']
['landmark', 'yang', 'hebat', '_VP']
['landmark', 'yang', 'hebat', '_VBT', '_NN', '_NN']
['landmark', 'yang', 'hebat', 'yaitu', '_NN', '_NN']
['landmark', 'yang', 'hebat', 'yaitu', 'landmark', '_NN']
['landmark', 'yang', 'hebat', 'yaitu', 'landmark', 'lawang-sewu']
['_S']
['_NP', '_VP']
[0.2666666666666666, 0.2333333333333333, 0.2333333333333333, 0.09999999999999998, 0.09999999999999998, 0.033333333333333326, 0.033333333333333326]
['landmark', '_VP']
['landmark', '_VBT', '_NN', '_NN']
['landmark', 'mempunyai', '_NN', '_NN']
['landmark', 'mempunyai', 'lawan

In [18]:
dict_story

{0: ['bawah memiliki aula lantai',
  'lawang-sewu yaitu bawah',
  'lantai yaitu tanah dan lawang-sewu',
  'lantai di air',
  'aula di air',
  'bawah di lantai',
  'lawang-sewu mempunyai lantai dan lantai',
  'tanah sebagai lawang-sewu',
  'tanah terdapat tanah dan lantai',
  'lawang-sewu adalah lantai',
  'tanah memiliki bawah dan aula',
  'pendingin merupakan bawah lantai',
  'tanah sebagai bawah',
  'bawah terdapat air dan lawang-sewu',
  'aula merupakan tanah',
  'aula adalah lawang-sewu dan lantai',
  'lawang-sewu terdapat bawah dan bawah',
  'lantai yaitu bawah',
  'tanah terdapat pendingin air',
  'lawang-sewu di lawang-sewu',
  'tanah adalah bawah',
  'tanah di bawah',
  'pendingin merupakan lantai dan air',
  'bawah sebagai tanah',
  'pendingin sebagai aula',
  'bawah yaitu bawah',
  'lantai yaitu tanah dan lantai',
  'lantai terdapat tanah aula',
  'bawah di lawang-sewu',
  'tanah sebagai lantai',
  'lawang-sewu memiliki lantai',
  'lantai mempunyai lantai',
  'bawah mempunyai