In [1]:
from collections import OrderedDict, defaultdict
import re
from pprint import pprint

In [2]:
# Sample input: topic id -> list of ten "word/TAG" strings.
# organize_words_by_tag() splits each entry on "/" into the word and its tag;
# the tags (NN, JJ, VBI, VBT, NNP, ...) look like part-of-speech labels -- TODO confirm the tag set.
data = {0: ["shelter/NN", "naik/VBI", "turun/VBI", "kampung/NN", "pantai/NN", "marina/NN", "pelangi/NN", "jalan/NN", "simpang/NN", "deket/JJ"],
        1: ["simpang/NN", "arus/NN", "menit/NN", "lintas/NN", "lalu/JJ", "tugu/NN", "muda/JJ", "parkir/NN", "kawasan/NN", "bawah/NN"],
        2: ["semarang/NN", "simpang/NN", "bhayangkara/NN", "lapangan/NN", "jateng/NN", "hut/NN", "daerah/NN", "gue/NN", "hari/NN", "rakyat/NN"],
        3: ["semarang/NN", "canyon/NN", "brown/NN", "simpang/NN", "pak/NN", "melalui/VBT", "lokasi/NN", "kota/NN", "sih/NN", "jasa/NN"],
        4: ["lawang/NN", "sewu/NN", "sama/JJ", "foto/NN", "jadi/JJ", "film/NN", "orang/NN", "nonton/NN", "resa/NN", "depan/NN"],
        5: ["lawang/NN", "sewu/NN", "semarang/NN", "jawa/NN", "tengah/NN", "indonesia/NN", "api/NN", "kereta/NN", "kantor/NN", "ada/VBI"],
        6: ["bawah/NN", "tempat/NN", "lantai/NN", "tanah/NN", "ruang/NN", "utama/JJ", "basement/NN", "air/NN", "guide/NN", "depan/NN"],
        7: ["sewu/NN", "lawang/NN", "dibangun/VBT", "tahun/NN", "bangunan/NN", "belanda/NN", "memiliki/VBT", "ning/NNP", "salah/JJ", "aku/NN"],
        8: ["kota/NN", "koridor/NN", "simpang/NN", "semarang/NN", "sewu/NN", "lawang/NN", "lama/JJ", "sam/NN", "kong/NN", "poo/NN"],
        9: ["simpang/NN", "kediri/NN", "gumul/NN", "kabupaten/NN", "budaya/NN", "pekan/NN", "jalan/NN", "pariwisata/NN", "radio/NN", "monumen/NN"],
        10: ["pecinan/NN", "kota/NN", "rumah/NN", "magelang/NN", "semanis/NN", "jam/NN", "aja/NN", "salah/JJ", "gak/NN", "keep/NN"]
       }

In [3]:
# Candidate expansions for the `_NP` non-terminal. generate_NP() keeps only the
# entries that mention one of the tags it is given, so '_NP _CC _NP' (which names
# no word tag) is never selected there; generate_base_grammar() prepends it itself.
NP = ['_NN', '_NNP', '_NNG', '_NN _DT', '_DT _NNP', '_DT _NNG', '_NN _JJP', '_NNP _JJP', '_NP _CC _NP']
# Candidate expansions for the `_VP` non-terminal, filtered the same way by
# generate_VP(); '_PP' and '_JJP' are likewise added by generate_base_grammar().
VP = ['_VBI _NP', '_VBI _IN _NP', '_VBI _NP _PP', '_VBT', '_VBT _NP', '_PP', '_JJP']

In [4]:
def organize_words_by_tag(dict_words_by_topic):
    '''Group the words of one topic by their tag.

    Args:
        dict_words_by_topic: iterable of "word/TAG" strings. Despite the
            parameter name, create_grammar() passes the list of entries
            belonging to a single topic.

    Returns:
        A plain dict mapping "_TAG" (the tag prefixed with an underscore) to
        the list of words carrying that tag, in the order they were seen.

    Raises:
        IndexError: if an entry contains no "/" separator.
    '''
    words_by_tag = defaultdict(list)
    for entry in dict_words_by_topic:
        # Split on the LAST slash only, so a word that itself contains "/"
        # (e.g. "km/h/NN") keeps its full text and still gets the right tag.
        parts = entry.rsplit('/', 1)
        words_by_tag['_' + parts[1]].append(parts[0])
    return dict(words_by_tag)

In [5]:
def get_story_tag(dict_words_by_tag):
    '''Return the tags (the keys of a tag -> words dictionary) as a list.

    The tags come back in the dictionary's own iteration order.
    '''
    return list(dict_words_by_tag)

In [6]:
def generate_base_grammar(list_tag):
    '''Build the structural (non-lexical) grammar rules for the given tags.

    Args:
        list_tag: collection of "_TAG" strings present in the topic.

    Returns:
        A dict with the keys "_S", "_NP", "_VP" and "_PP" (in that order),
        plus "_JJP" when '_JJ' is among the tags. Each value is a list of
        space-separated expansions.

    When '_JJ' is absent, every expansion mentioning '_JJ' is filtered out of
    the "_NP"/"_VP" rules and no "_JJP" rule is emitted. Verb expansions are
    only looked up when check_VP() reports a verb-like tag.
    '''
    has_adjective = '_JJ' in list_tag

    noun_rules = generate_NP(list_tag)
    verb_rules = generate_VP(list_tag) if check_VP(list_tag) else []
    if not has_adjective:
        noun_rules = remove_JJP(noun_rules)
        verb_rules = remove_JJP(verb_rules)

    # These expansions name no word tag, so the tag-driven lookups above never
    # return them; they are always placed in front by hand.
    vp_fixed = ['_PP', '_JJP'] if has_adjective else ['_PP']

    grammar = {
        "_S": ["_NP _VP"],
        "_NP": ['_NP _CC _NP'] + noun_rules,
        "_VP": vp_fixed + verb_rules,
        "_PP": ["_IN _NP"],
    }
    if has_adjective:
        grammar["_JJP"] = ['_JJ', '_JJ _CC _JJ']
    return grammar
        
def check_VP(list_tag):
    '''Return True when at least one tag contains the substring '_V'.'''
    return any('_V' in tag for tag in list_tag)

def remove_JJP(list_tag):
    '''Return a new list without the entries that contain the substring '_JJ'.

    Order of the remaining entries is preserved; the input is not modified.
    '''
    return [entry for entry in list_tag if '_JJ' not in entry]
    

def generate_NP(list_tag):
    '''Select the `_NP` expansions that mention at least one of the given tags.

    Args:
        list_tag: iterable of "_TAG" strings.

    Returns:
        The matching entries of the module-level NP list, de-duplicated,
        ordered by tag first and by position in NP second.

    A tag matches an expansion only as a whole word (so '_NN' does not match
    '_NNP').
    '''
    selected = []
    for tag in list_tag:
        # re.escape() makes the tag match literally: without it a tag such as
        # '_(' raises re.error and '_NN.' would wrongly match '_NNP'.
        tag_pattern = re.compile(r'\b' + re.escape(tag) + r'\b')
        selected.extend(rule for rule in NP if tag_pattern.search(rule))
    # fromkeys() drops duplicates while keeping first-seen order.
    return list(OrderedDict.fromkeys(selected))

def generate_VP(list_tag):
    '''Select the `_VP` expansions that mention at least one of the given tags.

    Args:
        list_tag: iterable of "_TAG" strings.

    Returns:
        The matching entries of the module-level VP list, de-duplicated,
        ordered by tag first and by position in VP second.

    A tag matches an expansion only as a whole word (so '_JJ' does not match
    '_JJP').
    '''
    selected = []
    for tag in list_tag:
        # re.escape() makes the tag match literally: without it a tag holding
        # regex metacharacters (e.g. '_(') raises re.error or matches wrongly.
        tag_pattern = re.compile(r'\b' + re.escape(tag) + r'\b')
        selected.extend(rule for rule in VP if tag_pattern.search(rule))
    # fromkeys() drops duplicates while keeping first-seen order.
    return list(OrderedDict.fromkeys(selected))

In [7]:
def generate_words_grammar(dict_words_by_tag):
    '''Build the lexical part of the grammar.

    Starts from the fixed word lists for "_IN", "_DT" and "_CC" and then adds
    every tag -> words entry of `dict_words_by_tag`.

    Returns:
        A new dict; the word lists taken from `dict_words_by_tag` are the same
        list objects, not copies.
    '''
    lexicon = {
        "_IN": ['di', 'ke', 'dari'],
        "_DT": ['ini', 'itu'],
        "_CC": ['dan', 'atau'],
    }
    # Applied last: a topic-supplied "_IN"/"_DT"/"_CC" entry replaces the fixed list.
    lexicon.update(dict_words_by_tag)
    return lexicon

In [8]:
def create_grammar(dict_words_by_topic):
    '''Assemble the complete grammar for one topic.

    Args:
        dict_words_by_topic: the "word/TAG" entries of a single topic, as
            accepted by organize_words_by_tag().

    Returns:
        A dict combining the structural rules from generate_base_grammar()
        with the word lists from generate_words_grammar(). Word-list entries
        are applied second, so they win on a key collision.
    '''
    words_by_tag = organize_words_by_tag(dict_words_by_topic)
    tags = get_story_tag(words_by_tag)

    grammar = dict(generate_base_grammar(tags))
    grammar.update(generate_words_grammar(words_by_tag))
    return grammar

In [9]:
import random
def is_terminal(token):
    """Return True when `token` is a terminal, i.e. does not start with "_".

    An empty string counts as a terminal instead of raising IndexError.
    """
    return not token.startswith("_")

In [10]:
def expand(grammar, tokens):
    """Randomly rewrite non-terminals until only terminals remain.

    The leftmost non-terminal is always rewritten first, using one
    random.choice() over its entry in `grammar` per rewrite step.

    Args:
        grammar: dict mapping each non-terminal to a list of replacement
            strings. A replacement that is_terminal() accepts is inserted as
            a single token; any other replacement is split on whitespace into
            several tokens.
        tokens: initial sequence of tokens; it is not modified.

    Returns:
        A new list containing only terminal tokens.

    Raises:
        KeyError: if a non-terminal has no entry in `grammar`.
        IndexError: if a non-terminal's replacement list is empty.
    """
    # Work on a copy so the caller's list is left untouched.
    result = list(tokens)

    # Iterative on purpose: one stack frame per rewrite step would hit
    # Python's recursion limit on long derivations.
    position = 0
    while position < len(result):
        token = result[position]
        if is_terminal(token):
            position += 1
            continue

        replacement = random.choice(grammar[token])
        if is_terminal(replacement):
            result[position] = replacement
            position += 1
        else:
            # Stay on the same position: the first inserted token may itself
            # be a non-terminal that has to be rewritten next.
            result[position:position + 1] = replacement.split()

    return result

In [11]:
def generate_sentence(grammar, start_symbol="_S"):
    """Generate one random derivation from `grammar` as a list of tokens.

    Args:
        grammar: dict of rewrite rules as accepted by expand().
        start_symbol: non-terminal the derivation starts from. Defaults to
            "_S"; pass e.g. "_NP" to generate only a fragment.

    Returns:
        Whatever expand() returns for the single-token list [start_symbol].
    """
    return expand(grammar, [start_symbol])

In [12]:
# Demo: build the grammar for topic 0 and print ten randomly generated sentences,
# each as a list of tokens. No random seed is set in the visible cells, so the
# printed sentences differ from run to run.
grammar = create_grammar(data[0])
for i in range(10):
    print(generate_sentence(grammar))
    # Alternative kept for reference: print each sentence as one space-joined string.
#     print(' '.join(generate_sentence(grammar)))

['jalan', 'deket', 'atau', 'deket', 'atau', 'shelter', 'deket', 'deket']
['simpang', 'turun', 'kampung', 'deket', 'ke', 'simpang', 'deket', 'atau', 'deket']
['marina', 'deket', 'naik', 'pelangi', 'dan', 'kampung', 'itu', 'ke', 'shelter']
['pelangi', 'deket', 'naik', 'shelter', 'deket', 'dari', 'simpang', 'deket', 'atau', 'deket', 'dan', 'shelter', 'itu']
['kampung', 'dan', 'jalan', 'deket', 'atau', 'deket']
['pantai', 'turun', 'pantai', 'ini', 'dari', 'jalan', 'itu']
['pantai', 'itu', 'naik', 'kampung', 'itu', 'ke', 'pelangi', 'itu']
['shelter', 'deket', 'atau', 'shelter', 'atau', 'kampung', 'deket', 'dan', 'deket', 'turun', 'pelangi', 'deket', 'atau', 'deket']
['pantai', 'deket', 'atau', 'deket', 'turun', 'ke', 'pelangi', 'deket', 'dan', 'deket']
['pelangi', 'turun', 'simpang', 'deket', 'atau', 'deket']


In [13]:
def merge_two_dicts(x, y):
    """Return a new dict with the entries of `x` overlaid by those of `y`.

    Neither input is modified. The copy is shallow, so values are shared with
    the inputs; when a key is in both, the value from `y` is used.
    """
    merged = x.copy()
    merged.update(y)
    return merged

In [14]:
def create_sentence(data, n_sentences=10):
    """Generate random sentences for every topic.

    Args:
        data: dict mapping a topic id to that topic's list of "word/TAG"
            entries (the input create_grammar() expects).
        n_sentences: how many sentences to generate per topic (default 10).

    Returns:
        A dict mapping each topic id to a list of `n_sentences` strings, each
        being the generated tokens joined with single spaces.
    """
    sentences_by_topic = {}
    for topic, words in data.items():
        # One grammar per topic, reused for all of that topic's sentences.
        grammar = create_grammar(words)
        # Assign directly; rebuilding the whole result dict for each topic
        # would copy every earlier entry again.
        sentences_by_topic[topic] = [
            ' '.join(generate_sentence(grammar)) for _ in range(n_sentences)
        ]
    return sentences_by_topic

In [15]:
# Generate sentences for every topic in `data`; create_sentence() returns a dict
# keyed by topic id, which the type check below confirms.
a = create_sentence(data)
print(type(a))

<class 'dict'>
