In [1]:
import pandas as pd
import numpy as np
import numpy.random as nr

In [2]:
document1 = {0: [['memotret/VBT', 0.07],
                 ['akses/NN', 0.04],
                 ['pintu/NN', 0.03],
                 ['bawah/NN', 0.03],
                 ['tanah/NN', 0.02],
                 ['aji/NN', 0.01],
                 ['fox/NN', 0.01],
                 ['lemas/JJ', 0.01],
                 ['berbarengan/VBT', 0.01]],
             1: [['om/NN', 0.08],
                 ['ditutup/NN', 0.07],
                 ['lihat/VBT', 0.06],
                 ['basement/NN', 0.05],
                 ['berkeliling/VBI', 0.04],
                 ['memotret/VBT', 0.04],
                 ['lantai/NN', 0.03],
                 ['foto/NN', 0.01],
                 ['lelah/JJ', 0.01]]}

In [3]:
document = document1

# Grammar

In [4]:
import re, random
from collections import OrderedDict, defaultdict

In [5]:
NP = ['_NN', '_NNG', '_NNP']
VP = ['_VBT _NN', '_MD _VBT _NN', '_VBT _NN _DT', '_VBT _NN _SC _JJ', '_VBT _NN _JJ', 
      '_RB _VBT _NP', '_VBT _RB _JJ', '_VBT _NN _NN', '_SC _VBT _NP', '_VBI']

In [6]:
def get_list_tag(dict_words_by_tag):
    result = []
    for key in dict_words_by_tag:
        result.append(key)
    return result

In [7]:
def generate_base_grammar(list_tag):
    result = {}
    
    if '_VBT' not in list_tag:
        list_tag.append('_VBT')
    
    if '_VBI' not in list_tag:
        list_tag.append('_VBI')
#     print(list_tag)
    S = {"_S": ["_NP _VP"]}
    PP = {"_PP": ["_IN _NP"]}
    
    if '_JJ' in list_tag:
        NP_RULES = generate_NP(list_tag)
        NP = {"_NP": NP_RULES}
        if check_VP(list_tag):
            VP_RULES = ['_JJ'] + generate_VP(list_tag)
            VP = {"_VP": VP_RULES}
        else:
            VP = {"_VP": ['_JJ', '_PP']}

        for r in [S, NP, VP, PP]:
            result.update(r)
        return result
    else:
        NP_RULES = remove_JJ(generate_NP(list_tag))
        NP = {"_NP": NP_RULES}
        if check_VP(list_tag):
            VP_RULES = remove_JJ(generate_VP(list_tag))
            VP = {"_VP": VP_RULES}
        else:
            VP = {"_VP": ['_PP']}
            
        for r in [S, NP, VP, PP]:
            result.update(r)
        return result

def generate_NP(list_tag):
    result = []
    for tag in list_tag:
        for words in NP:
            if re.search(r'\b' + tag + r'\b', words):
                result.append(words)
    return list(OrderedDict.fromkeys(result))
    
def check_VP(list_tag):
    for tag in list_tag:
        if 'V' in tag:
            return True
    return False

def generate_VP(list_tag):
    result = []
    for tag in list_tag:
        for words in VP:
            if re.search(r'\b' + tag + r'\b', words):
                result.append(words)
    return list(OrderedDict.fromkeys(result))

def remove_JJ(list_tag):
    result = []
    for tag in list_tag:
        if '_JJ' in tag:
            continue
        else:
            result.append(tag)
    return result

In [8]:
def generate_words_grammar(dict_words_by_tag):
    result = {}
    IN = {"_IN": ['di', 'dengan', 'untuk']}
    CC = {"_CC": ['dan', 'mau']}
    SC = {"_SC": ['yang']}
    RB = {"_RB": ['sedang', 'sambil', 'masih', 'lagi']}
    DT = {"_DT": ['itu']}
    MD = {"_MD": ['bisa', 'telah', 'sudah']}
    ADD_VBT = {"_VBT": ['adalah', 'ingin', 'pengin']}
    ADD_VBI = {"_VBI": ['ada', 'suka']}
    
    WORDS = dict_words_by_tag
    if '_VBT' in WORDS:
        for word in ADD_VBT["_VBT"]:
            if word not in WORDS['_VBT']:
                WORDS['_VBT'].append(word)
    else:
        WORDS.update(ADD_VBT)
    
    if '_VBI' in WORDS:
        for word in ADD_VBI["_VBI"]:
            if word not in WORDS['_VBI']:
                WORDS['_VBI'].append(word)
    else:
        WORDS.update(ADD_VBI)  
            
    for r in [IN, CC, SC, RB, DT, MD, WORDS]:
        result.update(r)
    return result

In [9]:
def organize_words_by_tag(list_words):
    result = defaultdict(list)
    
    i = []
    for s in list_words:
        word, pwz = s[0], s[1]
        
        wrd = word.split('/')[0]
        tag = word.split('/')[1]
        result['_'+tag].append([wrd, pwz])
    return dict(result)

In [10]:
def split_word_pwz(dict_words_pwz_by_tag):
    dict_word_by_tag = defaultdict(list)
    dict_pwz_by_tag = defaultdict(list)
    
    for key, values in dict_words_pwz_by_tag.items():
        for data in values:
            word, pwz = data[0], data[1]
            dict_word_by_tag[key].append(word)
            dict_pwz_by_tag[key].append(pwz)
    return dict(dict_word_by_tag), dict(dict_pwz_by_tag)

In [11]:
def create_grammar(dict_words_by_topic):
    grammar = {}
    
    dict_words_pwz_by_tag = organize_words_by_tag(dict_words_by_topic)
    list_tag = get_list_tag(dict_words_pwz_by_tag)
    base_grammar = generate_base_grammar(list_tag)
    dict_word_by_tag, dict_pwz_by_tag = split_word_pwz(dict_words_pwz_by_tag)
    words_grammar = generate_words_grammar(dict_word_by_tag)
    
    for r in [base_grammar, words_grammar]:
        grammar.update(r)
    return grammar, dict_pwz_by_tag 

In [12]:
def merge_two_dicts(x, y):
    z = x.copy()
    z.update(y)
    return z

In [13]:
def is_terminal(token):
    return token[0] != "_"

In [14]:
sys_random = random.SystemRandom()

def expand(grammar, tokens, dict_pwz_by_tag):
#     print(tokens)
    for i, token in enumerate(tokens):

        # skip over terminals
        if is_terminal(token): continue

        # if we get here, we found a non-terminal token
        # so we need to choose a replacement at random
        replacement = sys_random.choice(grammar[token])
        
        if replacement == '_NN':
            weight = [x/sum(dict_pwz_by_tag['_NN']) for x in dict_pwz_by_tag['_NN']]
            replacement = nr.choice(grammar['_NN'], p=weight)
            print(weight)
#             print(grammar['_NN'], weight)
            
        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
       
        # now call expand on the new list of tokens
        return expand(grammar, tokens, dict_pwz_by_tag)

    # if we get here we had all terminals and are done
    return tokens

In [15]:
def generate_sentence(grammar, dict_pwz_by_tag):
    return expand(grammar, ["_S"], dict_pwz_by_tag)

In [16]:
def create_sentences_from_data(dict_data):
    result = {}
    for topic, words in dict_data.items():
        sentence = []
        grammar, dict_pwz_by_tag = create_grammar(words)
        print(grammar)
        for s in range(100):
            sentence.append(' '.join(generate_sentence(grammar, dict_pwz_by_tag)))
        result = merge_two_dicts(result, {topic: sentence})
    return result

In [17]:
dict_story = create_sentences_from_data(document)

{'_S': ['_NP _VP'], '_NP': ['_NN'], '_VP': ['_JJ', '_VBT _NN', '_MD _VBT _NN', '_VBT _NN _DT', '_VBT _NN _SC _JJ', '_VBT _NN _JJ', '_RB _VBT _NP', '_VBT _RB _JJ', '_VBT _NN _NN', '_SC _VBT _NP', '_VBI'], '_PP': ['_IN _NP'], '_IN': ['di', 'dengan', 'untuk'], '_CC': ['dan', 'mau'], '_SC': ['yang'], '_RB': ['sedang', 'sambil', 'masih', 'lagi'], '_DT': ['itu'], '_MD': ['bisa', 'telah', 'sudah'], '_VBT': ['memotret', 'berbarengan', 'adalah', 'ingin', 'pengin'], '_NN': ['akses', 'pintu', 'bawah', 'tanah', 'aji', 'fox'], '_JJ': ['lemas'], '_VBI': ['ada', 'suka']}
[0.2857142857142857, 0.21428571428571425, 0.21428571428571425, 0.14285714285714285, 0.07142857142857142, 0.07142857142857142]
[0.2857142857142857, 0.21428571428571425, 0.21428571428571425, 0.14285714285714285, 0.07142857142857142, 0.07142857142857142]
[0.2857142857142857, 0.21428571428571425, 0.21428571428571425, 0.14285714285714285, 0.07142857142857142, 0.07142857142857142]
[0.2857142857142857, 0.21428571428571425, 0.214285714285714

In [18]:
dict_story

{0: ['akses sudah adalah pintu',
  'akses lemas',
  'aji pengin lagi lemas',
  'akses masih ingin bawah',
  'akses ingin bawah itu',
  'tanah berbarengan aji lemas',
  'pintu adalah sambil lemas',
  'bawah ada',
  'akses ada',
  'bawah memotret masih lemas',
  'tanah lemas',
  'fox berbarengan masih lemas',
  'pintu memotret bawah aji',
  'tanah pengin sedang lemas',
  'pintu sedang ingin pintu',
  'pintu berbarengan aji itu',
  'aji memotret aji lemas',
  'pintu ingin pintu bawah',
  'akses lemas',
  'aji yang adalah pintu',
  'pintu yang ingin akses',
  'akses ingin fox yang lemas',
  'aji suka',
  'akses memotret akses itu',
  'pintu lagi pengin akses',
  'tanah masih adalah pintu',
  'tanah ingin pintu yang lemas',
  'akses sambil pengin aji',
  'tanah lemas',
  'tanah lemas',
  'akses adalah lagi lemas',
  'tanah ada',
  'akses adalah fox lemas',
  'akses bisa berbarengan fox',
  'pintu masih adalah pintu',
  'pintu lemas',
  'aji ingin fox pintu',
  'pintu yang memotret bawah',
 