# Goal 

**Inputs:**

1. Take an input context text 
2. Take input sentence

**Backend:**

1. Convert input(1) to tokens 
2. wikpedia_topic_mode(tokens) -> likely_topics
3. wikpedia_topic_mode(likely_topics) -> pun_possible_words
4. pun.insert_pun( input(2), pun_possible_words ) -> new sentence

**Output:**

1. Print new sentence



In [1]:
# imports 
import pun
import pickle
from gensim.models.ldamodel import LdaModel as Lda
from gensim import corpora
from gensim.models import doc2vec
from gensim.utils import simple_preprocess

In [4]:
# Load the model

wiki_topicmodel = Lda.load('models/180925_wikipedia_model.individually_binned.200.gensim.')

# loading the stemmed_dict

with open('180922_stemmed_dict.p', 'rb') as tounpick:
    stemmed_dict = pickle.load(tounpick)

# Loading the doc2vec

wiki_doc2vec = doc2vec.Doc2Vec.load('models/simple_wiki_chunked_doc2vec_300_vector_10_min_word')

# loading the doc2vec corpus 

with open('models/simple_wiki_chunked_corpus_10_count_cutoff.p', 'rb') as tounpcik:
    wiki_doc2vec_corpus = pickle.load(tounpcik)

# Dealing with words from topic model

In [5]:
def get_words_from_top_topics(topic_list, model, min_topic_prob=0.1, min_word_prob=0.05):
    """
    First finding all of the words
    """
    
    list_of_words = []
    
    topic_list.sort(key=lambda tup: tup[1], reverse=True)
    
    for topic, topic_prob in topic_list:
        
        if topic_prob < min_topic_prob:
            break
        
        for word_id, word_prob in model.get_topic_terms(topic, 100):
            if word_prob < min_word_prob:
                break
            list_of_words.append(model.id2word[word_id])
    
    return list_of_words

In [6]:
def get_words_from_top_topic(topic_list, model, min_word_prob=0.05):
    """
    First finding all of the words
    """
    
    list_of_words = []
    topic_list.sort(key=lambda tup: tup[1], reverse=True)
    
    topic, topic_prob = topic_list[0]
        
    for word_id, word_prob in model.get_topic_terms(topic, 100):
        if word_prob < min_word_prob:
            break
        if model.id2word[word_id] in stemmed_dict:
            for word in stemmed_dict[model.id2word[word_id]]:
                list_of_words.append(word)
        else:
            list_of_words.append(model.id2word[word_id])

    
    return list_of_words, topic_prob

In [7]:
# Define the context

context = 'purr'
context = "Climbing is the activity of using one's hands, feet, or any other part of the body to ascend a steep object. It is done for locomotion, recreation and competition, in trades that rely on it, and in emergency rescue and military operations. It is done indoors and out, on natural and man-made structures."
# context = "The domestic cat (Felis silvestris catus or Felis catus)[1][4] is a small, typically furry, carnivorous mammal. They are often called house cats[5] when kept as indoor pets or simply cats when there is no need to distinguish them from other felids and felines. They are often valued by humans for companionship and for their ability to hunt vermin. There are more than seventy cat breeds recognized by various cat registries."

# Tokenize context 

context_tokens = pun.tokenize(context)

# Convert to bow

bag_of_words = wiki_topicmodel.id2word.doc2bow(context_tokens)

In [8]:
def sentence_to_topicmodel_words(sentence, model):
    context_tokens = pun.tokenize(sentence)
    bag_of_words = model.id2word.doc2bow(context_tokens)
    document_topics = model.get_document_topics(bag_of_words) 
    return get_words_from_top_topic(document_topics, model)

In [9]:
sentence_to_topicmodel_words(context, wiki_topicmodel)

(['wear', 'feet', 'walk'], 0.12638511)

# Dealing with doc2vec information

In [10]:
def sentence_to_doc2vec(text, model):
    """
    Iterator which spits out words that are found in 
    the most 'topical' textual elements
    """
    # parse the sentence 
    text = simple_preprocess(text)
    # Find the respective doc2vec vector
    text_vector = model.infer_vector(text)
    # find the most similar text pieces
    most_similar_documents_with_score = model.docvecs.most_similar([text_vector])
    
    for document_id, cosine_sim_score in most_similar_documents_with_score:
        
        yield (set(pun.tokenize(wiki_doc2vec_corpus[document_id], stem=False, initial_word_split=False)), cosine_sim_score)
        
    

In [11]:
output = sentence_to_doc2vec("Climbing is the activity of using one's hands, feet, or any other part of the body to ascend a steep object. It is done for locomotion, recreation and competition, in trades that rely on it, and in emergency rescue and military operations. It is done indoors and out, on natural and man-made structures.",
                             wiki_doc2vec)

In [15]:
output

<generator object sentence_to_doc2vec at 0x7f62a6ff61a8>

In [12]:
next(output)

  if np.issubdtype(vec.dtype, np.int):


({'examples',
  'harm',
  'include',
  'inhibition',
  'organism',
  'plant',
  'relationship',
  'secretions',
  'shade',
  'taller',
  'wider'},
 0.36731237173080444)

# Builiding how it is all connected

In [13]:
def generate_possible_pun_substitutions(context, input_sentence, w2v_number=4):
    """
    Takes context and input sentence 
    
    returns list of possible substitutions with scores and the topic 
    words consdiered

    """
    
    # First process context
    doc2vec_word_generator = sentence_to_doc2vec(context, wiki_doc2vec)
    topic_words, topic_score = sentence_to_topicmodel_words(context, wiki_topicmodel)

    # Then try to generate sentences using these metrics
    output = []
    topic_words_considered = []
    
    # consider word2vec words
    for i in range(w2v_number):
        words, w2v_score = next(doc2vec_word_generator)

        topic_words_considered.extend([[word, 'doc2vec', i+1, w2v_score] for word in words])

        sub_tuples = pun.enumerate_PD_pun_subs(input_sentence, words)
        # word, sub_index, phonetic_distance

        for word, sub_index, phon_dist in sub_tuples:
            output.append([word, sub_index, phon_dist, 'doc2vec', i+1, w2v_score, phon_dist/w2v_score])

    # Now do topic words
    sub_tuples = pun.enumerate_PD_pun_subs(input_sentence, topic_words)
    for word, sub_index, phon_dist in sub_tuples:

        topic_words_considered.append([word, 'topicModel', 1, topic_score])

        output.append([word, sub_index, phon_dist, 'topicModel', 1, topic_score, phon_dist/topic_score])

    output.sort(key=lambda x: x[6])
    
    return output, topic_words_considered


In [14]:
# context = "Climbing is the activity of using one's hands, feet, or any other part of the body to ascend a steep object. It is done for locomotion, recreation and competition, in trades that rely on it, and in emergency rescue and military operations. It is done indoors and out, on natural and man-made structures."
context = "Prince of the United Kingdom of Great Britain and Northern Ireland is a royal title normally granted to sons and grandsons of reigning and past British monarchs. It is also held by the Duke of Edinburgh, husband and consort of Queen Elizabeth II. The title is granted by the reigning monarch, who is the fount of all honours, through the issuing of letters patent as an expression of the royal will."

input_sentence = "join us at our gym we hope to see you there"

In [16]:
ranked_substitutions, topic_words_considered = generate_possible_pun_substitutions(context, input_sentence)

  if np.issubdtype(vec.dtype, np.int):


In [17]:
for thing in [pun.substitute_pun(input_sentence, x[:3]) for x in ranked_substitutions[:10]]:
    print(thing)

join us at flower gym we hope to see you there
join us at our gym we hope to see use there
join us at our gym we hope to see you wear
join us at our gym we hope to sir you there
join us at our gym see hope to see you there
join us at our gym way hope to see you there
join use at our gym we hope to see you there
join us at our gym queen hope to see you there
join sir at our gym we hope to see you there
join pass at our gym we hope to see you there


In [18]:
ranked_substitutions

[['flower', 3, 2, 'doc2vec', 1, 0.7421815395355225, 2.6947584835533025],
 ['use', 9, 2, 'doc2vec', 1, 0.7421815395355225, 2.6947584835533025],
 ['wear', 10, 2, 'doc2vec', 1, 0.7421815395355225, 2.6947584835533025],
 ['sir', 8, 2, 'doc2vec', 2, 0.7250586152076721, 2.758397677168704],
 ['see', 5, 2, 'doc2vec', 3, 0.7234101295471191, 2.764683432414849],
 ['way', 5, 2, 'doc2vec', 3, 0.7234101295471191, 2.764683432414849],
 ['use', 1, 3, 'doc2vec', 1, 0.7421815395355225, 4.042137725329954],
 ['queen', 5, 3, 'doc2vec', 1, 0.7421815395355225, 4.042137725329954],
 ['sir', 1, 3, 'doc2vec', 2, 0.7250586152076721, 4.1375965157530565],
 ['pass', 1, 3, 'doc2vec', 2, 0.7250586152076721, 4.1375965157530565],
 ['man', 2, 3, 'doc2vec', 2, 0.7250586152076721, 4.1375965157530565],
 ['pass', 2, 3, 'doc2vec', 2, 0.7250586152076721, 4.1375965157530565],
 ['knight', 2, 3, 'doc2vec', 2, 0.7250586152076721, 4.1375965157530565],
 ['see', 1, 3, 'doc2vec', 3, 0.7234101295471191, 4.147025148622274],
 ['pass', 1, 3

In [19]:
topic_words_considered

[['prize', 'doc2vec', 1, 0.7421815395355225],
 ['use', 'doc2vec', 1, 0.7421815395355225],
 ['monarchy', 'doc2vec', 1, 0.7421815395355225],
 ['head', 'doc2vec', 1, 0.7421815395355225],
 ['chain', 'doc2vec', 1, 0.7421815395355225],
 ['choose', 'doc2vec', 1, 0.7421815395355225],
 ['receive', 'doc2vec', 1, 0.7421815395355225],
 ['crown', 'doc2vec', 1, 0.7421815395355225],
 ['queen', 'doc2vec', 1, 0.7421815395355225],
 ['win', 'doc2vec', 1, 0.7421815395355225],
 ['example', 'doc2vec', 1, 0.7421815395355225],
 ['couple', 'doc2vec', 1, 0.7421815395355225],
 ['monarch', 'doc2vec', 1, 0.7421815395355225],
 ['girl', 'doc2vec', 1, 0.7421815395355225],
 ['winners', 'doc2vec', 1, 0.7421815395355225],
 ['flower', 'doc2vec', 1, 0.7421815395355225],
 ['dance', 'doc2vec', 1, 0.7421815395355225],
 ['contest', 'doc2vec', 1, 0.7421815395355225],
 ['pageant', 'doc2vec', 1, 0.7421815395355225],
 ['occasion', 'doc2vec', 1, 0.7421815395355225],
 ['call', 'doc2vec', 1, 0.7421815395355225],
 ['headdress', 'doc2

In [24]:
def convert_topic_words_to_print(topic_words):
    output = [[]]
    topic_number = 1
    for topic in topic_words:
        if topic[1] == 'topicModel':
            return output
        if topic[2] != topic_number:
            topic_number = topic[2]
            output.append([topic[0]])
        output[topic_number-1].append(topic[0])
    
    return output

In [25]:
convert_topic_words_to_print(topic_words_considered)

[['prize',
  'use',
  'monarchy',
  'head',
  'chain',
  'choose',
  'receive',
  'crown',
  'queen',
  'win',
  'example',
  'couple',
  'monarch',
  'girl',
  'winners',
  'flower',
  'dance',
  'contest',
  'pageant',
  'occasion',
  'call',
  'headdress',
  'nothing',
  'symbol',
  'king',
  'wear',
  'part',
  'award',
  'type',
  'daisy',
  'beauty',
  'children'],
 ['chivalry',
  'chivalry',
  'title',
  'rank',
  'members',
  'baronets',
  'sir',
  'barons',
  'kingdom',
  'baronet',
  'order',
  'bachelor',
  'man',
  'pass',
  'children',
  'knight'],
 ['sister',
  'sister',
  'primogeniture',
  'inheritance',
  'monarchy',
  'see',
  'pass',
  'child',
  'cousin',
  'throne',
  'question',
  'oldest',
  'command',
  'law',
  'crown',
  'quit',
  'create',
  'beforehand',
  'monarch',
  'family',
  'time',
  'monarchies',
  'know',
  'monarchs',
  'idea',
  'system',
  'member',
  'past',
  'king',
  'arrange',
  'way',
  'nowadays',
  'methods',
  'die',
  'succession',
  'b

# Implementing TF-IDF

In [18]:
tokenized_corpus = [pun.tokenize(thing, stem=False, initial_word_split=False) for thing in wiki_doc2vec_corpus]

In [16]:
list_of_words = [word[0] for word in topic_words_considered]

In [25]:
pun.pos_tag(['choose'])

[('choose', 'NN')]

1. right topic 
2. Does it sound similar 

implement live validation *********

Conversational corpus 

multi-arm banding 


