# Goal 

**Inputs:**

1. Take an input context text 
2. Take input sentence

**Backend:**

1. Convert input(1) to tokens 
2. wikipedia_topic_model(tokens) -> likely_topics
3. wikipedia_topic_model(likely_topics) -> pun_possible_words
4. pun.insert_pun( input(2), pun_possible_words ) -> new sentence

**Output:**

1. Print new sentence



In [1]:
# imports 
import pun
import pickle
from gensim.models.ldamodel import LdaModel as Lda
from gensim import corpora
from gensim.models import doc2vec
from gensim.utils import simple_preprocess

In [2]:
# Load the saved LDA topic model from disk

wiki_topicmodel = Lda.load('models/180924_wikipedia_model.stemmed.individually_binned.200.gensim.')

# Load the saved doc2vec model

wiki_doc2vec = doc2vec.Doc2Vec.load('models/simple_wiki_chunked_doc2vec')

# Load the pickled corpus; it is indexed by document id in sentence_to_doc2vec.
# NOTE(review): pickle.load can execute arbitrary code from the file, so this
# path must only ever point at a trusted, locally produced artefact.

with open('models/simple_wiki_chunked_corpus.p', 'rb') as tounpcik:
    wiki_doc2vec_corpus = pickle.load(tounpcik)

# Dealing with words from topic model

In [29]:
def get_words_from_top_topics(topic_list, model, min_topic_prob=0.1, min_word_prob=0.05, max_terms=100):
    """
    Collect the words of every sufficiently probable topic.

    Topics are visited from most to least probable; for each topic the
    terms returned by ``model.get_topic_terms`` are converted to words
    through ``model.id2word``.

    Parameters:
        topic_list: iterable of ``(topic_id, topic_probability)`` pairs.
            It is not modified.
        model: object providing ``get_topic_terms(topic_id, topn)`` and an
            ``id2word`` mapping from word id to word.
        min_topic_prob: topics below this probability are ignored.
        min_word_prob: terms below this probability are ignored.
        max_terms: maximum number of terms requested per topic.

    Returns:
        A list of words, ordered by topic probability and then by the order
        in which ``get_topic_terms`` returned them. Words shared by several
        topics appear once per topic.
    """

    list_of_words = []

    # sorted() works on any iterable and leaves the caller's data untouched,
    # unlike an in-place list.sort().
    ranked_topics = sorted(topic_list, key=lambda tup: tup[1], reverse=True)

    for topic, topic_prob in ranked_topics:

        # Topics are in descending order, so everything after this is lower too.
        if topic_prob < min_topic_prob:
            break

        # NOTE(review): the early break assumes get_topic_terms returns terms
        # in descending probability order -- confirm against the model used.
        for word_id, word_prob in model.get_topic_terms(topic, max_terms):
            if word_prob < min_word_prob:
                break
            list_of_words.append(model.id2word[word_id])

    return list_of_words

In [41]:
def get_words_from_top_topic(topic_list, model, min_word_prob=0.05, max_terms=100):
    """
    Return the words of the single most probable topic.

    Parameters:
        topic_list: iterable of ``(topic_id, topic_probability)`` pairs.
            It is not modified.
        model: object providing ``get_topic_terms(topic_id, topn)`` and an
            ``id2word`` mapping from word id to word.
        min_word_prob: terms below this probability are ignored.
        max_terms: maximum number of terms requested for the topic.

    Returns:
        A tuple ``(list_of_words, topic_prob)`` where ``topic_prob`` is the
        probability of the selected topic. When ``topic_list`` is empty the
        result is ``([], 0.0)``.
    """

    # max() picks the first pair with the highest probability, the same pair a
    # stable descending sort would put first, without reordering the input.
    best = max(topic_list, key=lambda tup: tup[1], default=None)

    # No topics at all: report "no words" instead of raising IndexError.
    if best is None:
        return [], 0.0

    topic, topic_prob = best
    list_of_words = []

    # NOTE(review): the early break assumes get_topic_terms returns terms
    # in descending probability order -- confirm against the model used.
    for word_id, word_prob in model.get_topic_terms(topic, max_terms):
        if word_prob < min_word_prob:
            break
        list_of_words.append(model.id2word[word_id])

    return list_of_words, topic_prob

In [42]:
# Define the context

# Placeholder value; it is overwritten by the assignment on the next line.
context = 'purr'
context = "Climbing is the activity of using one's hands, feet, or any other part of the body to ascend a steep object. It is done for locomotion, recreation and competition, in trades that rely on it, and in emergency rescue and military operations. It is done indoors and out, on natural and man-made structures."
# context = "The domestic cat (Felis silvestris catus or Felis catus)[1][4] is a small, typically furry, carnivorous mammal. They are often called house cats[5] when kept as indoor pets or simply cats when there is no need to distinguish them from other felids and felines. They are often valued by humans for companionship and for their ability to hunt vermin. There are more than seventy cat breeds recognized by various cat registries."

# Tokenize the context with the project's own tokenizer

context_tokens = pun.tokenize(context)

# Convert the tokens to a bag-of-words using the topic model's dictionary

bag_of_words = wiki_topicmodel.id2word.doc2bow(context_tokens)

In [43]:
def sentence_to_topicmodel_words(sentence, model):
    """
    Map a raw sentence to the words of its most probable topic.

    The sentence is tokenized with ``pun.tokenize``, turned into a
    bag-of-words with the model's dictionary, and scored against the
    model's topics; the result of ``get_words_from_top_topic`` for those
    topics is returned unchanged.
    """
    tokens = pun.tokenize(sentence)
    topics = model.get_document_topics(model.id2word.doc2bow(tokens))
    return get_words_from_top_topic(topics, model)

In [44]:
# Show the top-topic words (and that topic's probability) for the context
sentence_to_topicmodel_words(context, wiki_topicmodel)

(['report', 'news', 'host', 'activ'], 0.11138889)

# Dealing with doc2vec information

In [36]:
def sentence_to_doc2vec(text, model):
    """
    Generator over the corpus documents closest to *text*.

    The text is preprocessed, embedded with the doc2vec model, and the
    most similar stored documents are looked up. For each of them a pair
    ``(set_of_tokens, score)`` is yielded, where the tokens come from the
    matching entry of ``wiki_doc2vec_corpus`` and the score is the value
    reported by ``most_similar``.
    """
    # Embed the preprocessed text in the model's vector space
    tokens = simple_preprocess(text)
    inferred = model.infer_vector(tokens)

    # Look up the stored documents closest to that vector
    neighbours = model.docvecs.most_similar([inferred])

    for doc_id, similarity in neighbours:
        doc_tokens = pun.tokenize(
            wiki_doc2vec_corpus[doc_id],
            stem=False,
            initial_word_split=False,
        )
        yield (set(doc_tokens), similarity)
        
    

In [37]:
# Create the generator; sentence_to_doc2vec does no work until it is iterated
output = sentence_to_doc2vec("Climbing is the activity of using one's hands, feet, or any other part of the body to ascend a steep object. It is done for locomotion, recreation and competition, in trades that rely on it, and in emergency rescue and military operations. It is done indoors and out, on natural and man-made structures.",
                             wiki_doc2vec)

In [38]:
# Pull the first (token set, score) pair from the generator
next(output)

  if np.issubdtype(vec.dtype, np.int):


({'boulder',
  'case',
  'climb',
  'climber',
  'drop',
  'gym',
  'harness',
  'heights',
  'injuries',
  'practice',
  'protect',
  'rock',
  'rope',
  'suffer',
  'type',
  'usually'},
 0.6954768896102905)

# Building how it is all connected

In [None]:
# Context text used to find topical words
context = "Climbing is the activity of using one's hands, feet, or any other part of the body to ascend a steep object. It is done for locomotion, recreation and competition, in trades that rely on it, and in emergency rescue and military operations. It is done indoors and out, on natural and man-made structures."

# Sentence into which a pun is to be inserted
input_sentence = "There are people who can do and people who cant"

In [46]:
# First process context
# doc2vec_word_generator is a lazy generator of (token set, score) pairs
doc2vec_word_generator = sentence_to_doc2vec(context, wiki_doc2vec)
# topic_words is the value returned by get_words_from_top_topic: (list of words, topic probability)
topic_words = sentence_to_topicmodel_words(context, wiki_topicmodel)

# Then try to generate sentences using these metrics



In [45]:
# NOTE(review): `words` is not assigned in any visible cell; presumably the
# word list from topic_words or doc2vec_word_generator was intended -- confirm.
# `input_sentence` must also be defined (by running its cell) before this call.
pun.insert_pun(input_sentence, words)

NameError: name 'input_sentence' is not defined