In [None]:
##### Using pip install to obtain the necessary modules to use
!pip install --upgrade language_tool_python
!pip install gingerit
!pip install stanfordnlp
!pip install nlpcloud



In [None]:
import numpy
#import gensim
from gensim.models import Word2Vec
from zipfile import ZipFile
from collections import defaultdict
import random
import json
import nltk
import language_tool_python as lang
from gingerit.gingerit import GingerIt
import spacy
import stanfordnlp as stanNLP
nltk.download("averaged_perceptron_tagger")

import gensim.downloader
import nlpcloud

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
##print(gensim.downloader.info()['models'].keys())
anotherModel = gensim.downloader.load("glove-wiki-gigaword-100")  ##downloading a glove model



In [None]:
anotherModel.doesnt_match(['king','horatio','hamlet','macbeth','shakespeare'])  ##shows which word does not belong to a list

'horatio'

In [None]:
!wget "http://cslab.valpo.edu/~mglass/AllShakespeare.txt"  ## downloading the shakespearean text.

--2023-08-07 19:48:37--  http://cslab.valpo.edu/~mglass/AllShakespeare.txt
Resolving cslab.valpo.edu (cslab.valpo.edu)... 152.228.81.102
Connecting to cslab.valpo.edu (cslab.valpo.edu)|152.228.81.102|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5663512 (5.4M) [text/plain]
Saving to: ‘AllShakespeare.txt’


2023-08-07 19:48:38 (10.9 MB/s) - ‘AllShakespeare.txt’ saved [5663512/5663512]



In [None]:
##### Characters stripped from both ends of every token when the corpus is built.
punct = '!"#$%&\'()*+,-./:;<=>?&[\\]~_`{|}~\u2018\u2019\u201c\u201d\u2014'

def nGramsCorpus(file):
  """Read a text file and return it as a list of tokenised lines.

  Each line is lower-cased, every em dash is treated as a word separator, the
  line is split on whitespace and the characters in ``punct`` are stripped from
  both ends of each token.

  Tokens that consist only of punctuation are empty after stripping; they are
  dropped so they cannot be counted as a "word" by the n-gram tables, and a
  line that ends up with no tokens at all is skipped.

  Parameters:
    file: path of the text file to read (decoded as UTF-8; a leading byte-order
      mark is ignored).

  Returns:
    A list with one entry per non-empty line; each entry is a list of tokens.
  """
  listOfLines = list()
  with open(file,encoding='utf-8-sig') as shakespeareFile:
    for line in shakespeareFile:
      # An em dash joins two words without a space, so turn it into whitespace before splitting.
      rawTokens = line.lower().replace("\u2014"," ").split()
      stripped = (token.strip(punct) for token in rawTokens)
      cleaned = [token for token in stripped if token]
      if cleaned:
        listOfLines.append(cleaned)
  return listOfLines

def addSentenceMarkers(textToAddMarkers):
  """Return a copy of the corpus in which every line is wrapped in sentence markers.

  Each line becomes ``["<s>", *tokens, "</s>"]``. New lists are built, so the
  lines passed in are left untouched and calling the function again on the same
  input cannot stack a second pair of markers onto it.

  Parameters:
    textToAddMarkers: iterable of token lists.

  Returns:
    A new list of new token lists, each starting with "<s>" and ending with "</s>".
  """
  return [["<s>", *line, "</s>"] for line in textToAddMarkers]

def unigram_count(text):
  """Count how often each token occurs in the corpus.

  Parameters:
    text: iterable of sentences, each an iterable of tokens.

  Returns:
    A ``defaultdict(int)`` mapping token -> number of occurrences, with keys in
    order of first appearance.
  """
  counts = defaultdict(int)
  for token in (term for sentence in text for term in sentence):
    counts[token] += 1
  return counts

def bigram_count(unigram,text):
  """Count how often each token is directly followed by each other token.

  Parameters:
    unigram: mapping (or iterable) whose keys are all tokens of the corpus; one
      inner table is created per key.
    text: list of sentences, each a list of tokens.

  Returns:
    A dict mapping first token -> ``defaultdict(int)`` of following token -> count.
  """
  followers = {word: defaultdict(int) for word in unigram}
  for sentence in text:
    # Pair every token with its right-hand neighbour.
    for current, following in zip(sentence, sentence[1:]):
      followers[current][following] += 1
  return followers

def trigram_count(unigram,bigram,text):
  """Count how often each pair of adjacent tokens is followed by each third token.

  The table is pre-built with one ``defaultdict(int)`` for every
  (first, second) pair present in ``bigram`` and is then filled from ``text``.

  Parameters:
    unigram: mapping (or iterable) whose keys are all tokens of the corpus.
    bigram: mapping first token -> mapping of second tokens, as produced by bigram_count.
    text: list of sentences, each a list of tokens.

  Returns:
    A dict: first token -> dict: second token -> ``defaultdict(int)``: third token -> count.
  """
  table = {
    first: {second: defaultdict(int) for second in bigram[first]}
    for first in unigram
  }
  for line in text:
    # Slide a three-token window across the line.
    for first, second, third in zip(line, line[1:], line[2:]):
      table[first][second][third] += 1
  return table

def bigram_next_word(unigram,bigram,word): ## based on Shakespeare bigram generator
  """Sample a successor of ``word`` in proportion to the bigram counts.

  A threshold is drawn uniformly from 1..unigram[word]; the successors of
  ``word`` are then walked in table order while their counts are summed, and the
  first successor whose running sum reaches the threshold is returned.

  Parameters:
    unigram: mapping token -> occurrence count.
    bigram: mapping token -> mapping of following token -> count.
    word: the token whose successor is wanted.

  Returns:
    The sampled successor, or None when the summed successor counts never reach
    the threshold.
  """
  threshold = random.randint(1,unigram[word])
  running_total = 0
  for candidate, count in bigram[word].items():
    running_total += count
    if running_total >= threshold:
      return candidate
  return None

def generate_bigram(u,b):
  """Generate one line from the bigram model.

  Starting from "<s>", successors are sampled with bigram_next_word until the
  end marker "</s>" has been appended.

  Parameters:
    u: unigram counts.
    b: bigram counts.

  Returns:
    The generated token list, including the "<s>" and "</s>" markers.
  """
  words = ["<s>"]
  while True:
    words.append(bigram_next_word(u,b,words[-1]))
    if words[-1] == "</s>":
      return words

def trigram_next_word(bigramDict,triDict,word,word2):
  """Sample the token that follows the pair (word, word2) in proportion to the trigram counts.

  A threshold is drawn uniformly from 1..bigramDict[word][word2]; the third-word
  candidates are walked in table order while their counts are summed, and the
  first one whose running sum reaches the threshold is returned.

  Parameters:
    bigramDict: mapping first token -> second token -> count.
    triDict: mapping first token -> second token -> third token -> count.
    word: first token of the context pair.
    word2: second token of the context pair.

  Returns:
    The sampled third token, or None when the summed counts never reach the threshold.
  """
  threshold = random.randint(1,bigramDict[word][word2])
  running_total = 0
  for candidate, count in triDict[word][word2].items():
    running_total += count
    if running_total >= threshold:
      return candidate
  return None

def generate_trigram(unigramDict,bigramDict,triDict):
  """Generate one line from the trigram model.

  The first word after "<s>" is sampled from the bigram table, because no
  two-word context exists yet; every later word is sampled from the trigram
  table using the last two tokens as context.

  The end marker is checked right after the first word is drawn. If that word
  is already "</s>", the line is returned as-is instead of asking the trigram
  table for a successor of ("<s>", "</s>"), which has none.

  Parameters:
    unigramDict: token -> count.
    bigramDict: first token -> second token -> count.
    triDict: first token -> second token -> third token -> count.

  Returns:
    The generated token list, including the "<s>" and "</s>" markers.
  """
  line = ["<s>", bigram_next_word(unigramDict,bigramDict,"<s>")]
  while line[-1] != "</s>":
    line.append(trigram_next_word(bigramDict,triDict,line[-2],line[-1]))
  return line

# Seed the generator so the sampled lines below are repeatable.
random.seed('shakespeare')

# Build the corpus: tokenise the downloaded text, then wrap every line in <s> ... </s>.
nGramCorpus = nGramsCorpus('AllShakespeare.txt')
nGramCorpus = addSentenceMarkers(nGramCorpus)

# Count tables; each one is built from the table(s) before it.
unigram = unigram_count(nGramCorpus)
bigram = bigram_count(unigram, nGramCorpus)
trigram = trigram_count(unigram,bigram,nGramCorpus)

##print(trigram_next_word(bigram,trigram,'means','your'))

# Print 30 lines sampled from the trigram model, then 30 from the bigram model.
print("Trigram")
for i in range(30):
  print(" ".join(generate_trigram(unigram,bigram,trigram)))
print()
print("Bigram")
for i in range(30):
  print(" ".join(generate_bigram(unigram,bigram)))

Trigram
<s> as i am today i th’vein of chivalry </s>
<s> if you be safer </s>
<s> then to thee </s>
<s> how now a wood near athens </s>
<s> lear </s>
<s> therefore to our rose of youth </s>
<s> exeunt </s>
<s> a dangerous law against it </s>
<s> in any case not that their first of manhood stand upright </s>
<s> subdu’d me </s>
<s> great timon noble worthy royal timon </s>
<s> cinna             4d </s>
<s> ever true in me else </s>
<s> what would come against us like the heaven's glorious sun </s>
<s> warwick </s>
<s> that i will proclaim you out </s>
<s> pass with a hovel </s>
<s> jessica </s>
<s> i’ll help it now appears you need not be </s>
<s> tis true </s>
<s> enter the two or three servants </s>
<s> the devil his true tears all bewet </s>
<s> finds bottom in th’uncomprehensive deeps </s>
<s> we laugh say when i told you sir dost know this and dwell upon your brows </s>
<s> i mean of us o tell me john hath reconcil’d </s>
<s> york </s>
<s> so i know thee to be most reverend signior

In [None]:
##punct = '!"#$%&\'()*+,-./:;<=>?&[\\]~_`{|}~\u2018\u2019\u201c\u201d\u2014'

##### This method is commented out since the nGramsCorpus method is also a viable option of creating the corpus with the improved sentences. This method is similar
##### to the previous method nGramsCorpus.
##def nGramsCorpusWithImprovedText(fileWithImprovedText):
  ##temporaryList = list()
  ##listOfSentences = list()
  ##with open(fileWithImprovedText,encoding="UTF-8") as file:
  ##  lineList = file.readlines()
  ##for line in lineList:
  ##  line = line.lower().strip().strip(punct).replace("\u2014","\n").split()
  ##  temporaryList.append(line)
  ##for sentence in temporaryList:
  ##  newSentence = [word.strip(punct) for word in sentence]
  ##  listOfSentences.append(newSentence)
  ##return listOfSentences

def printingTrigrams(unigram,bigram,trigram,iterations):
  """Print ``iterations`` lines generated by the trigram model.

  A generated line of exactly three tokens (the two markers around a single
  word) is discarded and regenerated until a different length comes out.

  Parameters:
    unigram, bigram, trigram: the count tables passed on to generate_trigram.
    iterations: number of lines to print.

  Returns:
    None; each accepted line is printed as a token list.
  """
  for _ in range(iterations):
    candidate = generate_trigram(unigram,bigram,trigram)
    while len(candidate) == 3:
      candidate = generate_trigram(unigram,bigram,trigram)
    print(candidate)

# Build a second set of count tables from "ShakespeareSentences.txt" with the same
# pipeline that was used for the full corpus.
# NOTE(review): this file is not downloaded anywhere in the visible cells - confirm it is provided separately.
improvedShakespeareText = nGramsCorpus("ShakespeareSentences.txt")
improvedShakespeareText = addSentenceMarkers(improvedShakespeareText)
newUnigram = unigram_count(improvedShakespeareText)
newBigram = bigram_count(newUnigram,improvedShakespeareText)
newTrigram = trigram_count(newUnigram, newBigram, improvedShakespeareText)

# Re-seed so the ten sample lines printed below are repeatable.
random.seed('romeo and juliet')
print("###New Trigram#####")
printingTrigrams(newUnigram,newBigram,newTrigram,10)

###New Trigram#####
['<s>', 'she', 'eat', 'no', 'more', '</s>']
['<s>', 'send', 'me', 'to', 'see’t', '</s>']
['<s>', 'he', 'loves', 'caesar', '</s>']
['<s>', 'are', 'you', 'not', 'heard', 'from', 'him', '</s>']
['<s>', 'go', 'to', 'bed', 'after', 'midnight', '</s>']
['<s>', 'camp', 'of', 'the', 'king', 'as', 'i', 'would', 'cudgel', 'him', 'and', 'he', 'beseech’d', 'me', 'to', 'my', 'affection', 'so', 'far', 'deceived', 'in', 'him', 'master', 'of', 'fence-three', 'veneys', 'for', 'a', 'while', 'what', 'will', 'it', 'would', 'be', 'here', 'though', 'thou', 'stand’st', 'a', 'breathing', '</s>']
['<s>', 'heaven', 'that', 'henry’s', 'dead', '</s>']
['<s>', 'henry', 'the', 'fifth', 'attended', 'on', 'by', 'night', 'unseen', 'yet', 'crescive', 'in', 'his', 'monarchy', '</s>']
['<s>', 'therefore', 'another', 'prologue', 'must', 'tell', 'thee', 'daughter', 'for', 'a', 'pad', 'conscience', '</s>']
['<s>', 'i', 'trust', 'shall', 'witness', 'live', 'in', 'your', 'weakness', 'strength', 'unto', 'yo

In [None]:
# Re-seed so the same trigram line is generated on every run.
random.seed("shakespeare")

# Tag the generated line (without its <s>/</s> markers) as a whole sentence ...
sentence = generate_trigram(unigram,bigram,trigram)
print(sentence)
print(nltk.pos_tag(sentence[1:-1]))

# ... and then tag every word on its own, to compare in-context tags with
# context-free tags.
for w in sentence[1:-1]:
  print(nltk.pos_tag([w]))

['<s>', 'as', 'i', 'am', 'today', 'i', 'th’vein', 'of', 'chivalry', '</s>']
[('as', 'IN'), ('i', 'NN'), ('am', 'VBP'), ('today', 'NN'), ('i', 'JJ'), ('th’vein', 'NN'), ('of', 'IN'), ('chivalry', 'NN')]
[('as', 'IN')]
[('i', 'NN')]
[('am', 'VBP')]
[('today', 'NN')]
[('i', 'NN')]
[('th’vein', 'NN')]
[('of', 'IN')]
[('chivalry', 'NN')]


In [None]:
# Train a Word2Vec model on the marked-up corpus; no hyper-parameters are passed,
# so gensim's defaults apply. The <s> and </s> markers are part of the training lines.
corpusModel = gensim.models.Word2Vec(nGramCorpus)

In [None]:
# Build a separate corpus and Word2Vec model from 'sonnets.txt' using the same
# tokenising and marker steps as the full corpus.
# NOTE(review): 'sonnets.txt' is not downloaded in the visible cells - confirm it is provided separately.
sonnetCorpus = nGramsCorpus('sonnets.txt')
sonnetCorpus = addSentenceMarkers(sonnetCorpus)
sonnetModel = gensim.models.Word2Vec(sonnetCorpus)

# Show the words the sonnet model considers closest to 'rose'.
sonnetModel.wv.most_similar('rose')

[('whose', 0.9954397082328796),
 ('she', 0.9953383803367615),
 ('my', 0.9952844977378845),
 ('black', 0.9951425194740295),
 ('their', 0.9951171278953552),
 ('so', 0.9951009154319763),
 ('one', 0.9950540661811829),
 ('before', 0.9949631690979004),
 ('no', 0.9949607253074646),
 ('in', 0.9949582815170288)]

In [None]:
random.seed('shakespeare')

def generatePossibleWords(sent,W2Vmodel):
  """Pick a random word of ``sent`` and list similar words that share its part of speech.

  A position between the first and last element of ``sent`` (both excluded, so
  the sentence markers are never chosen) is drawn at random. The word there is
  looked up in the embedding model, its 15 nearest neighbours are fetched, and
  every neighbour whose NLTK tag (tagged on its own) equals the target word's
  tag (tagged inside ``sent``) is kept.

  Parameters:
    sent: token list with at least three elements; ``random.randint`` raises
      ValueError for shorter input.
    W2Vmodel: either a trained ``Word2Vec`` model (vectors under ``.wv``) or a
      ``KeyedVectors`` object such as a downloaded pre-trained model.

  Returns:
    A 5-tuple ``(list_words, target_w, target_POS, randomInt, likely_replace)``.
    When the model cannot supply neighbours for the word, ``likely_replace`` is
    the placeholder ``[('Error', 0.0)]`` and ``list_words`` is empty.
  """
  randomInt = random.randint(1,len(sent)-2)
  target_POS = nltk.pos_tag(sent)[randomInt][1]
  target_w = sent[randomInt]
  # A Word2Vec model keeps its vectors in `.wv`; a KeyedVectors object is used directly.
  vectors = getattr(W2Vmodel,"wv",W2Vmodel)
  try:
    likely_replace = vectors.most_similar([target_w],topn=15)
  except (KeyError,AttributeError):
    # KeyError: the word is not in the model's vocabulary.
    # AttributeError: the object offers no most_similar lookup at all.
    print(f"Word in concern: {target_w}")
    # The placeholder is returned for inspection only; it is not tagged, so the
    # literal word 'Error' can never end up among the candidates.
    return [],target_w,target_POS,randomInt,[('Error',0.0)]
  list_words = [
    possible_word
    for possible_word,similarity in likely_replace
    if nltk.pos_tag([possible_word])[0][1] == target_POS
  ]
  return list_words,target_w,target_POS,randomInt,likely_replace

##print(sentence)

# Spot-check every tenth corpus line (indices 1, 11, ..., 91) that has more than
# three elements, first with the pre-trained GloVe vectors ...
for i in range(1,100,10):
  sentence = nGramCorpus[i]
  if(len(sentence) > 3):
    print(nGramCorpus[i],generatePossibleWords(nGramCorpus[i],anotherModel))

print()

# ... then with the Word2Vec model trained on the full corpus ...
for i in range(1,100,10):
  s = nGramCorpus[i]
  if (len(s)>3):
    print(s,generatePossibleWords(s,corpusModel))

print()

# ... and finally with the Word2Vec model trained on the sonnets only.
for i in range(1,100,10):
  s = nGramCorpus[i]
  if(len(s)>3):
    print(s,generatePossibleWords(s,sonnetModel))

##print(generatePossibleWords(sentence,corpusModel))
##print(generatePossibleWords(nGramCorpus[14],corpusModel))
##print(f"Sentence: {nGramCorpus[14]}")

['<s>', 'and', 'only', 'herald', 'to', 'the', 'gaudy', 'spring', '</s>'] (['this', 'another', 'both', 'all'], 'the', 'DT', 5, [('this', 0.8573122024536133), ('part', 0.8507951498031616), ('one', 0.8503074049949646), ('of', 0.8328992128372192), ('same', 0.8324545621871948), ('first', 0.8210473656654358), ('on', 0.8199756741523743), ('its', 0.8169469237327576), ('as', 0.8128418922424316), ('that', 0.8079659938812256), ('another', 0.8061550259590149), ('it', 0.804827094078064), ('both', 0.8008524775505066), ('time', 0.8005280494689941), ('all', 0.7978261709213257)])
['<s>', 'then', 'being', 'asked', 'where', 'all', 'thy', 'beauty', 'lies', '</s>'] (['when'], 'where', 'WRB', 4, [('.', 0.8364126682281494), ('now', 0.8304319977760315), ('there', 0.826165497303009), ('when', 0.8133443593978882), ('once', 0.8115141987800598), ('it', 0.8035557866096497), ('one', 0.8034142851829529), ('which', 0.8012702465057373), ('well', 0.7950435876846313), ('they', 0.7896511554718018), ('but', 0.788999199867

In [None]:
##### The three lists below hold stop words (mostly function words such as conjunctions, prepositions and pronouns), modal verbs and primary auxiliary verbs.
##### The modal and auxiliary verbs are used to check that a replacement word is grammatically consistent with the verb that precedes it.
##### The stop words are words the code must never replace. The list also includes Shakespearean forms such as "thee", "thou", "thy", "dost" and "doth",
##### in case the stop-word list of the downloaded Python library does not classify them as stop words.
stop_words = [ "a", "about", "above", "after", "again", "against", "all", "am", "an",
"and", "any", "are", "as", "at", "be", "because", "been", "before",
"being", "below", "between", "both", "but", "by", "could", "did", "do",
"does", "doing", "down", "during", "dost", "doth", "each", "few", "for", "from",
"further", "had", "has", "hast", "have", "having", "he", "he'd", "he'll",
"he's", "her", "here", "here's", "hers", "herself", "him", "himself",
"his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in",
"into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
"most", "my", "myself", "nor", "of", "on", "once", "only", "or",
"other", "ought", "our", "ours", "ourselves", "out", "over", "own",
"same", "she", "she'd", "she'll", "she's", "should", "so", "some",
"such", "than", "that", "that's", "the", "thee", "their", "theirs", "them",
"themselves", "then", "there", "there's", "these", "they", "they'd",
"they'll", "they're", "they've", "thine", "this", "those", "thou", "through", "thy", "thyself", "to",
"too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
"we're", "we've", "were", "what", "what's", "when", "when's", "where",
"where's", "which", "while", "who", "who's", "whom", "why", "why's",
"with", "would", "ye", "you", "you'd", "you'll", "you're", "you've", "your",
"yours", "yourself", "yourselves" ]

modal_verbs = ["must","shall","will","should","would","can","could","may","might"]

primary_aux_verbs = ['have','be','do']

print(nltk.pos_tag(['ate']))
print(nltk.pos_tag(['meal']))
nltk.pos_tag(['i','ate','lunch','yesterday'])

##stanNLP.download('en_ewt')
nlp = spacy.load("en_core_web_sm")
##POSChecker = stanNLP.Pipeline()

##document = POSChecker("My home lies over the ocean.")

##document.sentences[0].print_dependencies()

##client = nlpcloud.Client("My home lies over the ocean.")
##client.sentence_dependencies

##grammarCheck = lang.LanguageTool("en-US",config={'maxTextLength':150,'cacheSize':1000,'pipelineCaching':True})
##sentenceChecker = GingerIt()

##sentenceChecker.parse("the brown fox chickens some eggs")

##def replace_certain_word():
  ##l = "l"

[('ate', 'NN')]
[('meal', 'NN')]


In [None]:
grammarCheck = lang.LanguageTool("en-US",config={'maxTextLength':150,'cacheSize':1000,'pipelineCaching':True})

Downloading LanguageTool 5.7: 100%|██████████| 225M/225M [00:03<00:00, 71.8MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpxakecjbm.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-5.7.zip to /root/.cache/language_tool_python.


In [None]:
gingerCheck = GingerIt()
corrections1 = gingerCheck.parse("you do not laughed")
print(corrections1)
corrections2 = gingerCheck.parse("hence you must sleep")
print(corrections2['corrections'][0])
###grammarCheck.correct("therefore commitment of perfect’st love being made")

{'text': 'you do not laughed', 'result': 'you do not laugh', 'corrections': [{'start': 4, 'text': 'do not laughed', 'correct': 'do not laugh', 'definition': None}]}
{'start': 0, 'text': 'hence you', 'correct': 'Hence, you', 'definition': 'Accept comma addition'}


In [None]:
##sentenceChecker.parse("The brown fox meal some chicken.")
nlp = spacy.load("en_core_web_sm")
results = nlp("The brown fox chicken some chicken.")
##for word in results:
  ##print(word,word.pos_,word.dep_,word.tag_)

##### Creating a word substitution method ########

def word_swap(wordList,sentenceInConcern,trueIndex):
  """Replace one word of a sentence, in place, with a randomly chosen candidate.

  Parameters:
    wordList: non-empty list of candidate replacement words.
    sentenceInConcern: the sentence as a list of words; it is modified in place.
    trueIndex: position of the word to replace.

  Returns:
    None. The chosen word is printed.
  """
  chosen = wordList[random.randint(0,len(wordList)-1)]
  print(f'word in concern: {chosen}\n')
  del sentenceInConcern[trueIndex]
  sentenceInConcern.insert(trueIndex,chosen)


###### Using the spacy method ########
random.seed(1)

def insert_delete(wordArray,indexInConcern,word):
  """Put ``word`` at ``indexInConcern`` of ``wordArray`` and return its spaCy tag in that context.

  The list is modified in place: the word previously at that index is removed
  and ``word`` is inserted at the same index. The words are then joined with
  spaces, parsed with ``nlp``, and the fine-grained tag of the token at
  ``indexInConcern`` is returned.

  Parameters:
    wordArray: the sentence as a list of words (mutated).
    indexInConcern: position of the word to replace and to tag.
    word: the replacement word.

  Returns:
    The ``tag_`` of the parsed token at ``indexInConcern``.
  """
  del wordArray[indexInConcern]
  wordArray.insert(indexInConcern,word)
  parsed = nlp(" ".join(wordArray))
  return parsed[indexInConcern].tag_

sentencesList = list()
#### Using the Shakespearean corpus ####
# For every tenth corpus line (indices 1, 11, ..., 91), try to replace each
# non-stop word with a similar word from corpusModel that spaCy gives the same
# tag when it is placed in the sentence. The resulting sentence strings are
# collected in sentencesList and printed at the end.
for index in range(1,100,10):
  testSentence = nGramCorpus[index]
  # Skip lines of exactly three elements (the two markers around a single word).
  if (len(testSentence)!=3):
    sentence2 = testSentence[1:-1].copy()
    sentString = " ".join(sentence2)
    doc = nlp(sentString)
    docList = [str(w) for w in doc]
    # Count the tokens that must not be replaced: spaCy stop words plus the
    # entries of the notebook's own stop_words list.
    stopWordCount = 0
    for token in doc:
      print(token,token.is_stop)
      if(token.is_stop or stop_words.count(str(token))!=0):
        stopWordCount+=1
    nonStopWordsCount = len(doc) - stopWordCount
    print(stopWordCount)
    # Every token drawn so far (accepted or rejected); a token that appears
    # twice in this list has been drawn before.
    wordsUsed = []
    # One replacement attempt per replaceable token.
    for i in range(nonStopWordsCount):
      randInt = random.randint(0,len(doc)-1) ##1,len(testSentence)-2)
      wordInConcern = doc[randInt]        ##'sun'   ##testSentence[randInt]
      isStopWord = wordInConcern.is_stop  ####nlp(wordInConcern)[0].is_stop
      wordsUsed.append(wordInConcern)
      print(f"Debugging process: {wordInConcern,isStopWord,wordsUsed,sentence2}")
      # Redraw until the token is neither a stop word nor one drawn earlier.
      if(isStopWord==True or wordsUsed.count(wordInConcern)>=2 or stop_words.count(str(wordInConcern))!=0):
        while(isStopWord==True or wordsUsed.count(wordInConcern)>=2 or stop_words.count(str(wordInConcern))!=0):
          randInt = random.randint(0,len(doc)-1)  ##1,len(testSentence)-2)
          wordInConcern = doc[randInt] ##testSentence[randInt]
          isStopWord = wordInConcern.is_stop  ##nlp(wordInConcern)[0].is_stop
          wordsUsed.append(wordInConcern)
          print(f"Random number, word in concern, is_stop_word: {randInt,wordInConcern,isStopWord,wordsUsed}")
      # Tag of the chosen position in the current (possibly already modified) sentence string.
      posInConcern = nlp(sentString)[randInt].tag_  ##randInt-1].tag_
      # Debug dump of all tags. This loop reuses the name i of the enclosing
      # loop; the enclosing range iterator is not affected by that.
      for i in range(len(nlp(sentString))):
        print(f"Result sentence:{nlp(sentString)[i].tag_,nlp(sentString)[i]}")
      print(f"Does this Word2Vec model have this word?  {corpusModel.wv.key_to_index.get(str(wordInConcern))!=None}")
      if(corpusModel.wv.key_to_index.get(str(wordInConcern))!=None):
        listOfPossibleWAndSim = corpusModel.wv.most_similar([str(wordInConcern)],topn=20)
        testList = list()
        for w,s in listOfPossibleWAndSim:
          # insert_delete overwrites docList[randInt] with each candidate in
          # turn, so docList keeps the last candidate tried unless word_swap
          # replaces it below.
          # NOTE(review): when testList stays empty, that last candidate remains
          # in docList and will appear in sentString after a later successful
          # swap - confirm whether this is intended.
          currentPOS=insert_delete(docList,randInt,w) ##randInt,w)
          print(f"POS:{docList,currentPOS,w}")
          if (currentPOS == posInConcern):
            testList.append(w)
        print(posInConcern,wordInConcern)
        print(f"List 2: {testList}\n")
        if(len(testList)!=0):
          word_swap(testList,docList,randInt)   ##randInt-1)
          sentString = " ".join(docList)
    sentencesList.append(sentString)

print("\n\nResults")
for sentence in sentencesList:
  print(sentence)

and True
only True
herald False
to True
the True
gaudy False
spring False
4
Debugging process: (only, True, [only], ['and', 'only', 'herald', 'to', 'the', 'gaudy', 'spring'])
Random number, word in concern, is_stop_word: (4, the, True, [only, the])
Random number, word in concern, is_stop_word: (6, spring, False, [only, the, spring])
Result sentence:('CC', and)
Result sentence:('RB', only)
Result sentence:('VB', herald)
Result sentence:('IN', to)
Result sentence:('DT', the)
Result sentence:('JJ', gaudy)
Result sentence:('NN', spring)
Does this Word2Vec model have this word?  True
POS:(['and', 'only', 'herald', 'to', 'the', 'gaudy', 'top'], 'NN', 'top')
POS:(['and', 'only', 'herald', 'to', 'the', 'gaudy', 'bowels'], 'NNS', 'bowels')
POS:(['and', 'only', 'herald', 'to', 'the', 'gaudy', 'ashes'], 'NNS', 'ashes')
POS:(['and', 'only', 'herald', 'to', 'the', 'gaudy', 'temples'], 'NNS', 'temples')
POS:(['and', 'only', 'herald', 'to', 'the', 'gaudy', 'flame'], 'NN', 'flame')
POS:(['and', 'only'

In [None]:
##sentenceChecker.parse("The brown fox meal some chicken.")
nlp = spacy.load("en_core_web_sm")
results = nlp("The brown fox chicken some chicken.")
##for word in results:
  ##print(word,word.pos_,word.dep_,word.tag_)

##### Creating a word substitution method ########
def word_swap(wordList,sentenceInConcern,trueIndex):
  """Replace one word of a sentence, in place, with a randomly chosen candidate.

  Parameters:
    wordList: non-empty list of candidate replacement words.
    sentenceInConcern: the sentence as a list of words; it is modified in place.
    trueIndex: position of the word to replace.

  Returns:
    None. The chosen word is printed.
  """
  chosen = wordList[random.randint(0,len(wordList)-1)]
  print(f'word in concern: {chosen}\n')
  del sentenceInConcern[trueIndex]
  sentenceInConcern.insert(trueIndex,chosen)


###### Using the spacy method ########
random.seed(1)

def insert_delete(wordArray,indexInConcern,word):
  """Put ``word`` at ``indexInConcern`` of ``wordArray`` and return its spaCy tag in that context.

  The list is modified in place: the word previously at that index is removed
  and ``word`` is inserted at the same index. The words are then joined with
  spaces, parsed with ``nlp``, and the fine-grained tag of the token at
  ``indexInConcern`` is returned.

  Parameters:
    wordArray: the sentence as a list of words (mutated).
    indexInConcern: position of the word to replace and to tag.
    word: the replacement word.

  Returns:
    The ``tag_`` of the parsed token at ``indexInConcern``.
  """
  del wordArray[indexInConcern]
  wordArray.insert(indexInConcern,word)
  parsed = nlp(" ".join(wordArray))
  return parsed[indexInConcern].tag_

####Using the modern corpus model#######
# Same procedure as the Shakespearean-corpus loop, with two differences: the
# candidates come from the pre-trained vectors in anotherModel, and each
# candidate is tagged on its own with nlp(w) instead of being placed in the
# sentence first, so docList is only changed by word_swap.
sentencesList = list()

for index in range(1,100,10):
  testSentence = nGramCorpus[index]
  # Skip lines of exactly three elements (the two markers around a single word).
  if (len(testSentence)!=3):
    sentence2 = testSentence[1:-1].copy()
    sentString = " ".join(sentence2)
    doc = nlp(sentString)
    docList = [str(w) for w in doc]
    # Count the tokens that must not be replaced: spaCy stop words plus the
    # entries of the notebook's own stop_words list.
    stopWordCount = 0
    for token in doc:
      print(token,token.is_stop)
      if(token.is_stop or stop_words.count(str(token))!=0):
        stopWordCount+=1
    nonStopWordsCount = len(doc) - stopWordCount
    print(stopWordCount)
    # Every token drawn so far (accepted or rejected); a token that appears
    # twice in this list has been drawn before.
    wordsUsed = []
    # One replacement attempt per replaceable token.
    for i in range(nonStopWordsCount):
      randInt = random.randint(0,len(doc)-1) ##1,len(testSentence)-2)
      wordInConcern = doc[randInt]        ##'sun'   ##testSentence[randInt]
      isStopWord = wordInConcern.is_stop  ####nlp(wordInConcern)[0].is_stop
      wordsUsed.append(wordInConcern)
      print(f"Debugging process: {wordInConcern,isStopWord,wordsUsed,sentence2}")
      # Redraw until the token is neither a stop word nor one drawn earlier.
      if(isStopWord==True or wordsUsed.count(wordInConcern)>=2 or stop_words.count(str(wordInConcern))!=0):
        while(isStopWord==True or wordsUsed.count(wordInConcern)>=2 or stop_words.count(str(wordInConcern))!=0):
          randInt = random.randint(0,len(doc)-1)  ##1,len(testSentence)-2)
          wordInConcern = doc[randInt] ##testSentence[randInt]
          isStopWord = wordInConcern.is_stop  ##nlp(wordInConcern)[0].is_stop
          wordsUsed.append(wordInConcern)
          print(f"Random number, word in concern, is_stop_word: {randInt,wordInConcern,isStopWord,wordsUsed}")
      # Tag of the chosen position in the current (possibly already modified) sentence string.
      posInConcern = nlp(sentString)[randInt].tag_  ##randInt-1].tag_
      # Debug dump of all tags. This loop reuses the name i of the enclosing
      # loop; the enclosing range iterator is not affected by that.
      for i in range(len(nlp(sentString))):
        print(f"Result sentence:{nlp(sentString)[i].tag_,nlp(sentString)[i]}")
      print(f"Does this Word2Vec model have this word?  {anotherModel.key_to_index.get(str(wordInConcern))!=None}")
      if(anotherModel.key_to_index.get(str(wordInConcern))!=None):
        listOfPossibleWAndSim = anotherModel.most_similar([str(wordInConcern)],topn=20)
        testList = list()
        for w,s in listOfPossibleWAndSim:
          # The candidate is tagged in isolation here, without sentence context.
          currentPOS= nlp(w)[0].tag_
          print(f"POS:{docList,posInConcern,currentPOS,w}")
          if (currentPOS == posInConcern):
            testList.append(w)
        print(posInConcern,wordInConcern)
        print(f"List 2: {testList}\n")
        if(len(testList)!=0):
          word_swap(testList,docList,randInt)   ##randInt-1)
          sentString = " ".join(docList)
    sentencesList.append(sentString)

print("\n\nResults")
for sentence in sentencesList:
  print(sentence)
# Empty the list after printing so it does not carry these results forward.
sentencesList.clear()

and True
only True
herald False
to True
the True
gaudy False
spring False
4
Debugging process: (only, True, [only], ['and', 'only', 'herald', 'to', 'the', 'gaudy', 'spring'])
Random number, word in concern, is_stop_word: (4, the, True, [only, the])
Random number, word in concern, is_stop_word: (6, spring, False, [only, the, spring])
Result sentence:('CC', and)
Result sentence:('RB', only)
Result sentence:('VB', herald)
Result sentence:('IN', to)
Result sentence:('DT', the)
Result sentence:('JJ', gaudy)
Result sentence:('NN', spring)
Does this Word2Vec model have this word?  True
POS:(['and', 'only', 'herald', 'to', 'the', 'gaudy', 'spring'], 'NN', 'NN', 'summer')
POS:(['and', 'only', 'herald', 'to', 'the', 'gaudy', 'spring'], 'NN', 'NN', 'autumn')
POS:(['and', 'only', 'herald', 'to', 'the', 'gaudy', 'spring'], 'NN', 'NN', 'winter')
POS:(['and', 'only', 'herald', 'to', 'the', 'gaudy', 'spring'], 'NN', 'VBG', 'beginning')
POS:(['and', 'only', 'herald', 'to', 'the', 'gaudy', 'spring'], 'N

In [None]:
def determiningPOSThroughSentence(Sent,W_index,tempWord):
  """Return the spaCy tag ``tempWord`` receives when it stands at ``W_index`` of ``Sent``.

  The substitution is made on a copy, so ``Sent`` itself is not changed. Tagging
  the word inside the sentence, rather than on its own, lets the surrounding
  words inform the tag.

  Parameters:
    Sent: the sentence as a list of words.
    W_index: position whose word is swapped out for the trial.
    tempWord: the word to try at that position.

  Returns:
    The ``tag_`` of the parsed token at ``W_index``.
  """
  trial = Sent.copy()
  del trial[W_index]
  trial.insert(W_index,tempWord)
  return nlp(" ".join(trial))[W_index].tag_

def turningListIntoSentence(listToBeConverted):
  """Join a list of tokens into a sentence string, re-attaching apostrophe fragments.

  Tokens are joined with single spaces; a space directly in front of an
  apostrophe is then removed, so a fragment such as "'ll" is glued back onto
  the word before it. Both the straight apostrophe (') and the typographic one
  (U+2019) are handled, because the corpus text contains both forms.

  Parameters:
    listToBeConverted: list of token strings.

  Returns:
    The joined sentence string ("" for an empty list).
  """
  newSentence = " ".join(listToBeConverted)
  for apostrophe in ("'", "\u2019"):
    newSentence = newSentence.replace(" " + apostrophe, apostrophe)
  return newSentence

def substitutingWords(listOfWords,Sent,word_i):
  """Replace ``Sent[word_i]`` in place with a randomly chosen candidate word.

  When the word in front of the target position is a modal verb or a form of a
  primary auxiliary (lemma "be", "have" or "do"), the replacement should carry
  a tag that agrees with it:

    modal verb -> VB             (base form)
    be         -> VBG or VBN     (present or past participle)
    have       -> VBN            (past participle)
    do         -> VB             (base form)

  A first candidate is drawn at random. If it does not carry a required tag,
  every candidate is tried once in random order and the first one that fits is
  used. If none fits, the first draw is kept, so the function always terminates.

  Parameters:
    listOfWords: non-empty list of candidate replacement words.
    Sent: the sentence as a list of words; modified in place.
    word_i: index of the word to replace.

  Returns:
    None. Progress is reported with print().

  Raises:
    ValueError: if ``listOfWords`` is empty.
  """
  if not listOfWords:
    raise ValueError("substitutingWords needs at least one candidate word")

  tagsAfterAuxiliary = {"be": ("VBG","VBN"), "have": ("VBN",), "do": ("VB",)}
  requiredTags = None
  # The first word has no predecessor; Sent[word_i-1] would wrap to the last word.
  if word_i > 0:
    previousWord = Sent[word_i-1]
    if previousWord in modal_verbs:
      requiredTags = ("VB",)
    else:
      previousLemma = nlp(previousWord)[0].lemma_
      if previousLemma in primary_aux_verbs:
        requiredTags = tagsAfterAuxiliary.get(previousLemma)

  r_index = random.randint(0,len(listOfWords)-1)
  r_word = listOfWords[r_index]
  print(f"\n\nBefore: {r_word,r_index}")

  if requiredTags is not None and determiningPOSThroughSentence(Sent,word_i,r_word) not in requiredTags:
    # Try every candidate once, in random order, instead of resampling without a bound.
    for candidateIndex in random.sample(range(len(listOfWords)),len(listOfWords)):
      candidate = listOfWords[candidateIndex]
      if determiningPOSThroughSentence(Sent,word_i,candidate) in requiredTags:
        r_index, r_word = candidateIndex, candidate
        print(f"\n\nAfter: {r_word,r_index}")
        break

  print(f"\nWord choose: {r_word}\n")
  print(f"\nOriginal sentence : {Sent}\n")
  Sent.pop(word_i)
  Sent.insert(word_i,str(r_word))
  newSentence = turningListIntoSentence(Sent)
  ginger = GingerIt()
  corrections = ginger.parse(newSentence)
  # NOTE(review): the step that wrote Ginger's suggestion back into Sent was
  # already commented out, leaving an `if` without a body (a syntax error).
  # The parse call is kept, but its result is currently unused - decide
  # whether to restore the write-back or drop the call.
  print(f"\nFinal sentence : {Sent}\n")

def addingUsedLines(maxIndex,lineInConcern,corpus,listUsed):
  """Draw a random line from ``corpus``, record it in ``listUsed`` and return it.

  Parameters:
    maxIndex: highest corpus index that may be drawn (inclusive).
    lineInConcern: unused; kept so existing calls keep working. Assigning to
      this parameter inside the function could never change the caller's
      variable, which is why the drawn line is returned instead.
    corpus: list of lines to draw from.
    listUsed: list of lines drawn so far; the new line is appended to it.

  Returns:
    The line that was drawn, so the caller can continue with it.
  """
  chosenLine = corpus[random.randint(0,maxIndex)]
  listUsed.append(chosenLine)
  return chosenLine

def posMatch(current,target):
  """Decide whether a candidate's tag is an acceptable match for the target tag.

  A match is accepted when any of these holds:
    * the two tags are identical;
    * both tags start with "VB" (any two verb tags);
    * the target is "NN" and the candidate is "VBG".

  Parameters:
    current: tag of the candidate word.
    target: tag of the word being replaced.

  Returns:
    True when one of the rules applies, otherwise False.
  """
  return (
    current == target
    or (current[:2]=="VB" and target[:2]=="VB")
    or (target == "NN" and current == "VBG")
  )

##### Initially, this method was meant to build the document in list form. It also contained an if statement for the hyphen, so that
##### two words separated by a hyphen would be merged into one. However, it might be fine for the model to create a new word, as Shakespeare did.
##def createDocList(document):
  ##doclist = list()
  ##for i in document:
    ##if(str(i) == "-"):


##  Driver: draws five lines from the corpus and, for each one, tries to replace every non-stop word
##  with a similar word from the GloVe model that carries a compatible part-of-speech tag.
random.seed("Romeo and Juliet")

listOfSentences = list()
sentencesUsed = list()
##  Lines are only drawn from the start of the corpus up to the ['<s>','the','end','</s>'] marker
##  (presumably the end of the sonnets, judging by the variable name; verify against the corpus).
maxIndexForSonnets = nGramCorpus.index(['<s>','the','end','</s>'])

for i in range(5):
  randomSonnetIndex = random.randint(0,maxIndexForSonnets)
  currentLine = nGramCorpus[randomSonnetIndex]
  sentencesUsed.append(currentLine)
  print(f"\nCurrent Line: {currentLine}\n")
  print(f"\nSentence Used: {sentencesUsed}\n")
  ##  If the line was already used, swap the duplicate entry for a fresh draw. The previous loop never
  ##  reassigned currentLine, so its condition could not change and a repeated draw looped forever.
  while(sentencesUsed.count(currentLine) >= 2):
    sentencesUsed.pop()
    randomSonnetIndex = random.randint(0,maxIndexForSonnets)
    currentLine = nGramCorpus[randomSonnetIndex]
    sentencesUsed.append(currentLine)
  if(len(currentLine) != 3):
    newLine = currentLine[1:-1]
    targetSent = " ".join(newLine)
    print(f"The target sentence: {targetSent}")
  else:
    ##  A line of length 3 is a single word between <s> and </s>; keep drawing until a longer line comes up.
    while(len(currentLine) == 3):
      randomIndex = random.randint(0,maxIndexForSonnets)
      currentLine = nGramCorpus[randomIndex]
      ##sentencesUsed.append(currentLine)
    newLine = currentLine[1:-1]
    targetSent = " ".join(newLine)
  document = nlp(targetSent)
  docList = [str(token) for token in document]
  wordsUsed = list()
  print(f"\nSentence in concern : {document}\n")
  ##  One substitution attempt is made per non-stop word in the line.
  stopWordcounter = 0
  for word in document:
    if(word.is_stop or stop_words.count(str(word))>=1):
      stopWordcounter+=1
  nonStopWordsCounter = len(document) - stopWordcounter
  for modification in range(nonStopWordsCounter):
    maxIndexForSentence = len(document)-1
    randWordIndex = random.randint(0,maxIndexForSentence)
    targetWord = document[randWordIndex]
    wordsUsed.append(targetWord)
    print(f"\nThe list of words used: {wordsUsed}\n")
    ##  Redraw while the pick is a stop word or a token that was already picked before.
    while(targetWord.is_stop or wordsUsed.count(targetWord)>=2 or stop_words.count(str(targetWord))>=1):
      randWordIndex = random.randint(0,maxIndexForSentence) ##  the method does not work for this block of code
      targetWord = document[randWordIndex]
      wordsUsed.append(targetWord)
      print(f"\nAnother list of words used: {wordsUsed}\n")
    targetPOS = targetWord.tag_
    print(f"\nTarget Word and Part of Speech : {targetWord,targetPOS}\n")
    ##  Words missing from the embedding vocabulary are left untouched.
    if(anotherModel.key_to_index.get(str(targetWord))!=None):
      possibleWords = anotherModel.most_similar([str(targetWord)])
      wordStorage = list()
      for word,similarity in possibleWords:
        currentPOS = determiningPOSThroughSentence(docList,randWordIndex,word)
        print(f"\n{docList,word,currentPOS}\n")
        ######## alternate way: currentPOS = nlp(word)[0].tag_
        if posMatch(currentPOS,targetPOS):
          wordStorage.append(word)
      if(len(wordStorage)!=0):
        ##print(f"Sentence: {docList}")
        substitutingWords(wordStorage,docList,randWordIndex)
        ##print(f"New Sentence: {docList}")
  listOfSentences.append(docList)
print("\n\n\nResults")
print(listOfSentences)


Current Line: ['<s>', 'what', 'merit', 'do', 'i', 'in', 'my', 'self', 'respect', '</s>']


Sentence Used: [['<s>', 'what', 'merit', 'do', 'i', 'in', 'my', 'self', 'respect', '</s>']]

The target sentence: what merit do i in my self respect

Sentence in concern : what merit do i in my self respect


The list of words used: [merit]


Target Word and Part of Speech : (merit, 'NN')


(['what', 'merit', 'do', 'i', 'in', 'my', 'self', 'respect'], 'awarded', 'VBD')


(['what', 'merit', 'do', 'i', 'in', 'my', 'self', 'respect'], 'excellence', 'NN')


(['what', 'merit', 'do', 'i', 'in', 'my', 'self', 'respect'], 'achievement', 'NN')


(['what', 'merit', 'do', 'i', 'in', 'my', 'self', 'respect'], 'scholarship', 'NN')


(['what', 'merit', 'do', 'i', 'in', 'my', 'self', 'respect'], 'award', 'NN')


(['what', 'merit', 'do', 'i', 'in', 'my', 'self', 'respect'], 'medal', 'NN')


(['what', 'merit', 'do', 'i', 'in', 'my', 'self', 'respect'], 'awarding', 'VBG')


(['what', 'merit', 'do', 'i', 'in', 'my

In [None]:
##  Demo on one sentence: replace the word at `index` with a similar word of a compatible tag,
##  then let GingerIt correct the sentence and copy its word for that position back into the list.
sentence = "you must sleep before midnight"
sentenceInDocumentForm = nlp(sentence)
sentenceInDocumentListForm = [str(word) for word in sentenceInDocumentForm]
index = 2 ; goalPOS = sentenceInDocumentForm[index].tag_
wordsAndSimilarities = anotherModel.most_similar(str(sentenceInDocumentForm[index]))
listOfWords = []
for word, similarity in wordsAndSimilarities:
  currentPOS = determiningPOSThroughSentence(sentenceInDocumentListForm,index,word)
  if posMatch(currentPOS,goalPOS):
    listOfWords.append(word)
##print(nlp("Are you sleeping")[2].tag_)
##print(nlp("She sleeps")[1].tag_)
random.seed(5)
substitutingWords(listOfWords,sentenceInDocumentListForm,index)

ginger = GingerIt()
corrections = ginger.parse(" ".join(sentenceInDocumentListForm))
if(corrections['corrections'] is not None):
  correctDocument = nlp(corrections['result'])
  ##  Store the text, not the spaCy Token, so the list stays a list of strings;
  ##  use `index` rather than a hard-coded 2 so the two cannot drift apart.
  wordToReplace = str(correctDocument[index])
  sentenceInDocumentListForm[index] = wordToReplace
print(sentenceInDocumentListForm)



Before: ('slept', 4)

Word choose: slept


Original sentence : ['you', 'must', 'sleep', 'before', 'midnight']


Final sentence : ['you', 'must', 'slept', 'before', 'midnight']

['you', 'must', sleep, 'before', 'midnight']


In [None]:
##  Scratch cell for re-attaching a detached "'s" before tokenising. Only the assignment is live;
##  the lines below it are disabled, so the output recorded under this cell presumably comes from an earlier run.
sentence = "Laura 's novels are amazing"
###if sentence.__contains__(" '"):
  ##sentence = sentence.replace(" '","'")
##print(sentence)
##docList = [token.text for token in nlp(sentence)]
##for i in docList:
  ##print(i,isinstance(i,str))


Laura True
's True
novels True
are True
amazing True


In [None]:
### This method tags a candidate word in context: the word at the given index is removed, the candidate
### is put in its place, the list is joined back into a sentence, and the tag that the `nlp` pipeline
### assigns to the token at that index is returned.
def findingPartOfSpeech(docSentence,word,index):
  """Return the tag `nlp` gives to `word` when it sits at `index` in the sentence.

  Args:
    docSentence: list of word strings; modified in place, so after the call it
      holds `word` at `index`.
    word: candidate word to place into the sentence.
    index: position of the word being replaced; the tag is read from the token
      at this same position in the re-parsed sentence.

  Returns:
    The `tag_` of the token at `index` in the re-parsed sentence.
  """
  del docSentence[index]
  docSentence.insert(index,word)
  retagged = nlp(" ".join(docSentence))
  return retagged[index].tag_

### This method keeps picking random words from the word list until one of them
### has the specified part of speech in the sentence.
def evaluatePOS(sentence,wordList,partOfSpeech,rPOS,rWord,doc_index):
  """Redraw random candidates until one is tagged `partOfSpeech`.

  Args:
    sentence: list of word strings; findingPartOfSpeech rewrites the slot at
      `doc_index` in place each time a candidate is tried.
    wordList: candidate replacement words (must be hashable, e.g. strings).
    partOfSpeech: tag the chosen word has to carry.
    rPOS: tag of the current pick; if it already matches, nothing is drawn.
    rWord: the current pick.
    doc_index: position in `sentence` of the word being replaced.

  Returns:
    The first drawn word whose tag equals `partOfSpeech`. If every distinct
    candidate has been tried without a match (or the list is empty), the
    `rWord` that was passed in is returned instead of looping forever.
  """
  fallbackWord = rWord
  candidates = set(wordList)
  tried = set()
  while(rPOS != partOfSpeech):
    # Once every distinct candidate has failed, further draws cannot succeed
    # (this relies on the tagger giving the same tag to the same word in the same slot).
    if tried >= candidates:
      print(f"No candidate is tagged {partOfSpeech}; keeping {fallbackWord}")
      return fallbackWord
    rIndex = random.randint(0,len(wordList)-1)
    rWord = wordList[rIndex]
    rPOS = findingPartOfSpeech(sentence,rWord,doc_index)
    tried.add(rWord)
    print(f"Current random word : {rIndex,rWord,rPOS}")
  return rWord

### This method is similar to the previous method, but it accepts either of two parts of speech.
### It is a separate function because Python does not support overloading (same method name with different parameters).
def evaluatePOS2(sentence,wordList,partOfSpeech1,partOfSpeech2,rPOS,rWord,doc_index):
  """Redraw random candidates until one is tagged `partOfSpeech1` or `partOfSpeech2`.

  Args:
    sentence: list of word strings; findingPartOfSpeech rewrites the slot at
      `doc_index` in place each time a candidate is tried.
    wordList: candidate replacement words (must be hashable, e.g. strings).
    partOfSpeech1: first acceptable tag.
    partOfSpeech2: second acceptable tag.
    rPOS: tag of the current pick; if it is already acceptable, nothing is drawn.
    rWord: the current pick.
    doc_index: position in `sentence` of the word being replaced.

  Returns:
    The first drawn word carrying one of the two tags. If every distinct
    candidate has been tried without a match (or the list is empty), the
    `rWord` that was passed in is returned instead of looping forever.
  """
  acceptedTags = (partOfSpeech1,partOfSpeech2)
  fallbackWord = rWord
  candidates = set(wordList)
  tried = set()
  # The old test `rPOS != A or rPOS != B` is true for every tag when A and B
  # differ, so the loop could never end; the word is only rejected when it has neither tag.
  while(rPOS not in acceptedTags):
    # Once every distinct candidate has failed, further draws cannot succeed
    # (this relies on the tagger giving the same tag to the same word in the same slot).
    if tried >= candidates:
      print(f"No candidate is tagged {partOfSpeech1} or {partOfSpeech2}; keeping {fallbackWord}")
      return fallbackWord
    randIndex = random.randint(0, len(wordList)-1)
    rWord = wordList[randIndex]
    rPOS = findingPartOfSpeech(sentence,rWord,doc_index)
    tried.add(rWord)
    print(f"Current random word : {randIndex,rWord,rPOS}")
  return rWord

### This is another similar method with the opposite test: it keeps picking random words
### for as long as the picked word has the specified part of speech.
def evaluatePOS3(sentence,wordList,partOfSpeech,rPOS,rWord,doc_index):
  """Redraw random candidates until one is NOT tagged `partOfSpeech`.

  Args:
    sentence: list of word strings; findingPartOfSpeech rewrites the slot at
      `doc_index` in place each time a candidate is tried.
    wordList: candidate replacement words (must be hashable, e.g. strings).
    partOfSpeech: tag the chosen word must not carry.
    rPOS: tag of the current pick; if it already differs, nothing is drawn.
    rWord: the current pick.
    doc_index: position in `sentence` of the word being replaced.

  Returns:
    The first drawn word whose tag differs from `partOfSpeech`. If every
    distinct candidate carries that tag (or the list is empty), the `rWord`
    that was passed in is returned instead of looping forever.
  """
  fallbackWord = rWord
  candidates = set(wordList)
  tried = set()
  while(rPOS==partOfSpeech):
    # Once every distinct candidate has failed, further draws cannot succeed
    # (this relies on the tagger giving the same tag to the same word in the same slot).
    if tried >= candidates:
      print(f"Every candidate is tagged {partOfSpeech}; keeping {fallbackWord}")
      return fallbackWord
    rIndex = random.randint(0,len(wordList)-1)
    rWord = wordList[rIndex]
    rPOS = findingPartOfSpeech(sentence,rWord,doc_index)
    tried.add(rWord)
    print(f"Current random word : {rIndex,rWord,rPOS}")
  return rWord

### This method joins the words of the list into one space-separated sentence (string). Wherever an
### apostrophe follows a space, the space is dropped so the apostrophe attaches to the previous word.
def convertListToSentence(wordList):
  """Return the words joined by spaces, with every " '" collapsed to "'"."""
  return " ".join(wordList).replace(" '","'")

### The method picks a random index and uses it to obtain a random word from the list.
### The word before the target position helps to determine whether the picked word (in verb form) is grammatically incorrect.
### If it is, the word is replaced by another one from the list with a suitable form.
### The GingerIt software then checks the sentence in case the pos-tagger failed to categorize the part of speech.
### The recommendation for using the method is to pass the document in list format as the Sent parameter.
def substitutingWords(listOfWords,Sent,doc_i):
  """Replace the word at position `doc_i` with a random, grammatically fitting candidate.

  Args:
    listOfWords: candidate replacement words; must not be empty.
    Sent: the sentence as a list of word strings. It is not modified; the work
      is done on a re-tokenised copy.
    doc_i: non-negative position of the word to replace.

  Returns:
    The rewritten sentence as a string (it is also printed).
  """
  r_index = random.randint(0,len(listOfWords)-1)
  r_word = listOfWords[r_index]
  document = nlp(" ".join(Sent))
  documentList = [str(token) for token in document]
  # The first word has no predecessor. Without this guard the lemma was left
  # unassigned (an error at the print below) and doc_i-1 silently meant the last word.
  previousToken = document[doc_i-1] if doc_i > 0 else None
  forLemmaPreviousSentence = previousToken.lemma_ if previousToken is not None else None
  r_pos = findingPartOfSpeech(documentList,r_word,doc_i)
  print(f"\nThe previous word (lemma): {forLemmaPreviousSentence}")
  print(f"\n\nBefore: {r_word,r_index,r_pos}")
  if previousToken is not None:
    # After a modal verb only the base form is acceptable.
    if documentList[doc_i-1] in modal_verbs and r_pos != "VB":
      r_word = evaluatePOS(documentList, listOfWords, "VB", r_pos, r_word, doc_i)
    if forLemmaPreviousSentence in primary_aux_verbs:
      # NOTE(review): spaCy usually lemmatises "is" to "be", so this comparison may never hold — confirm.
      # A word is rejected only when it has neither tag (the old `or` test was always true),
      # and evaluatePOS2 now receives the r_pos argument that was missing from the call.
      if forLemmaPreviousSentence=="is" and r_pos not in ("VBG","VBN"):
        r_word = evaluatePOS2(documentList,listOfWords,"VBG","VBN",r_pos,r_word,doc_i)
      if forLemmaPreviousSentence=="have" and r_pos != "VBN":
        r_word = evaluatePOS(documentList,listOfWords,"VBN",r_pos,r_word,doc_i)
      if forLemmaPreviousSentence=="do" and r_pos != "VB":
        r_word = evaluatePOS(documentList, listOfWords, "VB", r_pos, r_word, doc_i)
    # Directly after a noun, a present participle is swapped for a different form.
    if previousToken.tag_[:2] == "NN" and r_pos == "VBG":
      r_word = evaluatePOS3(documentList,listOfWords,"VBG",r_pos,r_word,doc_i)
  print(f"\nWord choose: {r_word}\n")
  print(f"\nOriginal sentence : {Sent}\n")
  documentList[doc_i] = r_word
  newSentence = convertListToSentence(documentList)
  ginger = GingerIt()
  corrections = ginger.parse(newSentence)
  # Only copy a word back when GingerIt actually changed something, and never
  # read past the end of the corrected sentence.
  if corrections['corrections']:
    correctSentence = nlp(corrections['result'])
    correctedIndex = doc_i
    if correctedIndex < len(correctSentence) and correctSentence[correctedIndex].is_punct:
      correctedIndex += 1
    if correctedIndex < len(correctSentence):
      documentList[doc_i] = str(correctSentence[correctedIndex])
  newSentence = convertListToSentence(documentList)
  print(f"\nFinal sentence : {newSentence}\n")
  return newSentence

### Demo call: replace the word at position 5 of the tokenised sentence with one of the listed candidates.
random.seed(3)
particularSentence = "tracy's miniature sack must contain some candy"
particularSentence2 = [str(token) for token in nlp(particularSentence)]
listOFWORDS = ['damaged','ruined','ripped','rip','ruin','have']
doc_index = 5 ; previous_word = particularSentence2[doc_index-1]
word = particularSentence2[doc_index]
##print(anotherModel.most_similar(word))
### previous_lemma and word_pos are computed for inspection only; they are not passed to substitutingWords.
previous_lemma = nlp(previous_word)[0].lemma_
word_pos = nlp(particularSentence2[doc_index])[0].tag_
substitutingWords(listOFWORDS,particularSentence2,doc_index)
##previous_lemma=="do" and word_pos != "VB"


The previous word (lemma): must


Before: ('ruined', 1, 'VBN')
Current random word : (4, 'ruin', 'VB')

Word choose: ruin


Original sentence : ['tracy', "'s", 'miniature', 'sack', 'must', 'contain', 'some', 'candy']


Final sentence : tracy's miniature sack must ruin some candy



In [None]:
### Checks which tag the pipeline gives to the third word after "must", once for an
### ungrammatical sentence ("slept") and once for a grammatical one ("tell").
testDocument1 = nlp("you must slept")
testDocument2 = nlp("you must tell a joke")

print(f"Test Document 1 : {testDocument1[2].tag_}")
print(f"Test Document 2 : {testDocument2[2].tag_}")

### Prints, for each element yielded by iterating the document, whether it is a plain str.
for i in testDocument2:
  print(isinstance(i,str))

Test Document 1 : VB
Test Document 2 : VB
False
False
False
False
False


In [None]:
##  Scratch cell: assorted experiments with stop words, hyphenated words, tags and dependency labels.
x=['<s>',"larry's",'spaghetti','is','amazing','like','my',"grandma's",'sandwich','and',"sally's",'savory','tacos','</s>']
y=['<s>',"larry's",'spaghetti','</s>']
##  Drop the <s> and </s> markers before building the sentence.
wordsInList = x[1:-1].copy()
sentence = " ".join(wordsInList)
##sentence = "before midnight, we played games."
##  (Re)binds the global nlp pipeline. Cells earlier in this part of the notebook already call nlp(...),
##  so it is presumably also loaded further up — TODO confirm.
nlp = spacy.load("en_core_web_sm")

doc = nlp(sentence)
word = doc[0]
counter = 0
list1 = list()
##  Count the tokens that spaCy flags as stop words.
for w in doc:
  print(f"Word: {w,w.is_stop}")
  if w.is_stop:
    counter+=1
print(f"Number of non-stopwords: {len(doc)-counter} Counter: {counter}")

print(f"Is '-' a stop word: {nlp('-')[0].is_stop}")

print(f"{anotherModel.most_similar(['self-discipline'])}")

##print(f"{nltk.pos_tag(['then','beauteous','niggard','why','dost','thou','abuse'])}")

##print(anotherModel.key_to_index.get("th'vein")==None)

testDoc = nlp("then beauteous niggard why dost thou abuse")

##  The tag expression below is evaluated but not displayed, because it is not the last expression of the cell.
testDoc[-1].tag_ ; l = list()

##  Appends the same list twice; the count on the next line is likewise computed without being displayed.
for i in range(2):
  l.append(y)
l.count(y)

##  Collects the tokens of a hyphenated phrase into a plain list.
array = []
steak = nlp("my fire-smoked steak")
for i in steak:
  array.append(i)
##if(array.__contains__(nlp("-"))):
  ##index = array.index("-")
  ##newWord = nlp("".join([]))

document = nlp("you have napped")
print(document[2].dep_)
#if(len(doc)>len(wordsInList)):
#  positionDifference = len(doc)-len(wordsInList)
#  index = 3
#  print(wordsInList[index])
#  print(doc[index+1])
#positionDifference
##  Last expression of the cell, so its value is what the notebook displays.
nlp("is controlled")[1].tag_

Word: (larry, False)
Word: ('s, True)
Word: (spaghetti, False)
Word: (is, True)
Word: (amazing, False)
Word: (like, False)
Word: (my, True)
Word: (grandma, False)
Word: ('s, True)
Word: (sandwich, False)
Word: (and, True)
Word: (sally, False)
Word: ('s, True)
Word: (savory, False)
Word: (tacos, False)
Number of non-stopwords: 9 Counter: 6
Is '-' a stop word: False
[('self-reliance', 0.7289815545082092), ('self-confidence', 0.7239317297935486), ('self-denial', 0.6930438280105591), ('self-control', 0.6915157437324524), ('self-restraint', 0.6889591217041016), ('innovativeness', 0.6723949313163757), ('self-knowledge', 0.6351141929626465), ('open-mindedness', 0.62314772605896), ('unselfishness', 0.6142915487289429), ('straightforwardness', 0.6129526495933533)]
ROOT


'VBN'

In [None]:
##  Compares two ways of tagging candidate replacement words with nltk:
##  (1) insert the candidate into the sentence and tag the whole sentence,
##  (2) tag the candidate on its own.
##  The seed has no effect on the live code below; the only random call is commented out.
random.seed("sonnet")

##nltk.pos_tag(['laughter and tragedy'])[0]

sentence = ['<s>','the','brown','fox','ate','some','chicken','</s>']   ##nGramCorpus[5]
tempSentence = sentence[1:-1].copy()
print(nltk.pos_tag(tempSentence))

num = 3 ##random.randint(1,len(tempSentence)-2)
target_pos = nltk.pos_tag(tempSentence)[num][1]
##  The target word is removed here, so each candidate can be inserted in its place inside the loop.
word = tempSentence.pop(num)
list_of_possible_w = anotherModel.most_similar([word]) ##  It seems that the corpus model might be the most suitable model

array = list()

print("\nUsing the word-insert method to determine POS\n")

for w,s in list_of_possible_w:
  ##  Insert the candidate, read its tag in context, then remove it again.
  tempSentence.insert(num,w)
  current_pos = nltk.pos_tag(tempSentence)[num][1]
  print(nltk.pos_tag(tempSentence))
  tempSentence.pop(num)
  if (current_pos == target_pos):
    array.append(w)
    ##print(f"{current_pos},{w}")
tempSentence.insert(num,word) ##  making sure that the second sentence has the verb "ate"
print()

print(f"Word in concern: {word},{target_pos}")
print(f"Result array: {array}")
##print(f"List of likely words: {list_of_possible_w}")

##nltk.pos_tag(sentence)


######################################################
print("\nUsing only word to determine POS\n")
num2 = 3
tar_word = nltk.pos_tag(tempSentence)[num2][0]
tar_pos = nltk.pos_tag(tempSentence)[num2][1]
words_with_high_sim = anotherModel.most_similar([tar_word])
list_of_words = []
##  Note: this loop reuses the name `word`, overwriting the target word stored above.
for word, similarity in words_with_high_sim:
  ##  Each candidate is tagged in isolation, as a one-word list.
  POS_in_concern = nltk.pos_tag([word])[0][1]
  print(f"{word} {POS_in_concern}")
  if(POS_in_concern==tar_pos):
    list_of_words.append(word)
print()
print(f"Result array: {list_of_words}")
print(f"{tar_word}, {tar_pos}")

[('the', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('ate', 'VB'), ('some', 'DT'), ('chicken', 'NN')]

Using the word-insert method to determine POS

[('the', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('eaten', 'VB'), ('some', 'DT'), ('chicken', 'NN')]
[('the', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('eat', 'VB'), ('some', 'DT'), ('chicken', 'NN')]
[('the', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('eating', 'VBG'), ('some', 'DT'), ('chicken', 'NN')]
[('the', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('eats', 'VBZ'), ('some', 'DT'), ('chicken', 'NN')]
[('the', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('cooked', 'VBD'), ('some', 'DT'), ('chicken', 'NN')]
[('the', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('meal', 'VB'), ('some', 'DT'), ('chicken', 'NN')]
[('the', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('drank', 'VBD'), ('some', 'DT'), ('chicken', 'NN')]
[('the', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('hamburgers', 'NNS'), ('some', 'DT'), ('chicken', 'NN')]
[('the', 'DT'), ('brown', 'JJ'), ('fox', 'NN')

In [None]:
#### This method serialises the n-gram as pretty-printed JSON (two-space indent, keys sorted)
#### and writes the result to a text file.
def toJson(fileName,nGram):   ## based on code of the json
  """Write `nGram` to `fileName` as indented JSON with sorted keys.

  The JSON text is built completely before the file is opened, so a value that
  cannot be serialised raises before any file is created or truncated.
  The file is written with the 'UTF-8 SIG' encoding (presumably Python's
  utf-8-sig codec, which prefixes a byte-order mark; confirm if readers care).
  """
  serialized = json.dumps(nGram,sort_keys=True,indent=2)
  with open(fileName,mode='w',encoding='UTF-8 SIG') as outFile:
    outFile.write(serialized)

#### Writes the trigram (built in an earlier cell) to trigramJSON.txt in the current working directory.
toJson('trigramJSON.txt',trigram)

In [None]:
##  Trains a Word2Vec model on the n-gram corpus with gensim's default settings; no seed or other option is passed here.
shakespeareModel = gensim.models.Word2Vec(nGramCorpus)

In [None]:
##  Takes the keyed vectors out of the trained model, prints the vector for 'king', and runs the
##  king - man + woman analogy query (the last expression, so its result is displayed).
shakespeareVec = shakespeareModel.wv  ##  one participant in the stack overflow recommends using the keyed vectors to access the necessary methods to observe vectors
print(shakespeareVec.get_vector('king'))
shakespeareVec.most_similar(positive=['king','woman'],negative=['man']) ##  might need to improve the method of creating the model. Gives unrelated words.

[-1.4896965   0.1447495   0.0036604   0.25758362  0.49845004 -0.46043837
  0.21552297  0.20915282 -0.76130944 -0.1171189  -1.0549004  -1.7121533
 -1.3556584  -0.05107461 -0.72724795 -0.13814832  0.36822715  0.3621659
  0.3265515  -0.73169017  0.70435935 -0.07982887  1.5312544  -0.25616178
  0.7680597  -0.28350765 -1.0291958   1.1405867   0.59878516 -0.7911796
  0.77788746 -0.02804234  0.6727259  -0.8300943  -0.4491143   0.7938127
 -0.48172775 -0.60219496 -0.8509416  -0.8016355  -0.80425304 -0.12479208
 -0.7453309  -0.21951714  0.0509445   0.1836334  -1.0511628  -0.71461177
 -0.7396245  -0.02729492  0.41317728 -0.8457642  -1.1995511   0.50138617
 -0.3878732  -1.2224385   1.155316   -0.98782855 -0.442695    1.5800965
  0.0377332  -0.7573597   1.4008025  -1.3585383  -0.36837465  1.6977412
 -0.26921254  0.26928177  0.16641732 -0.9248975  -0.34953982  1.9365983
  0.4783118   0.6561674   1.0384973  -1.4027077   0.32118008 -0.12025905
 -0.25305822  0.16531774 -0.9188779   0.17844805 -0.193190

[('prince', 0.7901446223258972),
 ('queen', 0.748397946357727),
 ('duke', 0.7301698923110962),
 ('talbot', 0.716636598110199),
 ('gaunt', 0.6881637573242188),
 ('fifth', 0.6848716139793396),
 ('warwick', 0.6816003322601318),
 ('claudio', 0.6788616180419922),
 ('dauphin', 0.6657427549362183),
 ('plantagenet', 0.6627748012542725)]

In [None]:
##  Displays the words closest to 'greatness' in the Shakespeare model; the two queries below are disabled.
##shakespeareVec.doesnt_match(['king','spirit','queen','nobility'])
##shakespeareVec.distance('caesar','emperor')
shakespeareVec.most_similar(positive=['greatness'])

[('affection', 0.9328632950782776),
 ('privilege', 0.9308606386184692),
 ('goodness', 0.9297807216644287),
 ('fate', 0.9297003746032715),
 ('folly', 0.9283994436264038),
 ('estate', 0.926902174949646),
 ('wound', 0.9251803755760193),
 ('entertainment', 0.9234305024147034),
 ('teeth', 0.9220489263534546),
 ('complexion', 0.9183086156845093)]

In [None]:
##  Displays the vector for 'king' as the cell's value (the same vector that is printed in an earlier cell).
shakespeareVec.get_vector('king')

array([-1.4896965 ,  0.1447495 ,  0.0036604 ,  0.25758362,  0.49845004,
       -0.46043837,  0.21552297,  0.20915282, -0.76130944, -0.1171189 ,
       -1.0549004 , -1.7121533 , -1.3556584 , -0.05107461, -0.72724795,
       -0.13814832,  0.36822715,  0.3621659 ,  0.3265515 , -0.73169017,
        0.70435935, -0.07982887,  1.5312544 , -0.25616178,  0.7680597 ,
       -0.28350765, -1.0291958 ,  1.1405867 ,  0.59878516, -0.7911796 ,
        0.77788746, -0.02804234,  0.6727259 , -0.8300943 , -0.4491143 ,
        0.7938127 , -0.48172775, -0.60219496, -0.8509416 , -0.8016355 ,
       -0.80425304, -0.12479208, -0.7453309 , -0.21951714,  0.0509445 ,
        0.1836334 , -1.0511628 , -0.71461177, -0.7396245 , -0.02729492,
        0.41317728, -0.8457642 , -1.1995511 ,  0.50138617, -0.3878732 ,
       -1.2224385 ,  1.155316  , -0.98782855, -0.442695  ,  1.5800965 ,
        0.0377332 , -0.7573597 ,  1.4008025 , -1.3585383 , -0.36837465,
        1.6977412 , -0.26921254,  0.26928177,  0.16641732, -0.92