In [1]:
import scipy
import numpy as np
import json

import spacy
import tensorflow
import keras
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
  import Features, KeywordsOptions, EntitiesOptions, SemanticRolesOptions

import nltk
import string

# from __future__ import absolute_import
# from __future__ import division, print_function, unicode_literals

# from sumy.parsers.html import HtmlParser
# from sumy.parsers.plaintext import PlaintextParser
# from sumy.nlp.tokenizers import Tokenizer
# from sumy.summarizers.lsa import LsaSummarizer as Summarizer
# from sumy.nlp.stemmers import Stemmer
# from sumy.utils import get_stop_words

import inflection
import language_check
nlp = spacy.load('en_core_web_lg')

#nltk.download('punkt')
#nltk.download('wordnet')

Using TensorFlow backend.


In [2]:
# Read the Watson service credentials from a local JSON file so that no
# secrets are hard-coded in the notebook.
with open('credentials.json') as f:
    data = json.loads(f.read())

# The file must provide all three keys; a missing key raises KeyError.
url, username, password = data["url"], data["username"], data["password"]

In [3]:
# Create the LanguageTool grammar checker (configured with 'en-US') that the
# question-writing cells use to check and auto-correct generated questions.
grammar_tool = language_check.LanguageTool('en-US')

In [4]:
# Article that the questions are generated from.
ARTICLE_URL = 'https://www.nytimes.com/2018/07/16/opinion/trump-and-putin-vs-america.html?action=click&pgtype=Homepage&clickSource=story-heading&module=opinion-c-col-left-region&region=opinion-c-col-left-region&WT.nav=opinion-c-col-left-region'

# Authenticate against Watson Natural Language Understanding.
# NOTE(review): the `url` value read from credentials.json is not passed to
# the client, so presumably the SDK's default endpoint is used — confirm that
# this is intended.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    username=username,
    password=password,
    version='2018-03-16')

# Ask for up to 20 keywords, up to 50 entities and the semantic roles;
# sentiment and emotion scoring are switched off for keywords and entities.
requested_features = Features(
    keywords=KeywordsOptions(sentiment=False, emotion=False, limit=20),
    entities=EntitiesOptions(sentiment=False, emotion=False, limit=50),
    semantic_roles=SemanticRolesOptions())

response = natural_language_understanding.analyze(
    url=ARTICLE_URL,
    language='en',
    features=requested_features)

# Pull out the three result lists used by the question-generation cells.
entities, keywords, semantic = (
    response['entities'], response['keywords'], response['semantic_roles'])

In [5]:
# Question 1
# Extract keywords and entities
# define type of words
# create questions
def Q1(x):
    """Return the question opener that matches an entity type.

    Args:
        x: Entity type label (e.g. ``'Person'`` or ``'Location'``).

    Returns:
        ``"Who is "`` for ``'Person'``, ``"Where is "`` for ``'Location'``,
        and ``"What is "`` for every other value.
    """
    openers = {'Person': "Who is ", 'Location': "Where is "}
    if x in openers:
        return openers[x]
    return "What is "

# Write one "Who/Where/What is <entity>?" question per extracted entity,
# running each through the grammar checker before it is written.
# The encoding is set to UTF-8 explicitly (as the Questions4/Questions5
# writers do) so that entity names containing non-ASCII characters do not
# raise UnicodeEncodeError on platforms whose default encoding is not UTF-8.
with open("Questions1.txt", "w", encoding='UTF-8') as file:
    for en in entities:
        text = Q1(en['type']) + en['text'] + "?"
        matches = grammar_tool.check(text)
        correct_text = language_check.correct(text, matches)
        file.write("%s\n" % correct_text)

In [6]:
# Question 2
# Extract keywords and entities.
# Determine the plurality of the keywords using nltk.
# Create questions of the form "What are ...?" / "Who are ...?"
# TODO: How to determine the plurality of a word?
# TODO: If an `if` condition does not suffice, what else?

In [7]:
# Question 4
# Why (factual statement)?
# TODO. How to determine which sentences are important enough to negate? TEXTSUM, sumy NOTWORKING

# Question 5
# What if (negated statements)?
# TODO. How to negate sentences? add not, find antonym
# TODO. How to determine which sentences are important enough to negate? summarize? gensim, pyteaser, pytextrank, TEXTSUM, sumy

Q4 = []  # "Why ...?" questions
Q5 = []  # "What if ... not ...?" questions

# Subjects that take a plural verb even though they carry no plural suffix.
listOfPlurals = ["they", "some", "most", "we"]
# Translation table that strips punctuation (not used in this cell).
translator = str.maketrans('', '', string.punctuation)


def _is_plural(subject):
    """Heuristically decide whether ``subject`` takes a plural verb.

    A subject counts as plural when ``inflection.singularize`` changes it, or
    when it is one of the words in ``listOfPlurals`` (compared
    case-insensitively, because the caller capitalizes the subject).

    Args:
        subject: Subject text of a semantic role.

    Returns:
        True if the subject is treated as plural, otherwise False.
    """
    # Compare by value: `is not` tests object identity, which says nothing
    # reliable about whether two strings have the same content.
    return (subject != inflection.singularize(subject)
            or subject.lower() in listOfPlurals)


def _write_corrected(path, questions):
    """Grammar-correct every question and write them to ``path``, one per line."""
    with open(path, "w", encoding='UTF-8') as file:
        for q in questions:
            matches = grammar_tool.check(q)
            corrected_q = language_check.correct(q, matches)
            file.write("%s\n" % corrected_q)


for sentence in semantic:
    # Only roles with exactly four fields are used (presumably the sentence
    # text plus subject, action and object — verify against the API response).
    if len(sentence) != 4:
        continue

    verb = sentence['action']['normalized']
    # NOTE(review): str.capitalize() also lower-cases everything after the
    # first character, so proper nouns such as "Donald Trump" become
    # "Donald trump" — confirm whether that is intended.
    subj = sentence['subject']['text'].capitalize()
    obj = sentence['object']['text']

    # A normalized verb of "s" yields no question.
    if verb == "s":
        continue

    plurality = _is_plural(subj)
    if verb == "be":
        if plurality:
            Q4.append("Why are " + subj + " " + obj + "?")
            Q5.append("What if " + subj + " were not " + obj + "?")
        else:
            Q4.append("Why is " + subj + " " + obj + "?")
            Q5.append("What if " + subj + " was not " + obj + "?")
    else:
        if plurality:
            Q4.append("Why do " + subj + " " + verb + " " + obj + "?")
        else:
            Q4.append("Why does " + subj + " " + verb + " " + obj + "?")
        Q5.append("What if " + subj + " did not " + verb + " " + obj + "?")

_write_corrected("Questions4.txt", Q4)
_write_corrected("Questions5.txt", Q5)

# TODO: Check relevance to text
# TODO: lets do grammar check first and delete or fix the grammar before comparing relevance to text



In [8]:
# Extract the distinct named entities of the article with the spaCy pipeline
# `nlp` (author's note: make sure to use the larger model!).
# Entity labels listed here are skipped.
exceptList = ['DATE', 'MONEY', 'TIME', 'ORDINAL', 'QUANTITY', 'CARDINAL', 'PERCENT']
with open('article.txt', 'r') as file:
    textQ3 = file.readlines()

# Title-cased entity texts that were already written, to avoid duplicates.
entities_seen = set()
with open('named_entities.txt', 'w') as file:
    for line in textQ3:
        doc = nlp(line)
        for ent in doc.ents:
            if ent.label_ in exceptList:
                continue
            entity_name = ent.text.title()
            if entity_name in entities_seen:
                continue
            file.write(entity_name + '\n')
            entities_seen.add(entity_name)


In [9]:
# Build one string holding every title-cased keyword and tokenize it.
# The keywords are joined with a space: concatenating them directly would
# fuse the end of one keyword with the start of the next into a single token.
keyString = " ".join(key['text'].title() for key in keywords)
keytokens = nlp(keyString)

# Tokenize the named entities written to named_entities.txt.
with open('named_entities.txt', 'r') as file:
    tokens = nlp(file.read())

In [10]:
# Pair each named-entity token with a similar (but not identical) keyword
# token and ask for the difference between the two.
dup = set()  # entity token texts that already produced a question
# UTF-8 is set explicitly, as the Questions4/Questions5 writers do, so that
# non-ASCII tokens do not fail on platforms with another default encoding.
with open("Questions3.txt", "w", encoding='UTF-8') as file:
    for key in keytokens:
        for token in tokens:
            # Cheap filters first: skip very short tokens and tokens already used.
            if len(token.text) <= 2 or token.text in dup:
                continue
            # Never compare a word with itself; an exact `!= 1` test on a
            # floating-point similarity is not a reliable way to detect that.
            if token.text.lower() == key.text.lower():
                continue
            tokenSimilarity = token.similarity(key)
            # Keep pairs that are related (> 0.6) but not effectively identical.
            if 0.6 < tokenSimilarity < 1 - 1e-6:
                file.write("What is the difference between %s and %s?\n" % (token.text, key.text))
                dup.add(token.text)