In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re
import stanza
# Fetch the default English Stanza models (the download log reports the
# archive as already present when it was fetched before).
stanza.download('en')
# Build the default English pipeline.
# NOTE(review): per the load log this brings up tokenize, pos, lemma, depparse,
# sentiment, constituency and ner; the cells visible here only read the
# dependency parse and xpos tags, so a narrower `processors=` list may load
# faster - confirm before changing.
stanza_nlp = stanza.Pipeline('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-05-12 11:38:04 INFO: Downloading default packages for language: en (English)...
2022-05-12 11:38:05 INFO: File exists: /Users/user/stanza_resources/en/default.zip
2022-05-12 11:38:08 INFO: Finished downloading models and saved to /Users/user/stanza_resources.


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-05-12 11:38:09 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2022-05-12 11:38:09 INFO: Use device: cpu
2022-05-12 11:38:09 INFO: Loading: tokenize
2022-05-12 11:38:09 INFO: Loading: pos
2022-05-12 11:38:09 INFO: Loading: lemma
2022-05-12 11:38:09 INFO: Loading: depparse
2022-05-12 11:38:09 INFO: Loading: sentiment
2022-05-12 11:38:10 INFO: Loading: constituency
2022-05-12 11:38:10 INFO: Loading: ner
2022-05-12 11:38:10 INFO: Done loading processors!


In [2]:
# Fetch the NLTK resources used by this notebook. Each call returns True on
# success, so the cell displays the result of the last one.
# NOTE(review): presumably 'punkt' and 'averaged_perceptron_tagger' back the
# TextBlob tokenizer/tagger used below, and 'stopwords' is not referenced in the
# cells visible here - verify.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
from textblob import TextBlob
# Example review sentence, lower-cased before any tagging or parsing.
txt = "The Sound Quality is great but the battery life is very bad, and also what is with the shitty phone size.".lower()
# TextBlob wrapper around the sentence; later cells read its `pos_tags`,
# `words` and `string` attributes.
wiki = TextBlob(txt)

In [4]:
def find_noun_phrases_boundaries(pos, noun_tags=('NN',)):
    """Locate maximal runs of consecutive noun tags in a POS-tag sequence.

    Args:
        pos: Sequence of POS tag strings, one per token.
        noun_tags: Collection of tags that count as nouns. Matching is by
            exact equality, so with the default ``('NN',)`` tags such as
            ``'NNS'`` or ``'NNP'`` are not treated as nouns unless listed.

    Returns:
        A list of half-open ``(start, stop)`` index pairs, in order, such that
        ``pos[start:stop]`` is a run of one or more noun tags that cannot be
        extended in either direction. Empty input yields an empty list.
    """
    results = []
    run_start = None  # index where the currently open noun run began, if any
    for i, tag in enumerate(pos):
        if tag in noun_tags:
            if run_start is None:
                run_start = i
        elif run_start is not None:
            # A non-noun tag closes the open run.
            results.append((run_start, i))
            run_start = None
    # Close a run that reaches the end of the sequence. Tracking the run start
    # explicitly keeps a single trailing noun and never emits an empty span
    # when the sequence ends on a non-noun tag.
    if run_start is not None:
        results.append((run_start, len(pos)))
    return results

In [5]:
# Keep only the tag half of each (word, tag) pair, find the noun runs, then
# slice the matching words out of the blob.
# NOTE(review): this relies on `wiki.pos_tags` and `wiki.words` being aligned
# index-for-index - confirm for inputs with contractions or unusual punctuation.
tags = [pos for _word, pos in wiki.pos_tags]
noun_phrases_boundaries = find_noun_phrases_boundaries(tags)
noun_phrases = [wiki.words[start:stop] for start, stop in noun_phrases_boundaries]
noun_phrases

[WordList(['sound', 'quality']),
 WordList(['battery', 'life']),
 WordList(['phone', 'size'])]

In [6]:
# Collapse every detected noun phrase into a single token
# ("sound quality" -> "soundquality") so that later parsers see one noun.
merged_text = wiki.string
for noun_phrase in noun_phrases:
    # Literal replacement: the phrase is plain text, not a regex pattern, so
    # characters such as '.', '+' or '(' inside a word must not be interpreted.
    merged_text = merged_text.replace(' '.join(noun_phrase), ''.join(noun_phrase))
# Rebind `wiki` once, after all phrases are merged; later cells read wiki.string.
wiki = TextBlob(merged_text)

print(wiki)

the soundquality is great but the batterylife is very bad, and also what is with the shitty phonesize.


In [7]:
import spacy
from spacy import displacy

# Load the `en_core_web_sm` spaCy model and parse the text currently held in
# `wiki` (after the noun phrases were merged into single tokens).
nlp = spacy.load('en_core_web_sm')
doc = nlp(wiki.string)

In [8]:
# Surface forms of every token that spaCy tags with the coarse POS 'NOUN'.
selected_tokens = [token.text for token in doc if token.pos_ == 'NOUN']
selected_tokens

['soundquality', 'batterylife', 'phonesize']

In [12]:
# Look up the human-readable meaning of the 'cop' dependency label.
spacy.explain('cop')

'copula'

In [9]:
# Parse the merged text with Stanza and, for every `nsubj` / `amod` relation,
# record a pair of (text, xpos) tuples. The head comes first when its text is
# one of the selected nouns; otherwise the dependent comes first.
stanza_doc = stanza_nlp(wiki.string)
deps = []
relations_of_interest = ['nsubj', 'amod']
# Walk every sentence: indexing only `sentences[0]` silently ignored the rest
# of a multi-sentence text and raised IndexError on an empty document.
for sentence in stanza_doc.sentences:
    # Each dependency is a (head word, relation label, dependent word) triple.
    for head, relation, dependent in sentence.dependencies:
        print(head, relation, dependent)
        print('*' * 10)
        if relation not in relations_of_interest:
            continue
        head_info = (head.text, head.xpos)
        dependent_info = (dependent.text, dependent.xpos)
        if head.text in selected_tokens:
            deps.append((head_info, dependent_info))
        else:
            # NOTE(review): this branch also fires when the dependent is not in
            # `selected_tokens`, so pairs without any selected noun can be
            # recorded - confirm whether that is intended.
            deps.append((dependent_info, head_info))

{
  "id": 2,
  "text": "soundquality",
  "lemma": "soundquality",
  "upos": "NOUN",
  "xpos": "NN",
  "feats": "Number=Sing",
  "head": 4,
  "deprel": "nsubj",
  "start_char": 4,
  "end_char": 16
} det {
  "id": 1,
  "text": "the",
  "lemma": "the",
  "upos": "DET",
  "xpos": "DT",
  "feats": "Definite=Def|PronType=Art",
  "head": 2,
  "deprel": "det",
  "start_char": 0,
  "end_char": 3
}
**********
{
  "id": 4,
  "text": "great",
  "lemma": "great",
  "upos": "ADJ",
  "xpos": "JJ",
  "feats": "Degree=Pos",
  "head": 0,
  "deprel": "root",
  "start_char": 20,
  "end_char": 25
} nsubj {
  "id": 2,
  "text": "soundquality",
  "lemma": "soundquality",
  "upos": "NOUN",
  "xpos": "NN",
  "feats": "Number=Sing",
  "head": 4,
  "deprel": "nsubj",
  "start_char": 4,
  "end_char": 16
}
**********
{
  "id": 4,
  "text": "great",
  "lemma": "great",
  "upos": "ADJ",
  "xpos": "JJ",
  "feats": "Degree=Pos",
  "head": 0,
  "deprel": "root",
  "start_char": 20,
  "end_char": 25
} cop {
  "id": 3,
 

In [10]:
# Display the collected ((text, xpos), (text, xpos)) pairs.
deps

[(('soundquality', 'NN'), ('great', 'JJ')),
 (('batterylife', 'NN'), ('bad', 'JJ')),
 (('phonesize', 'NN'), ('shitty', 'JJ'))]

In [11]:
# Render spaCy's dependency parse of `doc` as a diagram.
displacy.render(doc, style='dep')