# DIGS 20006/30006 : NLP Assignment 5

I have chosen to do this assignment in English, and the text I am using here is 'The Count of Monte Cristo' By Alexandre Dumas.

In [1]:
import nltk
from nltk.corpus import words
from nltk.corpus import wordnet 
import pandas as pd
from nltk.chunk import conlltags2tree, tree2conlltags
from collections import Counter
from pprint import pprint

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline  

In [3]:
import spacy
from spacy import displacy

In [4]:
with open("MC.txt") as f:
    content = f.read().splitlines()
    content = list(filter(None, content))
    book = " ".join(content)

In [5]:
text=book[:1000000]

In [52]:
book[:1000]

'Chapter 1. Marseilles—The Arrival On the 24th of February, 1815, the look-out at Notre-Dame de la Garde signalled the three-master, the Pharaon from Smyrna, Trieste, and Naples. As usual, a pilot put off immediately, and rounding the Château d’If, got on board the vessel between Cape Morgiou and Rion island. Immediately, and according to custom, the ramparts of Fort Saint-Jean were covered with spectators; it is always an event at Marseilles for a ship to come into port, especially when this ship, like the Pharaon, has been built, rigged, and laden at the old Phocee docks, and belongs to an owner of the city. The ship drew on and had safely passed the strait, which some volcanic shock has made between the Calasareigne and Jaros islands; had doubled Pomègue, and approached the harbor under topsails, jib, and spanker, but so slowly and sedately that the idlers, with that instinct which is the forerunner of evil, asked one another what misfortune could have happened on board. However, th

In [54]:
len(text)

1000000

In [7]:
#NLTK
tok = nltk.word_tokenize(text)
pos = nltk.pos_tag(tok)
posdf = pd.DataFrame(pos)
print('The num of NLTK tokens is : %d' % len(tok))

The num of NLTK tokens is : 216765


In [8]:
chunks = nltk.ne_chunk(pos)  # chunks is a Tree
pprint(chunks[:50])

[('Chapter', 'NN'),
 ('1', 'CD'),
 ('.', '.'),
 ('Marseilles—The', 'NNP'),
 ('Arrival', 'NNP'),
 ('On', 'IN'),
 ('the', 'DT'),
 ('24th', 'CD'),
 ('of', 'IN'),
 ('February', 'NNP'),
 (',', ','),
 ('1815', 'CD'),
 (',', ','),
 ('the', 'DT'),
 ('look-out', 'NN'),
 ('at', 'IN'),
 Tree('ORGANIZATION', [('Notre-Dame', 'NNP')]),
 ('de', 'NNP'),
 ('la', 'FW'),
 Tree('PERSON', [('Garde', 'NNP')]),
 ('signalled', 'VBD'),
 ('the', 'DT'),
 ('three-master', 'NN'),
 (',', ','),
 ('the', 'DT'),
 ('Pharaon', 'NNP'),
 ('from', 'IN'),
 Tree('GPE', [('Smyrna', 'NNP')]),
 (',', ','),
 Tree('PERSON', [('Trieste', 'NNP')]),
 (',', ','),
 ('and', 'CC'),
 Tree('GPE', [('Naples', 'NNP')]),
 ('.', '.'),
 ('As', 'IN'),
 ('usual', 'JJ'),
 (',', ','),
 ('a', 'DT'),
 ('pilot', 'NN'),
 ('put', 'VBD'),
 ('off', 'RP'),
 ('immediately', 'RB'),
 (',', ','),
 ('and', 'CC'),
 ('rounding', 'VBG'),
 ('the', 'DT'),
 Tree('ORGANIZATION', [('Château', 'NNP')]),
 ('d', 'NN'),
 ('’', 'NN'),
 ('If', 'IN')]


In [10]:
nltk_labels = []
for chunk in chunks:
    if hasattr(chunk, 'label'):
        nltk_labels.append(chunk.label())
pprint(Counter(nltk_labels))

Counter({'PERSON': 3993,
         'GPE': 1741,
         'ORGANIZATION': 941,
         'FACILITY': 25,
         'LOCATION': 8,
         'GSP': 3})


In [8]:
#spaCy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
print('The num of spaCy tokens is %d' % len(doc))

The num of spaCy tokens is 220898


In [12]:
spacy_labels = [x.label_ for x in doc.ents]
pprint(Counter(spacy_labels))

Counter({'PERSON': 2457,
         'NORP': 1458,
         'GPE': 1422,
         'ORG': 1250,
         'CARDINAL': 1104,
         'DATE': 706,
         'TIME': 443,
         'ORDINAL': 397,
         'FAC': 359,
         'LOC': 210,
         'WORK_OF_ART': 133,
         'PRODUCT': 121,
         'QUANTITY': 100,
         'MONEY': 74,
         'LANGUAGE': 38,
         'LAW': 31,
         'EVENT': 17})


## 2. Examples

In [14]:
nltk_ex = {}
for chunk in chunks:
    if hasattr(chunk, 'label'):
        if chunk.label() not in nltk_ex:
            nltk_ex[chunk.label()] = ' '.join(c[0] for c in chunk)

In [27]:
pd.DataFrame.from_dict(nltk_ex, orient='index',columns=["Example"])

Unnamed: 0,Example
ORGANIZATION,Notre-Dame
PERSON,Garde
GPE,Smyrna
FACILITY,La Canebière
LOCATION,West
GSP,Turkish


In [20]:
spacy_ex = {}
for ent in doc.ents:
    if ent.label_ not in spacy_ex:
        spacy_ex[ent.label_] = ent.text

In [26]:
pd.DataFrame.from_dict(spacy_ex, orient='index', columns=["Example"])

Unnamed: 0,Example
LAW,Chapter 1
DATE,"the 24th of February, 1815"
FAC,Notre-Dame de la Garde
CARDINAL,three
GPE,Pharaon
ORG,Trieste
PERSON,Cape Morgiou
PRODUCT,Marseilles
NORP,
QUANTITY,0023m


## 3. Find the 2 longest sentences in your text and provide visual "trees"

In [65]:
#NLTK longest sentence
from nltk.tokenize import sent_tokenize, word_tokenize
sents = nltk.sent_tokenize(text)

In [80]:
n1 = max(sents, key=len)

In [81]:
displacy.render(nlp(str(n1)), jupyter=True, style='ent')

In [82]:
displacy.render(nlp(str(n1)), style='dep', jupyter = True, options = {'distance': 50})

In [101]:
n2 = sorted(sents, key=len)[-2]
n2

'“Well, excellency,” said the landlord triumphantly, and without waiting for Franz to question him, “I feared yesterday, when I would not promise you anything, that you were too late—there is not a single carriage to be had—that is, for the three last days” “Yes,” returned Franz, “for the very three days it is most needed.” “What is the matter?” said Albert, entering; “no carriage to be had?” “Just so,” returned Franz, “you have guessed it.” “Well, your Eternal City is a nice sort of place.” “That is to say, excellency,” replied Pastrini, who was desirous of keeping up the dignity of the capital of the Christian world in the eyes of his guest, “that there are no carriages to be had from Sunday to Tuesday evening, but from now till Sunday you can have fifty if you please.” “Ah, that is something,” said Albert; “today is Thursday, and who knows what may arrive between this and Sunday?” “Ten or twelve thousand travellers will arrive,” replied Franz, “which will make it still more difficul

In [102]:
displacy.render(nlp(str(n2)), jupyter=True, style='ent')

In [103]:
displacy.render(nlp(str(n2)), style='dep', jupyter = True, options = {'distance': 50})

In [9]:
#spaCy 2 longest sentence example
sentences = [x for x in doc.sents]
s1 = max(sentences, key=len)
s1

Without giving himself time to reconsider his decision, and, indeed, that he might not allow his thoughts to be distracted from his desperate resolution, he bent over the appalling shroud, opened it with the knife which Faria had made, drew the corpse from the sack, and bore it along the tunnel to his own chamber, laid it on his couch, tied around its head the rag he wore at night around his own, covered it with his counterpane, once again kissed the ice-cold brow, and tried vainly to close the resisting eyes, which glared horribly, turned the head towards the wall, so that the jailer might, when he brought the evening meal, believe that he was asleep, as was his frequent custom; entered the tunnel again, drew the bed against the wall, returned to the other cell, took from the hiding-place the needle and thread, flung off his rags, that they might feel only naked flesh beneath the coarse canvas, and getting inside the sack, placed himself in the posture in which the dead body had been 

In [85]:
displacy.render(nlp(str(s1)), jupyter=True, style='ent')

In [97]:
displacy.render(nlp(str(s1)), style='dep', jupyter = True, options = {'distance': 45})

In [91]:
s2 = sorted(sentences, key=len)[-2]
s2

The salon was filled with the works of modern artists; there were landscapes by Dupré, with their long reeds and tall trees, their lowing oxen and marvellous skies; Delacroix’s Arabian cavaliers, with their long white burnouses, their shining belts, their damasked arms, their horses, who tore each other with their teeth while their riders contended fiercely with their maces; aquarelles of Boulanger, representing Notre Dame de Paris with that vigor that makes the artist the rival of the poet; there were paintings by Diaz, who makes his flowers more beautiful than flowers, his suns more brilliant than the sun; designs by Decamp, as vividly colored as those of Salvator Rosa, but more poetic; pastels by Giraud and Müller, representing children like angels and women with the features of a virgin; sketches torn from the album of Dauzats’ “Travels in the East,” that had been made in a few seconds on the saddle of a camel, or beneath the dome of a mosque—in a word, all that modern art can give

In [93]:
displacy.render(nlp(str(s2)), jupyter=True, style='ent')

In [104]:
displacy.render(nlp(str(s2)), style='dep', jupyter = True, options = {'distance': 45})

## 4. Examples of differences for the “same” NLTK and SpaCy NER tags

In [106]:
diff = {}
for chunk in chunks:
    if hasattr(chunk, 'label'):
        for ent in doc.ents:
            if ent.text not in diff and ' '.join(c[0] for c in chunk) == ent.text and chunk.label() != ent.label_:  
                diff[ent.text] = (chunk.label(), ent.label_)

In [108]:
pd.DataFrame.from_dict(diff, orient='index', columns=["nltk", "spacy"])

Unnamed: 0,nltk,spacy
Smyrna,GPE,ORG
Trieste,PERSON,ORG
Château,ORGANIZATION,ORG
Marseilles,ORGANIZATION,PRODUCT
Pharaon,ORGANIZATION,GPE
Pomègue,PERSON,GPE
La Réserve,PERSON,FAC
Dantès,PERSON,FAC
Captain,PERSON,GPE
Naples,PERSON,GPE


## 5. Thoughts on the NER tags in both NLTK and SpaCy 

I think in general, spaCy performs better than NLTK. Not only does spaCy has more entity types, but also it has a built-in displaCy visualizer, which can show a sentences' dependencies nicely. Both nltk and spaCy make a lot of mistakes in NER, for example, nltk correctly decects Dantès as PERSON, while spaCy classifies Dantès as FAC. What's more, nltk thinks Himalaya is ORGANIZATION, and spaCy classifies	it as GPE. As for the sentence tokenizing part, I feel spaCy did a much better job than nltk.

## 6. Sentiment Analysis using TextBlob

In [6]:
import textblob
from textblob import TextBlob

In [17]:
blob = TextBlob(str(s1))
blob.sentiment

Sentiment(polarity=-0.08863636363636364, subjectivity=0.7159090909090909)

In [18]:
print(blob.sentiment_assessments)
print("\n")
pprint(blob.sentiment_assessments[2])

Sentiment(polarity=-0.08863636363636364, subjectivity=0.7159090909090909, assessments=[(['desperate'], -0.6, 1.0, None), (['appalling'], -0.35, 0.9, None), (['own'], 0.6, 1.0, None), (['own'], 0.6, 1.0, None), (['horribly'], -1.0, 1.0, None), (['frequent'], 0.1, 0.3, None), (['other'], -0.125, 0.375, None), (['only'], 0.0, 1.0, None), (['naked'], 0.0, 0.4, None), (['coarse'], 0.0, 0.5, None), (['dead'], -0.2, 0.4, None)])


[(['desperate'], -0.6, 1.0, None),
 (['appalling'], -0.35, 0.9, None),
 (['own'], 0.6, 1.0, None),
 (['own'], 0.6, 1.0, None),
 (['horribly'], -1.0, 1.0, None),
 (['frequent'], 0.1, 0.3, None),
 (['other'], -0.125, 0.375, None),
 (['only'], 0.0, 1.0, None),
 (['naked'], 0.0, 0.4, None),
 (['coarse'], 0.0, 0.5, None),
 (['dead'], -0.2, 0.4, None)]
