# DIGS 20006/30006 : NLP Assignment 2

In [1]:
import nltk
from nltk.book import *
from nltk.corpus import words
from nltk.corpus import wordnet 

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
import matplotlib.pyplot as plt
%matplotlib inline  

In [3]:
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)

def unusual_words_many(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    manywords = nltk.corpus.words.words() + list(nltk.corpus.wordnet.words())
    english_vocab = set(w.lower() for w in manywords)
    unusual = text_vocab - english_vocab
    return sorted(unusual)

def unusual_words_many_no_endings_s_ed_ing(text):
    ut1 = unusual_words(text)
    print(str(len(ut1))+" words not in words.words\n")
    ut2 = unusual_words_many(text)
    print(str(len(ut2))+" words not in words.words + wordnet.words\n")
    ut3 = sorted(set(w.lower() for w in ut2 if not w.endswith("s") and not w.endswith("ed") and not w.endswith("ing")))
    print(str(len(ut3))+" words not in words.words + wordnet.words + no -s, -ed and -ing :\n")
    print(ut3)
    # -en plurals, -er and -est still not uncommon

## Q1

I have chosen 'The Count of Monte Cristo' By Alexandre Dumas and cause it is my favourite western novel. I only limited pre-processing (taking out the contents and the footnotes) since the text I downloaded from gutenberg.org was pretty clean.

In [4]:
with open("MC.txt") as f:
    content = f.read().splitlines()
    #content = list(filter(None, content))
    raw = " ".join(content)
    text = [word.lower() for word in raw.split() ]
    text = [word.rstrip() for word in text]

In [5]:
text[:10]

['chapter',
 '1.',
 'marseilles—the',
 'arrival',
 'on',
 'the',
 '24th',
 'of',
 'february,',
 '1815,']

In [6]:
len(text)

460025

## Q2

In [7]:
print("The raw length of my text is %d" % len(text))

The raw length of my text is 460025


## Q3

In [8]:
print("The number of tokens in my text is %d" % len(set(text)))

The number of tokens in my text is 38076


## Q4 

In [9]:
def lex_score(t):
    return len(set(t))/len(t)

print("The lexical diversity score of my text is %f" % lex_score(text))

The lexical diversity score of my text is 0.082769


The lexical diversity score of text2 'Sense and Sensibility' is 0.048263, while the lexical diversity score of 'The Count of Monte Cristo' is 0.082769. It appears that Alexandre Dumas' use of tokens is almost twice as rich as Jane Austen. My speculation is that those two authors are from different countries and have different genders. Also, 'The Count of Monte Cristo' is originally written in French, my text used the English version of it. I am not sure whether the translation affects the lexical diversity.

## Q5 Remove all punctuation 

In [10]:
import re
clean =[]
for s in text:
    s = re.sub(r'[^\w\s\d]',' ',s) 
    s = s.rstrip()
    s = s.lstrip()
    clean.append(s)

In [11]:
clean[:10]

['chapter',
 '1',
 'marseilles the',
 'arrival',
 'on',
 'the',
 '24th',
 'of',
 'february',
 '1815']

In [12]:
print("The raw length of my text without punctuation is %d" % len(clean))
print("The number of tokens in my text without punctuation is %d" % len(set(clean)))
print("The lexical diversity score of my text without punctuation is %f" % lex_score(clean))

The raw length of my text without punctuation is 460025
The number of tokens in my text without punctuation is 19391
The lexical diversity score of my text without punctuation is 0.042152


## Q6 Remove stopwords

In [13]:
stopwords = nltk.corpus.stopwords.words('english')
nonstop = [word for word in clean if word not in stopwords]

In [14]:
print("The raw length of my text without stopwords is %d" % len(nonstop))
print("The number of tokens in my text without stopwords is %d" % len(set(nonstop)))
print("The lexical diversity score of my text without stopwords is %f" % lex_score(nonstop))

The raw length of my text without stopwords is 220468
The number of tokens in my text without stopwords is 19258
The lexical diversity score of my text without stopwords is 0.087351


## Q7 Comparisons

Removing all the punctuation and stopwords decreased the length of the text and the number of tokens in the text greatly. What I felt interesting is after removing punctuation, the lexical diversity score decreased from 0.0827 to 0.065; However, the lexical diversity score increased to 0.1221 after removing stopwords. The stopwords count greatly as the raw length but not unique tokens.

## Q8 Words with lengths greater than 12 letters

In [15]:
LongWords = [word.rstrip() for word in nonstop if len(word.rstrip())>12]
LongWords = set(LongWords)

In [16]:
LongWords

{'dressing room i',
 'fortune first rate',
 'fontenay aux roses',
 'infallibility',
 'boat  two  three',
 'inconvenienced',
 'hassen ben sabah',
 'parliamentary',
 'benefactor the',
 'condescending',
 'pocket handkerchief',
 'advantage  that',
 'everyone many',
 'well informed',
 'appearance we',
 'calculating by',
 'manufacturing',
 'villefort see',
 'chimney piece',
 'congestion nothing',
 'death because',
 'lemonade that s',
 'half official',
 'stiff  having',
 'excellency  said',
 'mother in law s',
 'evinced  none',
 'wedding present',
 'julie emmanuel',
 'double barreled',
 'philanthropist',
 'majestic that',
 'unconsciousness',
 'superstitious',
 'valentine  permit',
 'notwithstanding',
 'mourning coaches',
 'alphabets one',
 'corridors instead',
 'countess nothing',
 'occurred what',
 'unaccountable',
 'shepherdesses',
 'floor polisher',
 'cypress trees',
 'whithersoever',
 'millionaire his',
 'd avrigny  her',
 'grief stricken',
 'half supporting',
 'all   giovanni',
 'inconce

## Q9 Unusual Words

In [17]:
unusual = unusual_words_many_no_endings_s_ed_ing(nonstop)

5328 words not in words.words

4373 words not in words.words + wordnet.words

833 words not in words.words + wordnet.words + no -s, -ed and -ing :

['abbé', 'abruzzo', 'abélard', 'académie', 'actæon', 'adelmonte', 'admirari', 'affettatore', 'aguado', 'alatri', 'albanese', 'albano', 'aleria', 'alicante', 'alla', 'allée', 'almo', 'amasine', 'amwaiter', 'amélie', 'anagni', 'anglo', 'anguille', 'anhelitu', 'anse', 'antoine', 'appert', 'appian', 'aquapendente', 'aquilo', 'aquit', 'archæology', 'arcola', 'ariosto', 'arlesian', 'arlesienne', 'arma', 'arstein', 'assafœtida', 'assunta', 'athalie', 'auguste', 'auteuil', 'avi', 'avrà', 'azzo', 'babuino', 'bajocco', 'baldi', 'banquo', 'baptiste', 'baptistin', 'barbare', 'barberi', 'baronne', 'barrière', 'bartolomeo', 'bartoloni', 'bassora', 'bastia', 'beaucaire', 'beauchamp', 'beaurepaire', 'beauveau', 'beauvoisin', 'became', 'bedeau', 'befallen', 'befell', 'began', 'bellegarde', 'bellevue', 'benedetto', 'bentivoglio', 'beppo', 'bertuccio', 'betak

There are lots of French words are listed. It makes sense since this novel is originally written in French and I am using an English version of it to do the analysis. Some comparalatives and superlatives also appeared, like "wider, wildest, wisest"

## Q10 Most Common Words

In [18]:
print(FreqDist(nonstop).most_common(50))

[('said', 3474), ('one', 1447), ('count', 1354), ('man', 1299), ('would', 1205), ('monte', 1131), ('cristo', 1091), ('de', 1028), ('villefort', 988), ('yes', 942), ('well', 939), ('danglars', 906), ('know', 834), ('young', 793), ('two', 766), ('madame', 760), ('morrel', 747), ('replied', 738), ('albert', 729), ('see', 727), ('dantès', 723), ('time', 719), ('like', 685), ('could', 639), ('shall', 603), ('made', 592), ('sir', 589), ('say', 586), ('valentine', 585), ('must', 575), ('father', 570), ('oh', 569), ('franz', 551), ('us', 539), ('come', 533), ('first', 531), ('door', 522), ('may', 514), ('upon', 508), ('eyes', 506), ('hand', 504), ('without', 495), ('house', 494), ('tell', 465), ('asked', 462), ('much', 461), ('day', 458), ('never', 431), ('old', 429), ('still', 422)]


The results are not very surprising. Monte Cristo (Dantès) appeared very frequently. The other characters' names Danglars, Villefort, Valentine and Morrel appeared quite frequently. "never" and "still" indicated the Monte Cristo's determination to revenge.

## Q11 Compare to Jane Austen

In [19]:
nonstop2 = [word for word in text2 if word.isalnum() and (word not in stopwords)]
print(FreqDist(nonstop2).most_common(50))

[('I', 2004), ('Elinor', 684), ('could', 568), ('Marianne', 566), ('Mrs', 530), ('would', 507), ('said', 397), ('every', 361), ('one', 304), ('But', 289), ('much', 287), ('sister', 282), ('must', 279), ('Edward', 262), ('mother', 258), ('She', 258), ('Dashwood', 252), ('The', 243), ('time', 237), ('know', 230), ('Jennings', 230), ('might', 215), ('Willoughby', 215), ('think', 209), ('Miss', 208), ('though', 204), ('He', 203), ('well', 191), ('It', 189), ('thing', 185), ('Lucy', 185), ('never', 184), ('soon', 180), ('Mr', 178), ('see', 173), ('Colonel', 173), ('without', 171), ('nothing', 170), ('ever', 169), ('may', 169), ('good', 166), ('John', 163), ('first', 160), ('say', 160), ('house', 159), ('little', 158), ('day', 150), ('great', 149), ('two', 145), ('however', 145)]


I haven't read 'Sense and Sensibility' by Jane Austen, but from the title and the most common word of the book. My guess is that this book is a romance novel, it may has a happy ending. While 'The Count of Monte Cristo' has a more gloomy theme. And I felt if I run machine learning on this two text, the authors' gender can be predicted correctly.