# Finding Unusual Words in Given Language

In [1]:
text = """
It’s odd, but in the infrequent occasions when I’ve been called upon in a formal place to play the bongo drums, 
the introducer never seems to find it necessary to mention that I also do theoretical physics. 
I believe that’s probably that we respect the arts more than the sciences. T
he artists of the Renaissance said that man’s main concern should be for man. 
And yet there are some other things of interest in the world: even the artist appreciates sunsets, 
and the ocean waves, and the march of the stars across the heavens. 
And there is some reason, then, to talk of other things sometimes."""

## 1. Tokenizing text

In [2]:
from nltk import word_tokenize
text_tokenized = word_tokenize(text.lower())
text_tokenized

['it',
 '’',
 's',
 'odd',
 ',',
 'but',
 'in',
 'the',
 'infrequent',
 'occasions',
 'when',
 'i',
 '’',
 've',
 'been',
 'called',
 'upon',
 'in',
 'a',
 'formal',
 'place',
 'to',
 'play',
 'the',
 'bongo',
 'drums',
 ',',
 'the',
 'introducer',
 'never',
 'seems',
 'to',
 'find',
 'it',
 'necessary',
 'to',
 'mention',
 'that',
 'i',
 'also',
 'do',
 'theoretical',
 'physics',
 '.',
 'i',
 'believe',
 'that',
 '’',
 's',
 'probably',
 'that',
 'we',
 'respect',
 'the',
 'arts',
 'more',
 'than',
 'the',
 'sciences',
 '.',
 't',
 'he',
 'artists',
 'of',
 'the',
 'renaissance',
 'said',
 'that',
 'man',
 '’',
 's',
 'main',
 'concern',
 'should',
 'be',
 'for',
 'man',
 '.',
 'and',
 'yet',
 'there',
 'are',
 'some',
 'other',
 'things',
 'of',
 'interest',
 'in',
 'the',
 'world',
 ':',
 'even',
 'the',
 'artist',
 'appreciates',
 'sunsets',
 ',',
 'and',
 'the',
 'ocean',
 'waves',
 ',',
 'and',
 'the',
 'march',
 'of',
 'the',
 'stars',
 'across',
 'the',
 'heavens',
 '.',
 'and'

## 2. Importing and exploring the words corpus

In [3]:
from nltk.corpus import words
words.readme().replace('\n', ' ')

'Wordlists  en: English, http://en.wikipedia.org/wiki/Words_(Unix) en-basic: 850 English words: C.K. Ogden in The ABC of Basic English (1932) '

In [4]:
words

<WordListCorpusReader in '/home/t0m/nltk_data/corpora/words'>

In [5]:
words.fileids()

['en', 'en-basic']

In [6]:
words.words('en')[:10]

['A',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'Aani',
 'aardvark',
 'aardwolf',
 'Aaron']

In [7]:
words.words('en-basic')[:10]

['I',
 'a',
 'able',
 'about',
 'account',
 'acid',
 'across',
 'act',
 'addition',
 'adjustment']

In [8]:
len(words.words('en'))

235886

In [9]:
len(words.words('en-basic'))

850

## 3. Finding unusual words

In [10]:
english_vocab = set(w.lower() for w in words.words())
# Note .isalpha() removes punctuation tokens. However, 
# tokens with a hyphen like 'browser-based' are totally skipped over because .isalpha() would be false.
text_vocab = set(w.lower() for w in text_tokenized if w.isalpha()) 
unusual = text_vocab.difference(english_vocab)
unusual

{'appreciates',
 'artists',
 'arts',
 'called',
 'drums',
 'occasions',
 'sciences',
 'seems',
 'stars',
 'sunsets',
 'things',
 've',
 'waves'}