# Moby Dick NLP

- Read in text and clean it
- Make a dictionary of words that are used in the text

## Reading in and cleaning

In [39]:
filename = "C:/Users/xsoni/Desktop/PDA/MobyDick.txt"
# Read the whole book at once; the context manager closes the file handle
# as soon as the read is done.
with open(filename, 'r', encoding='utf-8') as book_file:
    raw_text = book_file.read()

# The raw text contains '\n' wherever the original text has a line break.
# Replace each one with a space so words on adjacent lines stay separated.
raw_text = raw_text.replace('\n', ' ')

# Open question: do we want to keep '-'? E.g. should we have 'whale-ship' or
# separate it into 'whale' and 'ship'? For now the parts are separated.
text = raw_text.replace('-', ' ')

# We do not distinguish between upper and lower case, so make the text
# entirely lower-case.
text_lower = text.lower()

# Keep only letters, digits, space and a few symbols for simplicity.
# The space has to be in the set: without it every word boundary is filtered
# out and a whitespace split of the result yields a single giant token.
symbols_to_keep = set('abcdefghijklmnopqrstuvwxyz’,.1234567890" ')
# NOTE(review): any other separator (e.g. a long dash between two words) is
# dropped here without leaving a space, which glues the neighbouring words
# together - confirm against the raw text.
text_cleaned = ''.join(filter(symbols_to_keep.__contains__, text_lower))

## Making a dictionary of words

In [45]:
# Unique whitespace-separated tokens of the cleaned text.
words = set(text_cleaned.split())
print(len(words), " distinct words")
# The same tokens as an alphabetically ordered list.
words_list = sorted(words)



18642  distinct words


['a',
 'aback',
 'abaft',
 'abandon',
 'abandoned',
 'abandonedly',
 'abandonment',
 'abased',
 'abasement',
 'abashed',
 'abate',
 'abated',
 'abatement',
 'abating',
 'abbreviate',
 'abbreviation',
 'abeam',
 'abed',
 'abednego',
 'abel',
 'abhorred',
 'abhorrence',
 'abhorrent',
 'abhorring',
 'abide',
 'abided',
 'abiding',
 'ability',
 'abjectly',
 'abjectus',
 'able',
 'ablutions',
 'aboard',
 'abode',
 'abominable',
 'abominate',
 'abominated',
 'abomination',
 'aboriginal',
 'aboriginally',
 'aboriginalness',
 'abortion',
 'abortions',
 'abound',
 'abounded',
 'abounding',
 'aboundingly',
 'about',
 'abouthowever',
 'about’',
 'above',
 'abraham',
 'abreast',
 'abridged',
 'abroad',
 'abruptly',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absorbed',
 'absorbing',
 'absorbingly',
 'abstained',
 'abstemious',
 'abstinence',
 'abstract',
 'abstracted',
 'abstraction',
 'absurd',
 'absurdly',
 'abundance',
 'abundant',
 'abundantly',
 'academy',
 'accelerate',
 'accelerate

### Problems to fix:
- Occurrences of words that are concatenated, e.g. 'lastthe' and 'occasionsif'
- Occurrences of symbols at the end or possibly the beginning of words, e.g. 'swam'' and 'him’'

### Things to consider:
- Should we really distinguish between 'snort' and 'snorts'? 
- Should one model understand words and their relation and another correct the grammar?

In [51]:
## Index the words of the text by their first character
# Each key is a single character; its value is the list of words starting
# with that character, in the same order as they appear in words_list.
dictionary = {}
for word in words_list:
    # setdefault creates the empty bucket the first time a character is seen.
    dictionary.setdefault(word[0], []).append(word)

dictionary['z']

['zag',
 'zay',
 'zeal',
 'zealand',
 'zealanders',
 'zephyr',
 'zeuglodon',
 'zig',
 'zodiac',
 'zogranda',
 'zone',
 'zoned',
 'zones',
 'zoology',
 'zoroaster']