# Lesson 2

### How to install packages in Anaconda

https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-pkgs.html#viewing-a-list-of-installed-packages

In [None]:
conda list

https://www.nltk.org/

To install NLTK, run one of the following:

!pip install nltk

or 

conda install nltk

In [None]:
import nltk
# NOTE(review): called with no arguments, nltk.download() appears to launch NLTK's
# interactive downloader, which would stall "Restart Kernel -> Run All" — confirm,
# or pass an explicit package id instead.
nltk.download()

In [None]:
dir(nltk)

In [None]:
# Fetch the corpora needed before importing `nltk.book`, so the import below also
# works on a fresh environment. These are the same ids the previously disabled
# download list named. nltk.download() reports already-installed packages as up to
# date, so re-running this cell is cheap; quiet=True keeps the output short.
for corpus_id in ('gutenberg', 'genesis', 'inaugural',
                  'nps_chat', 'webtext', 'treebank'):
    nltk.download(corpus_id, quiet=True)

from nltk.book import text1

In [None]:
text1

### Playing with Moby Dick

In [None]:
import nltk

import pandas as pd
import numpy as np

In [None]:
# Read the whole novel into a single string. No encoding is passed to open(), so
# the platform default applies — NOTE(review): confirm moby.txt decodes correctly
# on every machine this notebook is run on.
with open('moby.txt', 'r') as f:
    moby_raw = f.read()

In [None]:
type(moby_raw)

In [None]:
moby_raw[:150]

In [None]:
nltk.download('punkt')

In [None]:
# tokens

moby_tokens = nltk.word_tokenize(moby_raw)

In [None]:
type(moby_tokens)

In [None]:
moby_tokens[:150]

In [None]:
# to work with the novel in nltk.Text format 'text1' variable

text1 = nltk.Text(moby_tokens)

In [None]:
 # How many tokens (words and punctuation symbols) are in text1?

len(text1)

In [None]:
# How many unique tokens (unique words and punctuation) does text1 have?

len(set(text1))

In [None]:
# Unique-token count after lemmatizing every token with part of speech 'v' (verb).

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lambda token: lemmatizer.lemmatize(token, 'v'), text1))

len(set(lemmatized))

In [None]:
# Lexical diversity of the text = ratio of unique tokens to the total number of tokens

len(set(text1))/len(text1)


In [None]:
help(text1.vocab())

In [None]:
text1.vocab()

In [None]:
text1.vocab().items()

In [None]:
# Counting occurrences of a single word: what percentage of all tokens
# is exactly 'whale' or 'Whale'?

val = sum(text1.vocab()[form] for form in ('whale', 'Whale')) / len(text1)

f"{val:.3%}"

In [None]:
import operator

help(operator)

In [None]:
# The 20 most frequently occurring (unique) tokens in the text, with their counts,
# most frequent first. `most_common(n)` yields the same (token, count) pairs as
# sorting all items by count in descending order and slicing the first n, without
# needing the `operator` import from another cell.

text1.vocab().most_common(20)

In [None]:
# Tokens longer than 5 characters that occur more than 150 times, in sorted order.

sorted(
    token
    for token, count in text1.vocab().items()
    if count > 150 and len(token) > 5
)

In [None]:
# Find the longest word in text1 and that word's length.
# A single max() pass replaces sorting the entire vocabulary just to read the
# first element; on ties both approaches pick the first longest token in
# vocabulary order.

longest_token = max(text1.vocab(), key=len)

(longest_token, len(longest_token))

In [None]:
s = "string"
help(s.isalpha)

In [None]:
",".isalpha()

In [None]:
# Purely alphabetic tokens (so no punctuation) that occur more than 2000 times,
# listed as (frequency, token) pairs from most to least frequent.

sorted(
    ((count, word)
     for word, count in text1.vocab().items()
     if word.isalpha() and count > 2000),
    key=lambda pair: pair[0],
    reverse=True,
)

In [None]:
# Mean number of tokens per sentence: split the raw text into sentences,
# tokenize each sentence, and average the token counts.

tokens_per_sentence = [
    len(nltk.word_tokenize(sentence))
    for sentence in nltk.sent_tokenize(moby_raw)
]
np.mean(tokens_per_sentence)

In [None]:
nltk.download('averaged_perceptron_tagger')

In [None]:
# 5 most frequent parts of speech in this text, as (tag, count) pairs.
# Counter.most_common(5) replaces the manual sort-by-count-and-slice.

from collections import Counter
import operator

tag_counts = Counter(tag for _token, tag in nltk.pos_tag(text1))
tag_counts.most_common(5)

In [None]:
### Preprocessing

In [None]:
# Our example: first sentence from the book Pride and Prejudice as the text

text = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife."

In [None]:
# Lowercase

text = text.lower()
print(text)

In [None]:
# Strip every character listed in string.punctuation from the text.

import string
print(string.punctuation)

text_p = text.translate(str.maketrans('', '', string.punctuation))
print(text_p)

In [None]:
# Tokenization

from nltk import word_tokenize

words = word_tokenize(text_p)
print(words)

In [None]:
# Stopword Filtering

## we can use nltk.corpus.stopwords.words('english') to fetch a list of stopwords in the English dictionary

nltk.download('stopwords')
from nltk.corpus import stopwords

## let's see them
stop_words = stopwords.words('english')
print(stop_words)

In [None]:

# Keep only the tokens that are not in the stopword list.
filtered_words = list(filter(lambda word: word not in stop_words, words))
print(filtered_words)

In [None]:
# Stemming

## we stem the tokens using nltk.stem.porter.PorterStemmer to get the stemmed tokens

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

In [None]:

stemmed = [porter.stem(word) for word in filtered_words]
print(stemmed)

In [None]:
# POS (part-of-speech) Tagger

## we can use nltk.pos_tag to retrieve the part of speech of each token in a list
from nltk import pos_tag

In [None]:

pos = pos_tag(filtered_words)
print(pos)

In [None]:
#-# FULL PREPROCESSING
# We can combine all the preprocessing methods above and create a preprocess function that takes in a .txt file and handles all the preprocessing

import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag

def preprocess(filename):
    """Run the full preprocessing pipeline on a text file.

    Steps, in order: read the file, lowercase it, remove every character in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop English
    stopwords, then Porter-stem and POS-tag the remaining tokens.

    Args:
        filename: Path of the text file to read. It is opened in text mode
            with the platform default encoding.

    Returns:
        A 4-tuple ``(words, filtered_words, stemmed, pos)`` where

        * ``words`` is every token of the lowercased, punctuation-free text,
        * ``filtered_words`` is ``words`` without English stopwords,
        * ``stemmed`` is the Porter stem of each entry in ``filtered_words``,
        * ``pos`` is the ``pos_tag`` result for ``filtered_words`` (the tags
          are computed on the unstemmed tokens).
    """
    # Context manager guarantees the file handle is closed, even on error.
    with open(filename, 'r') as f:
        text = f.read().lower()

    # Delete all punctuation characters in one pass.
    text_p = text.translate(str.maketrans('', '', string.punctuation))

    words = word_tokenize(text_p)

    # A set gives constant-time membership tests; the stopword list would
    # otherwise be scanned linearly once per token.
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in filtered_words]

    pos = pos_tag(filtered_words)

    return words, filtered_words, stemmed, pos


In [None]:
words, filtered_words, stemmed, pos = preprocess('pride_and_prejudice.txt')

In [None]:
print('Words:', words[:50])

In [None]:
print('Filtered words:', filtered_words[:50])

In [None]:
print('Stemmed words:', stemmed[:50])

In [None]:
print('Part of Speech:', pos[:50])

### n-grams

In [None]:
from nltk.util import bigrams

myList = [1,2,3,4,5]

list(bigrams(myList))

In [None]:
from nltk.util import trigrams

print(trigrams(myList))

list(trigrams(myList))

In [None]:
from nltk.util import everygrams

sent = 'a b c'.split()

list(everygrams(sent))

### the end