<table align="center">
   <td align="center"><a target="_blank" href="https://colab.research.google.com/github/umbcdata602/spring2021/blob/master/nltk.ipynb">
<img src="http://introtodeeplearning.com/images/colab/colab.png?v2.0"  style="padding-bottom:5px;" />Run in Google Colab</a></td>
</table>

# NLTK

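NLP basics with scikit-learn and NLTK: bag-of-words counts, Porter stemming, stop-word removal, and TF-IDF vectorization of sentences from the Brown corpus.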

# References

* Raschka's [ch08.ipynb](https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/ch08/ch08.ipynb) -- github
* Bonaccorso's [lsa.py](https://github.com/giuseppebonaccorso/Machine-Learning-Algorithms-Second-Edition/blob/master/Chapter14/lsa.py) -- github
* *Machine Learning Algorithms, 2nd Ed*, 2018, Giuseppe Bonaccorso, Packt
* [NLTK](https://www.nltk.org/) -- nltk.org
  * [installing data](https://www.nltk.org/data.html) -- nltk.org
  * [corpora](https://www.nltk.org/book/ch02.html) -- nltk.org
  * [*Natural Language Processing with Python*](https://www.nltk.org/book/) -- NLTK book
* [Brown corpus manual](http://icame.uib.no/brown/bcm.html)
    * [list of samples](http://icame.uib.no/brown/bcm-los.html)

# Bag of words

In [None]:
# Raschka's Cell 9: build a bag-of-words model from three toy documents.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents; the third one repeats several words.
docs = np.array(
    [
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two',
    ]
)

# Learn the vocabulary and compute the per-document term counts in one step.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [None]:
# Cell 10
# Display the vocabulary learned by the vectorizer: a dict mapping each
# term to its integer column index in the count matrix.
count.vocabulary_

{'and': 0,
 'is': 1,
 'one': 2,
 'shining': 3,
 'sun': 4,
 'sweet': 5,
 'the': 6,
 'two': 7,
 'weather': 8}

In [None]:
# Cell 11
# Display the counts as a dense array: one row per document, one column
# per vocabulary term (columns follow the indices in count.vocabulary_).
bag.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [2, 3, 2, 1, 1, 1, 2, 1, 1]])

# Stemming


In [None]:
# Raschka's Cell 22
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()


def tokenizer_porter(text):
    """Split `text` on whitespace and return the Porter stem of each token."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems


# Expected result: ['runner', 'like', 'run', 'and', 'thu', 'they', 'run']
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

# Stop words

In [None]:
# Raschka's cells 25 & 26
import nltk
from nltk.corpus import stopwords

# Fetch the stop-word lists, then load the English one.
nltk.download('stopwords')
stop = stopwords.words('english')

# Stem the sentence, then keep only the stems that are not stop words.
# Expected result: ['runner', 'like', 'run', 'run', 'lot']
[
    stem
    for stem in tokenizer_porter('a runner likes running and runs a lot')[-10:]
    if stem not in stop
]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'lot']

In [None]:
# Download the Brown corpus data.
# NOTE: this cell does not install NLTK itself; it relies on the
# `import nltk` executed in the stop-words cell.
from nltk.corpus import brown

nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [None]:
# Compose a corpus and build its TF-IDF document-term matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# First 500 sentences of the Brown "news" category; each sentence is a list of tokens.
sentences = brown.sents(categories=['news'])[0:500]

# Re-join every tokenized sentence into a single string, because the
# vectorizer is fed raw text documents.
corpus = [' '.join(s) for s in sentences]

# Vectorize the corpus
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', sublinear_tf=True, use_idf=True)
# Use .toarray() to get a plain ndarray: .todense() returns the legacy
# numpy.matrix type, which NumPy discourages and which recent scikit-learn
# versions refuse as estimator input.
Xc = vectorizer.fit_transform(corpus).toarray()

# Report the vocabulary size rather than dumping the whole term -> index dict,
# which has thousands of entries and buries the rest of the output.
print('len(vectorizer.vocabulary_):', len(vectorizer.vocabulary_))
print('sentences[0]:', sentences[0])
print('len(sentences[0]):', len(sentences[0]))
print('Xc.shape:', Xc.shape)

sentences[0]: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
len(sentences[0]: 25
Xc.shape: (500, 2404)
