In [None]:
import pytorch

# Traditional NLP

- Part of AI, intersection of computer science, artificial intelligence, and linguistics
- Concerns with the interactions between computer and natural language (human language)
- Understand and process human languages
- Deal with unstructured textual data: text, video, images

## Tasks
- Text classification
- Sentiment analysis
- Named entity recognition
- Machine translate
- Text summary

## NLP Operations
- Tokenization: splitting text into individual words or sentence
- Stopword removal: removing common words that do not contain significant meaning
- Stemming and lemmatization: reducing words to their base or root forms
- Part-of-speech (PoS) tagging: assigning parts of speech to each word (noun, verb, etc.)
- Named Entity Recognition (NER): identify adn classify named entities (name, dates)
- Bag-of-words (BoW): representing text as a collection of words frequencies
- TF-IDF (term frequency-inverse document frequency): a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents

### TF-IDF
- Term frequency: how important is the word in the document?
    - Term frequency is the number of occurances of a given word in document
- Inverse document frequency: how important is that term in the whole corpus?
    - log(number of documents in the corpus/number of documents that include that word)

# NLP Libraries in Python
- NLTK (Natural Language Toolkit): a comprehensive library for various NLP tasks
- spaCy: an industrial-strength NLP library with efficient and easy-to-use features
- TextBlob: a simple library for processing textual data
- Scikit-learn: text classification and other NLP tasks
- Transformers (by Hugging Face): state-of-the-art NLP with transformer models

# NLTK Exercise

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger') # pretrained parts of speech for English
nltk.download('punkt') # pretrained english tokenizer
nltk.download('stopwords') # list of 179 English stopwords
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/leotyler/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /Users/leotyler/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leotyler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leotyler/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# tokenization -- paragraph to sentences:
from nltk.tokenize import sent_tokenize
text = "Hello everyone, this lecture is about natural language processing. It is week 6 already"
results = sent_tokenize(text)
print(results)

['Hello everyone, this lecture is about natural language processing.', 'It is week 6 already']


In [5]:
len(results)

2

In [6]:
type(results)

list

In [7]:
results = word_tokenize(text)
print(results)

['Hello', 'everyone', ',', 'this', 'lecture', 'is', 'about', 'natural', 'language', 'processing', '.', 'It', 'is', 'week', '6', 'already']


In [8]:
# Porter stemmer

from nltk.stem import PorterStemmer
ps = PorterStemmer()
words = results
for w in words:
    print(w, ":", ps.stem(w))

Hello : hello
everyone : everyon
, : ,
this : thi
lecture : lectur
is : is
about : about
natural : natur
language : languag
processing : process
. : .
It : it
is : is
week : week
6 : 6
already : alreadi


In [None]:
# Lancaster Stemmer and Snowball Stemmer

In [9]:
# Wordnet Lemmatization
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
words = word_tokenize(text)
for w in words:
    print(w, ":", lemma.lemmatize(w))

Hello : Hello
everyone : everyone
, : ,
this : this
lecture : lecture
is : is
about : about
natural : natural
language : language
processing : processing
. : .
It : It
is : is
week : week
6 : 6
already : already


In [10]:
# Removing stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(text)

filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
print(word_tokens)
print(filtered_sentence)

['Hello', 'everyone', ',', 'this', 'lecture', 'is', 'about', 'natural', 'language', 'processing', '.', 'It', 'is', 'week', '6', 'already']
['Hello', 'everyone', ',', 'lecture', 'natural', 'language', 'processing', '.', 'It', 'week', '6', 'already']


In [12]:
# Part of speech tagging
# Refer to NLTK PoS tags for clarification
pos_tags = pos_tag(filtered_sentence)

print('Original text: ')
print(filtered_sentence)
print('\n PoS tagging results: ')

for word, pos_tag in pos_tags:
    print(f"{word} : {pos_tag}")

Original text: 
['Hello', 'everyone', ',', 'lecture', 'natural', 'language', 'processing', '.', 'It', 'week', '6', 'already']

 PoS tagging results: 
Hello : NNP
everyone : NN
, : ,
lecture : JJ
natural : JJ
language : NN
processing : NN
. : .
It : PRP
week : NN
6 : CD
already : RB


In [14]:
# Named entity recognition

nltk.download('maxent_ne_chunker')
nltk.download('words')
text = "Hello all, Jameson is giving a NLP lecture today. We will meet in Ford 201 this Friday evening from 6-10."
token = word_tokenize(text)
postag = nltk.pos_tag(token)
ner = nltk.ne_chunk(postag, binary = False)
print(ner)

(S
  (GPE Hello/NNP)
  all/DT
  ,/,
  (PERSON Jameson/NNP)
  is/VBZ
  giving/VBG
  a/DT
  (ORGANIZATION NLP/NNP)
  lecture/NN
  today/NN
  ./.
  We/PRP
  will/MD
  meet/VB
  in/IN
  Ford/NNP
  201/CD
  this/DT
  Friday/NNP
  evening/NN
  from/IN
  6-10/JJ
  ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/leotyler/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/leotyler/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [19]:
# Sentiment Analysis
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
s2 = "That's disgusting"
print('polarity score for s2: ')
sia.polarity_scores(s2)


polarity score for s2: 


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/leotyler/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


{'neg': 0.773, 'neu': 0.227, 'pos': 0.0, 'compound': -0.5267}