In [1]:
test = "In the ninja world, those who break the rules are trash. That's true, but those who abandon their friends are worse than trash."

In [2]:
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize   # sent_tokenize to tokenize sentences 


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('state_union') 
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('movie_reviews')

In [3]:
words = word_tokenize(test.lower())
words

['in',
 'the',
 'ninja',
 'world',
 ',',
 'those',
 'who',
 'break',
 'the',
 'rules',
 'are',
 'trash',
 '.',
 'that',
 "'s",
 'true',
 ',',
 'but',
 'those',
 'who',
 'abandon',
 'their',
 'friends',
 'are',
 'worse',
 'than',
 'trash',
 '.']

In [4]:
from nltk.corpus import stopwords

In [5]:
stop = stopwords.words('english')

In [6]:
# stop

In [7]:
import string
punctuations = list(string.punctuation)

In [8]:
stop = stop + punctuations

In [9]:
clean_words = [w for w in words if not w in stop]

In [10]:
clean_words


['ninja',
 'world',
 'break',
 'rules',
 'trash',
 "'s",
 'true',
 'abandon',
 'friends',
 'worse',
 'trash']

### Lecture 3 Stemming

In [11]:
from nltk.stem import PorterStemmer

In [12]:
stem_words = ["play", "playing", "player", "played"]
ps = PorterStemmer()
stemmed_words = [ps.stem(w) for w in stem_words]
stemmed_words

['play', 'play', 'player', 'play']

### Lecture 4 Part of Speech

In [13]:
from nltk import pos_tag
from nltk.corpus import state_union

In [14]:
text = state_union.raw('2006-GWBush.txt')

In [15]:
# text

In [16]:
pos = pos_tag(word_tokenize(text))

In [17]:
# pos

### Lecture 5 Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer

In [19]:
lemmatizer = WordNetLemmatizer()

In [20]:
lemmatizer.lemmatize("better", pos = "a")

'good'

In [21]:
lemmatizer.lemmatize("excellent", pos = "n")

'excellent'

In [22]:
lemmatizer.lemmatize("painting", pos = "v")

'paint'

In [23]:
lemmatizer.lemmatize("painting", pos = "n")

'painting'

In [24]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Movie Reviews Project

In [25]:
from nltk.corpus import movie_reviews

In [26]:
movie_reviews.categories()

['neg', 'pos']

In [27]:
# movie_reviews.fileids()

In [28]:
# movie_reviews.fileids('neg')

In [29]:
movie_reviews.words(movie_reviews.fileids()[6])

['so', 'ask', 'yourself', 'what', '"', '8mm', '"', '(', ...]

### Movie Review Cleaning

In [30]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
    
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [31]:
len(documents)

2000

In [32]:
import random
random.shuffle(documents)
documents[0:5]

[(['say', ',', 'tell', 'me', 'if', 'you', "'", 've', ...], 'neg'),
 (['notting', 'hill', "'", 's', 'trailer', 'is', 'awful', ...], 'pos'),
 (['when', 'i', 'first', 'heard', 'about', 'scream', ...], 'pos'),
 (['phew', ',', 'what', 'a', 'mess', '!', 'for', 'his', ...], 'neg'),
 (['the', 'small', '-', 'scale', 'film', ',', 'in', ...], 'pos')]

In [33]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [34]:
from nltk.corpus import wordnet

In [35]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w]) #pos tag expects an array, so array is passed
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [36]:
documents = [(clean_review(document), category) for document, category in documents]

In [37]:
documents[0]

(['say',
  'tell',
  'see',
  'crisis',
  'board',
  'commercial',
  'airliner',
  'cause',
  'stewardess',
  'fly',
  'land',
  'plane',
  'airport',
  '97',
  'anyone',
  'ray',
  'liotta',
  'psychotic',
  'serial',
  'killer',
  'transport',
  'new',
  'york',
  'california',
  'christmas',
  'eve',
  'amazingly',
  'would',
  'seemingly',
  'busy',
  'day',
  'travel',
  'one',
  'flown',
  'route',
  'six',
  'passenger',
  'flight',
  'anyway',
  'take',
  'liotta',
  'escape',
  'kill',
  'police',
  'pilot',
  'stewardess',
  'lauren',
  'holly',
  'lock',
  'cockpit',
  'fly',
  'plane',
  'story',
  'beyond',
  'routine',
  'script',
  'embarrass',
  'one',
  'point',
  'jumbo',
  'jet',
  'fly',
  'completely',
  'upside',
  'character',
  'worthless',
  'performance',
  'annoy',
  'surprisingly',
  'co',
  'writer',
  'steven',
  'e',
  'de',
  'souza',
  'actually',
  'write',
  'first',
  'two',
  'die',
  'hard',
  'movie',
  'turbulence',
  'take',
  'place',
  'christ