In [20]:
import nltk
import random
from nltk.corpus import movie_reviews

## the data structure

In [21]:
print(movie_reviews.fileids()[:10])

['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


In [22]:
print (len(movie_reviews.fileids()))

2000


In [24]:
print(len(movie_reviews.raw('neg/cv000_29416.txt')))

4043


In [26]:
print(movie_reviews.raw('neg/cv000_29416.txt')[:100])

plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 



In [27]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
 
stop_words = stopwords.words('english') + list(punctuation)
 
def tokenize(text):
    words = word_tokenize(text)
    words = [w.lower() for w in words]
    return [w for w in words if w not in stop_words and not w.isdigit()]

In [28]:
tokenize("today is Friday. The weather is sunny.") # will remove the stop words and punctuations

['today', 'friday', 'weather', 'sunny']

In [29]:
# build the vocabulary in one pass
vocabulary = set()
for file_id in movie_reviews.fileids():
    words = tokenize(movie_reviews.raw(file_id))
    vocabulary.update(words)
    
vocabulary = list(vocabulary)

In [31]:
print(len(vocabulary))

46215


In [32]:
print(vocabulary[:10])

['creepy', 'unjustly', 'belly-button', '3-year', 'amplified', 'fringe', 'elfont', 'impressive-as-ever', 'argentinian', 'firebird']


## calculate the tf-idf

In [50]:
word_index = {w: idx for idx, w in enumerate(vocabulary)}
print(word_index['creepy'])
print(word_index['unjustly'])
print(word_index['belly-button'])

0
1
2


In [51]:
from collections import defaultdict
word_idf = defaultdict(lambda: 0)

In [52]:
for file_id in movie_reviews.fileids():
    words = set(tokenize(movie_reviews.raw(file_id)))
    for word in words:
        word_idf[word] += 1
word_idf["creepy"] #creepy has appears 147 times

77

In [53]:
import math
VOCABULARY_SIZE = len(vocabulary)
DOCUMENTS_COUNT = len(movie_reviews.fileids())
 
print(VOCABULARY_SIZE, DOCUMENTS_COUNT)    # 46215 2000

for word in vocabulary:
    word_idf[word] = math.log(DOCUMENTS_COUNT / float(1 + word_idf[word]))
 
print (word_idf['movie'])    # 8.095775128229402
print (word_idf['creepy'])   # 6.318944087481682
 

46215 2000
0.2600669054188076
3.2441936328524905


In [None]:
def word_tf(word, document):
    if isinstance(document, str):
        document = tokenize(document) 
    return float(document.count(word)) / len(document)
 
def tf_idf(word, document):
    # If not tokenized
    if isinstance(document, str):
        document = tokenize(document) 
    if word not in word_index:
        return .0 
    return word_tf(word, document) * word_idf[word]

print(tf_idf("accident", movie_reviews.raw('neg/cv000_29416.txt')))

## get tf-idf by sklearn

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
 
tfidf = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize, vocabulary=vocabulary)
 
# Fit the TfIdf model
tfidf.fit([movie_reviews.raw(file_id) for file_id in movie_reviews.fileids()])
 
# Transform a document into TfIdf coordinates
X = tfidf.transform([movie_reviews.raw('neg/cv000_29416.txt')])

0.0495250118674


In [None]:
# Check out some frequencies
print (X[0, tfidf.vocabulary_['accident']])

## preprocessing data

In [104]:
doc, label = [], []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        doc.append(movie_reviews.raw(fileid))
        label.append(category)

In [105]:
for i in range(10):
    print(doc[i][:10], label[i])

plot : two neg
the happy  neg
it is movi neg
 " quest f neg
synopsis : neg
capsule :  neg
so ask you neg
that's exa neg
call it a  neg
plot : a y neg


In [106]:
#split data to training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(doc, label, test_size=0.25, random_state=42)

## build model and test

In [107]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1500, 35393)

In [119]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [109]:
docs_new = ['good excellent famous', 'very bad poor creepy ']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, category))

'good excellent famous' => pos
'very bad poor creepy ' => neg


In [118]:
print(len(X_test))
print(type(X_test[0]))

500
<class 'str'>


In [None]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [6]:
print(type(documents[0]))
print(documents[1])
print(len(documents))

<class 'tuple'>
(['no', 'film', 'in', 'recent', 'has', 'left', 'me', 'with', 'such', 'conflicted', 'feelings', 'as', 'neil', 'jordan', "'", 's', 'harrowing', ',', 'humorous', ',', 'horrifying', 'adaptation', 'of', 'patrick', 'mccabe', "'", 's', 'novel', 'about', 'young', 'lad', 'francie', 'brady', "'", 's', '(', 'eamonn', 'owens', ')', 'descent', 'into', 'madness', 'in', '1960s', 'ireland', '.', 'on', 'one', 'hand', ',', 'it', 'was', 'difficult', 'for', 'me', 'to', 'become', 'invested', 'in', 'francie', "'", 's', 'story', 'because', 'he', 'is', 'such', 'an', 'unsavory', 'character', ',', 'unjustifyably', 'venting', 'his', 'rage', 'at', 'his', 'nosy', 'but', 'otherwise', 'harmless', 'neighbor', 'mrs', '.', 'nugent', '(', 'fiona', 'shaw', ')', '.', 'on', 'another', 'hand', ',', 'i', 'found', 'it', 'difficult', 'to', 'laugh', 'at', 'some', 'of', 'francie', "'", 's', 'darkly', 'comic', 'shenanigans', 'because', 'he', 'obviously', 'is', 'such', 'a', 'sick', ',', 'needy', 'child', ',', 'havi

In [7]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15)) #the most frequent words and their appearing times
print(all_words["stupid"]) #the number of the word "stupid" appears

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
253
