### Loading and Cleaning Reviews

In [3]:
from nltk.corpus import stopwords
import string

# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read. The file is opened in text mode
            with the platform's default encoding.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # the context manager closes the handle even if read() raises
    with open(filename, 'r') as file:
        return file.read()

# turn a doc into clean tokens
def clean_doc(doc):
    """Split raw review text into a list of cleaned word tokens.

    The text is split on whitespace and every character in
    ``string.punctuation`` is removed from each token. A token is then kept
    only if it is purely alphabetic, is not in NLTK's English stop-word list,
    and is longer than one character. No case folding is applied.

    Stop words are matched after punctuation has been stripped, so a
    contraction is compared without its apostrophe.

    Args:
        doc: The raw document text.

    Returns:
        A list of the surviving tokens, in document order.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    cleaned = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        # drop numbers, mixed tokens and tokens emptied by punctuation removal
        if not word.isalpha():
            continue
        if word in stop_words:
            continue
        # single letters carry no signal
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned

# Try the cleaning pipeline on a single positive review:
# load the raw text, clean it into tokens, and print the resulting token list.
filename = 'txt_sentoken/pos/cv000_29590.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

## Define a Vocabulary

In [4]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: Path of the file to read. The file is opened in text mode
			with the platform's default encoding.

	Returns:
		The full text of the file.

	Raises:
		OSError: If the file cannot be opened or read.
	"""
	# the context manager closes the handle even if read() raises
	with open(filename, 'r') as file:
		return file.read()

# turn a doc into clean tokens
def clean_doc(doc):
	"""Split raw review text into a list of cleaned word tokens.

	The text is split on whitespace and every character in ``punctuation``
	is removed from each token. A token is then kept only if it is purely
	alphabetic, is not in NLTK's English stop-word list, and is longer than
	one character. No case folding is applied.

	Stop words are matched after punctuation has been stripped, so a
	contraction is compared without its apostrophe.

	Args:
		doc: The raw document text.

	Returns:
		A list of the surviving tokens, in document order.
	"""
	strip_punct = str.maketrans('', '', punctuation)
	stop_words = set(stopwords.words('english'))
	cleaned = []
	for raw in doc.split():
		word = raw.translate(strip_punct)
		# drop numbers, mixed tokens and tokens emptied by punctuation removal
		if not word.isalpha():
			continue
		if word in stop_words:
			continue
		# single letters carry no signal
		if len(word) <= 1:
			continue
		cleaned.append(word)
	return cleaned

# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	"""Load one document, clean it, and add its tokens to ``vocab``.

	Args:
		filename: Path of the document to read.
		vocab: Object whose ``update`` method is called with the document's
			cleaned token list; it is modified in place.
	"""
	# read -> clean -> count, in a single pipeline
	vocab.update(clean_doc(load_doc(filename)))

# load all docs in a directory
def process_docs(directory, vocab, is_trian):
	"""Add every document of one data split in ``directory`` to ``vocab``.

	Files whose names start with 'cv9' form the test split; all other files
	form the training split.

	Args:
		directory: Folder containing the review files.
		vocab: Vocabulary object passed on to ``add_doc_to_vocab``.
		is_trian: If truthy, process only the training files; otherwise
			process only the 'cv9*' test files.
	"""
	for filename in listdir(directory):
		held_out = filename.startswith('cv9')
		# training runs keep everything except 'cv9*'; test runs keep only 'cv9*'
		if is_trian:
			wanted = not held_out
		else:
			wanted = held_out
		if wanted:
			add_doc_to_vocab(directory + '/' + filename, vocab)

# Build the vocabulary as a token -> frequency Counter over the training
# split of both classes (is_trian=True skips files starting with 'cv9').
vocab = Counter()
for review_dir in ('txt_sentoken/neg', 'txt_sentoken/pos'):
	process_docs(review_dir, vocab, True)
# report the number of distinct tokens, then the 50 most frequent ones
print(len(vocab))
print(vocab.most_common(50))

44276
[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('could', 1248), ('bad', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


## Setting the Minimum Occurrences

In [5]:
# Keep only the tokens that occur at least `min_occurane` times in the vocab counts.
min_occurane = 2
tokens = [word for word, count in vocab.items() if count >= min_occurane]
# size of the reduced vocabulary
print(len(tokens))

25767


Setting the minimum occurrence count to 2 shrinks the vocabulary from 44,276 to 25,767 tokens, a reduction of a little over 40%.

#### Save the Vocab to a file

In [7]:
# save list to file
def save_list(lines, filename):
	"""Write a sequence of strings to a text file, one item per line.

	The items are joined with newline characters; no trailing newline is
	written. An existing file at ``filename`` is overwritten.

	Args:
		lines: Iterable of strings to write.
		filename: Path of the file to create or overwrite.

	Raises:
		OSError: If the file cannot be opened or written.
	"""
	# convert lines to a single blob of text
	data = '\n'.join(lines)
	# the context manager flushes and closes the handle even if write() raises
	with open(filename, 'w') as file:
		file.write(data)

# save the filtered tokens to a vocabulary file, one token per line
save_list(tokens, 'vocab.txt')

In [8]:
# Shell escape: print the saved vocabulary file to confirm what was written.
! cat vocab.txt

wonderland
rather
sugary
romance
film
subtle
ton
bricks
falling
see
plot
developing
mile
away
lured
benign
story
single
girl
looking
soul
mate
boston
city
depicted
singles
star
oriented
carries
erin
hope
davis
dating
reasons
fate
luck
whatnot
successful
match
putting
energy
fulfilling
careers
nothing
new
type
done
often
better
many
times
contemporary
films
fictionalized
documentary
beds
sense
urgency
main
quirk
hopes
pushy
mother
visits
sees
daughters
livein
relationship
radical
hoffman
breakup
decides
put
personal
ad
newspaper
harvard
medical
school
dropout
daughter
works
nurse
without
knowledge
cornball
sitcom
stuff
least
handled
best
could
actors
director
easy
like
bright
caring
blonde
equally
care
keeps
missing
contact
financially
strapped
ruggedly
handsome
intelligent
alan
longer
wants
work
father
plumber
volunteer
aquarium
attending
college
marine
biologist
cou

gates
closed
saints
awakens
coffin
graveyard
rescued
howler
bust
axe
grisly
fulcis
unspeakable
quotable
frivolous
salvageable
ripped
mattered
stylist
idiosyncrasies
longtime
collaborator
erratic
patchy
veers
gothic
cemetery
widow
georgia
authorities
solving
psychic
involuntary
bouts
supernatural
physically
envisions
arrest
beater
swamp
conservative
backwoods
testimony
redneck
donnie
barksdale
relentless
yankee
dislikes
raimi
photocopied
motive
glorified
cuban
luis
bargained
bride
merit
pursues
realise
nether
regions
embraced
trashiness
indulged
cristofer
mistaken
morass
crosses
incite
yawn
excerpt
allure
portraying
steamy
sensuous
paradise
jane
nebbish
unwise
denouement
reunite
kewl
solely
bitchiness
greenlight
mentioned
walker
seth
stanley
tucci
engage
utilizes
seethrough
savior
offended
exaggeration
upstages
roof
relax
believes
eh
dammit
btw
pounds
lameass
cmon
crud
notting
sa

cogliostro
flatulating
violator
mcfarlane
spiderman
mcfarlanes
defining
utilize
illustration
fueled
blowout
utilizing
reflecting
pyrotechnics
wallpaper
journeyed
antidote
duplicate
unwisely
favored
jai
ordinarily
violators
contrasted
masks
heartstrings
wynn
mucked
conspire
mtvstyle
inyourface
announce
goofily
fused
dominate
compliments
filter
pomp
overwhelming
trivial
speakers
collision
raison
screwball
spencer
tracy
katharine
shyer
arouse
collides
spreads
navigator
everton
squeaky
julio
mechoso
richie
augustus
eliminated
lifeforms
innovation
plagiarized
scifihorror
trimming
sentient
electricity
sealed
rips
rarest
gifts
irreverent
refused
abandon
jakob
contradiction
haim
curfew
triumphant
heil
hitler
jakobs
liberation
volatile
punishable
informants
frightens
existing
crippled
unenthusiastic
hopeless
genres
wincott
rochelle
polito
mpaa
shouldve
zagged
commenting
punisher
tending
upstairs

actresss
resurfaces
downside
worsened
rhys
ifans
mckee
pairings
footloose
villian
innuendos
malt
billington
bulworths
luther
weeping
characterizing
lobbyists
raps
suitandtie
overtakes
everpresent
pikser
rebirth
soderberghs
giacomo
kafkaesque
stings
occupied
suprising
oskar
schindler
dissapointed
laconic
strides
prescence
vacances
hulot
hulots
tati
fourstar
uhf
funnel
conan
librarian
flamboyantly
satirizes
winsome
highenergy
puppydog
emo
gedde
watanabe
mustsee
bogart
solaris
hoblits
crossover
hobbes
noblest
pennsylvania
azazel
embeth
davidtz
hoblit
architecture
audacious
dolores
claiborne
mackey
strathairn
awardcalibre
hitandmiss
numbingly
permeating
presentday
nova
turnoff
brenneman
merge
stillers
sauna
melancholy
uncut
cartman
terrance
cuss
saddam
fang
wang
monthlong
chao
hy
beijing
yee
compares
befriend
freedoms
joblos
factual
militants
guaspari
streep
robertas
precarious
niche
f

### Train Embedding Layer

In [9]:
# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: Path of the file to read. The file is opened in text mode
			with the platform's default encoding.

	Returns:
		The full text of the file.

	Raises:
		OSError: If the file cannot be opened or read.
	"""
	# the context manager closes the handle even if read() raises
	with open(filename, 'r') as file:
		return file.read()

# Reload the saved vocabulary file and turn its whitespace-separated tokens
# into a set for membership tests.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

In [10]:
# turn a doc into clean tokens
def clean_doc(doc, vocab):
	"""Reduce a document to its in-vocabulary tokens, joined by single spaces.

	The text is split on whitespace, every character in ``punctuation`` is
	removed from each token, and only tokens found in ``vocab`` are kept.

	Args:
		doc: The raw document text.
		vocab: Container of allowed tokens, tested with ``in``.

	Returns:
		A single string of the kept tokens in document order, separated by
		spaces. The string is empty when no token survives.
	"""
	strip_punct = str.maketrans('', '', punctuation)
	kept = []
	for raw in doc.split():
		word = raw.translate(strip_punct)
		# discard anything the vocabulary does not know
		if word in vocab:
			kept.append(word)
	return ' '.join(kept)

In [11]:
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
	"""Load and clean every document of one data split in ``directory``.

	Files whose names start with 'cv9' form the test split; all other files
	form the training split.

	Args:
		directory: Folder containing the review files.
		vocab: Allowed tokens, passed on to ``clean_doc``.
		is_trian: If truthy, return only the training documents; otherwise
			return only the 'cv9*' test documents.

	Returns:
		A list with one cleaned document per selected file, in the order
		``listdir`` yields the files.
	"""
	documents = []
	for filename in listdir(directory):
		held_out = filename.startswith('cv9')
		# training runs keep everything except 'cv9*'; test runs keep only 'cv9*'
		if is_trian:
			wanted = not held_out
		else:
			wanted = held_out
		if not wanted:
			continue
		# load the file, clean it against the vocab, and collect the result
		documents.append(clean_doc(load_doc(directory + '/' + filename), vocab))
	return documents

# load all training reviews (is_trian=True skips files starting with 'cv9')
positive_docs = process_docs('txt_sentoken/pos', vocab, True)
negative_docs = process_docs('txt_sentoken/neg', vocab, True)
# negatives come first, then positives
train_docs = negative_docs + positive_docs

#### Encode the Sequences and Pad Them to the Same Length

In [14]:
from keras.preprocessing.text import Tokenizer
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the training documents only; the test reviews are
# encoded later with this same fitted tokenizer
tokenizer.fit_on_texts(train_docs)

In [16]:
from keras.preprocessing.sequence import pad_sequences
# integer-encode each training review with the fitted tokenizer
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# the longest training review (counted in whitespace tokens) sets the padded length
max_length = max(len(review.split()) for review in train_docs)
# pad at the end ('post') up to max_length
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

**Define Training Labels**

In [18]:
from numpy import array
# training labels: 900 zeros followed by 900 ones
ytrain = array([0] * 900 + [1] * 900)

**Load All Test Reviews**

In [19]:
# load all test reviews (is_trian=False keeps only files starting with 'cv9')
positive_docs = process_docs('txt_sentoken/pos', vocab, False)
negative_docs = process_docs('txt_sentoken/neg', vocab, False)
# negatives come first, then positives; the labels below rely on this order
test_docs = negative_docs + positive_docs
# sequence encode with the tokenizer that was fitted on the training reviews
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences to the length derived from the training reviews
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels: 0 = negative, 1 = positive. The counts come from the
# lists loaded above rather than a hard-coded 100/100, so the labels always
# line up with test_docs even if the number of files on disk differs.
ytest = array([0] * len(negative_docs) + [1] * len(positive_docs))

In [20]:
# define vocabulary size (largest integer value)
# NOTE(review): the +1 presumably leaves room for index 0, which the padded
# sequences use as filler — confirm against the Keras Tokenizer docs
vocab_size = len(tokenizer.word_index) + 1

#### Model

In [23]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

# define model: embedding -> 1D convolution -> max pooling -> flatten -> dense classifier
model = Sequential()
# 100-dimensional embeddings, learned from scratch for each of the vocab_size indices
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
# single sigmoid unit for the binary (negative/positive) prediction
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1317, 100)         2576800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1310, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 655, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20960)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                209610    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 2,812,053
Trainable params: 2,812,053
Non-trainable params: 0
_________________________________________________________________


In [24]:
# compile network: binary cross-entropy loss for the single sigmoid output,
# tracking accuracy as the reported metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network on the padded training sequences for 10 epochs
# NOTE(review): no random seed is set in the visible cells, so the numbers
# below may not reproduce exactly between runs
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

Epoch 1/10
 - 14s - loss: 0.6894 - acc: 0.5067
Epoch 2/10
 - 12s - loss: 0.6259 - acc: 0.5728
Epoch 3/10
 - 11s - loss: 0.4515 - acc: 0.8372
Epoch 4/10
 - 14s - loss: 0.3329 - acc: 0.9722
Epoch 5/10
 - 13s - loss: 0.2995 - acc: 0.9922
Epoch 6/10
 - 12s - loss: 0.2810 - acc: 0.9978
Epoch 7/10
 - 13s - loss: 0.2669 - acc: 0.9972
Epoch 8/10
 - 13s - loss: 0.2543 - acc: 0.9978
Epoch 9/10
 - 13s - loss: 0.2431 - acc: 0.9978
Epoch 10/10
 - 12s - loss: 0.2328 - acc: 0.9978


<keras.callbacks.History at 0x7f291b317c50>

In [25]:
# evaluate on the held-out test reviews; the result is unpacked as the loss
# followed by the accuracy metric requested at compile time
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
# acc is a fraction, so scale it to a percentage for display
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 83.000000


In [30]:
# First weight array of the model's first layer (the Embedding layer).
learned_embeddings = model.layers[0].get_weights()[0]

In [41]:
# Number of rows in the embedding matrix.
len(learned_embeddings)

25768

In [45]:
# Collects the singular values produced by the PCA fit below.
singular_learned = []

from sklearn.decomposition import PCA
# Single-component PCA fitted on the TRANSPOSED embedding matrix.
# NOTE(review): with the transpose, each embedding dimension is presumably
# treated as a sample and each vocabulary index as a feature — confirm that
# this orientation is intended.
pca = PCA(n_components=1)
pca.fit(learned_embeddings.T)
singular_learned.append(pca.singular_values_)

In [46]:
# Display the singular value of the single fitted component.
pca.singular_values_

array([28.848602], dtype=float32)