Refer to [Text Classification](https://machinelearningmastery.com/prepare-movie-review-data-sentiment-analysis/)

## Bag-of-words

In [42]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
 
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# turn a doc into clean tokens
def clean_doc(doc):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# transfer all words to lowercase
	#tokens = [w.lower() for w in tokens]
	# filter out stop words
	stop_words = set(stopwords.words('english'))
	tokens = [w for w in tokens if not w in stop_words]
	# filter out short tokens
	tokens = [word for word in tokens if len(word) > 1]
	return tokens
 
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	# load doc
	doc = load_doc(filename)
	# clean doc
	tokens = clean_doc(doc)
	# update counts
	vocab.update(tokens)

# load all docs in a directory
def process_docs(directory, vocab):
	# walk through all files in the folder
	for filename in listdir(directory):
		# skip files that do not have the right extension
		if not filename.endswith(".txt"):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		# add doc to vocab
		add_doc_to_vocab(path, vocab)

# save list to file
def save_list(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

# define vocab
vocab = Counter()
# add all docs to vocab
process_docs('generate/neg2000', vocab)
process_docs('generate/pos2000', vocab)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))
# keep tokens with > 5 occurrence
min_occurane = 5
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))
# save tokens to a vocabulary file
save_list(tokens, 'generate/vocab.txt')

48293
[('br', 9296), ('The', 7232), ('movie', 6491), ('film', 5629), ('one', 3760), ('like', 3003), ('This', 2354), ('good', 2232), ('It', 1899), ('would', 1877), ('story', 1874), ('time', 1748), ('really', 1743), ('see', 1723), ('even', 1645), ('much', 1464), ('get', 1435), ('great', 1385), ('well', 1289), ('bad', 1282), ('first', 1270), ('made', 1261), ('people', 1249), ('could', 1249), ('also', 1237), ('movies', 1232), ('films', 1211), ('character', 1194), ('way', 1169), ('But', 1155), ('characters', 1145), ('make', 1130), ('dont', 1130), ('think', 1126), ('And', 1124), ('Its', 1084), ('seen', 1065), ('many', 1002), ('love', 998), ('watch', 989), ('two', 987), ('life', 979), ('In', 975), ('never', 956), ('little', 928), ('plot', 925), ('show', 921), ('know', 913), ('best', 906), ('acting', 901)]
11685


In [71]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
	# load the doc
	doc = load_doc(filename)
	# clean doc
	tokens = clean_doc(doc)
	# filter by vocab
	tokens = [w for w in tokens if w in vocab]
	return ' '.join(tokens)
 
# load all docs in a directory
def process_docs(directory, vocab):
	lines = list()
	# walk through all files in the folder
	for filename in listdir(directory):
		# skip files that do not have the right extension
		if not filename.endswith(".txt"):
			continue
		# create the full path of the file to open
		path = directory + '/' + filename
		# load and clean the doc
		line = doc_to_line(path, vocab)
		# add to list
		lines.append(line)
	return lines

# load vocabulary
vocab_filename = 'generate/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
# prepare negative reviews
negative_lines = process_docs('generate/neg2000', vocab)
save_list(negative_lines, 'generate/negative.txt')
# prepare positive reviews
positive_lines = process_docs('generate/pos2000', vocab)
save_list(positive_lines, 'generate/positive.txt')



In [77]:
# load reviews
positive_lines = load_doc('generate/positive.txt')
positive_lines = positive_lines.split('\n')
negative_lines = load_doc('generate/negative.txt')
negative_lines = negative_lines.split('\n')



In [106]:
import numpy as np

def generate_bow(sentence, vocab):
    tokens = clean_doc(sentence)
    bag_vector = np.zeros(len(vocab))        
    for w in tokens:
        for i,word in enumerate(vocab):
            if word == w:
                bag_vector[i] += 1 
    return list(bag_vector)

def generate_data(lines, vocab):
    X = list()
    for sentence in lines:
        X.append(generate_bow(sentence, vocab))
    return np.array(X)

# # generate vector of bag of word for a sentence
# x = generate_bow([" feelings  feelings make "], vocab)
# # test if we get right vector for a sentence
# x[vocab.index("feelings")]
X_neg = generate_data(negative_lines, vocab)
X_pos = generate_data(positive_lines, vocab)
X = np.concatenate((X_neg, X_pos), axis=0)
print(X_neg.shape)
print(X_pos.shape)
print(X.shape)

(2001, 11685)
(2001, 11685)
(4002, 11685)


In [111]:
y_neg = np.full((2001, ), -1)
y_pos = np.full((2001, ), 1)
y = np.concatenate((y_neg, y_pos), axis=0)
y.shape

(4002,)

# Train model

In [120]:
# CountVectorizer implements both tokenization and occurrence counting in a single class:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# vectorizer = CountVectorizer()
# def get_features(review):    
#     return vectorizer.transform([review])

# Instantiate logistic regression and train
lr = LogisticRegression()
lr.fit(X, y)

# Predict sentiment for a glowing review
review1 = "LOVED IT! This movie was amazing. Top 10 this year."
review1_features = np.array(generate_bow(review1, vocab))
review1_features = review1_features.reshape(1,-1)
print(review1_features.shape)
print("Review:", review1)
print("Probability of positive review:", lr.predict_proba(review1_features)[0,1])


# Get the indices of the sorted cofficients
inds_ascending = np.argsort(lr.coef_.flatten()) 
inds_descending = inds_ascending[::-1]

# Print the most positive words
print("Most positive words: ", end="")
for i in range(5):
    print(vocab[inds_descending[i]], end=", ")
print("\n")

# Print most negative words
print("Most negative words: ", end="")
for i in range(5):
    print(vocab[inds_ascending[i]], end=", ")
print("\n")



(1, 11685)
Review: LOVED IT! This movie was amazing. Top 10 this year.
Probability of positive review: 0.6935863398866297
Most positive words: superb, best, favorite, loved, perfect, 

Most negative words: worst, awful, bad, boring, waste, 

