In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import pickle


# Labeled training reviews; quoting=3 is csv.QUOTE_NONE, so double quotes
# inside the review text are kept as ordinary characters.
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3) # type: pandas.DataFrame
STOP_WORDS = stopwords.words('english')

# Load the previously cleaned training reviews from the on-disk cache.
# Pickle data is binary: opening in text mode fails on Python 3 and can
# corrupt binary-protocol pickles on Windows, so read it with 'rb'
# (the file is expected to have been written in binary mode as well).
# NOTE(review): pickle.load executes arbitrary code from the file -- only
# load caches that this project produced itself.
with open('pickle/clean_train_review', 'rb') as f:
    clean_train_review = pickle.load(f)

def review_to_words(raw_review):
    """
    Process a raw review into a single string of cleaned words.

    The review is stripped of HTML markup, every character that is not an
    ASCII letter is replaced by a space, the text is lower-cased and split
    on whitespace, and words listed in ``STOP_WORDS`` are dropped.

    :param raw_review: raw review text, possibly containing HTML markup
    :type raw_review: str
    :return: the remaining words joined by single spaces
    :rtype: str
    """
    # Let BeautifulSoup (lxml parser) strip the markup from the review
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()

    # Replace everything that is not an ASCII letter with a space
    letters_only = re.sub('[^A-Za-z]', ' ', review_text)
    words = letters_only.lower().split()

    # Membership tests against a list scan the whole list for every word;
    # building a set once per call makes each lookup constant-time.
    stop_words = set(STOP_WORDS)
    meaningful_words = [w for w in words if w not in stop_words]

    return ' '.join(meaningful_words)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model on word tokens, capped at 5000 vocabulary entries;
# the cleaned reviews need no further tokenizing, preprocessing or
# stop-word removal from scikit-learn itself.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Learn the vocabulary, count the words of every training review, and turn
# the resulting sparse matrix into a dense numpy array in one step.
train_data_features = vectorizer.fit_transform(clean_train_review).toarray()

In [14]:
# Show the (reviews, vocabulary words) shape of the bag-of-words matrix.
# The parenthesised single-argument form prints the same text on
# Python 2 and Python 3.
print(train_data_features.shape)

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever the installed version provides and
# keep `vocab` a plain list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

(25000, 5000)


In [15]:
import numpy as np

# Sum the counts of each vocabulary word over all training reviews
# (axis=0 collapses the review rows, leaving one total per word).
dist = np.sum(train_data_features, axis=0)

# Pair each vocabulary word with the number of times it appears in the
# training set. On Python 3 zip() is a one-shot iterator, so materialise
# it: the pairs can then be traversed repeatedly and sorted in place
# (on Python 2 this yields the same list as before).
vocab_dist = list(zip(vocab, dist))
# The descending sort by count remains disabled, as in the original cell:
# vocab_dist.sort(key = lambda x: x[1], reverse= True)

In [16]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Random forests draw random bootstrap samples and feature subsets; without
# a fixed seed every run trains a different model, so the predictions cannot
# be reproduced after a kernel restart.
RANDOM_SEED = 42

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

# Fit the forest to the training set, using the bag of words as
# features and the sentiment labels as the response variable.
# fit() returns the estimator itself, so `forest` stays bound to the
# (now trained) classifier.
forest = forest.fit(train_data_features, train["sentiment"])

Training the random forest...


In [None]:
# Read the test data (quoting=3 is csv.QUOTE_NONE: quotes in the review
# text are kept as ordinary characters)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t",
                   quoting=3)
# Verify that there are 25,000 rows and 2 columns
print(test.shape)

# Clean the reviews one by one, collecting the results in a list
num_reviews = len(test["review"])
clean_test_reviews = []

print("Cleaning and parsing the test set movie reviews...\n")
# Iterate over the reviews directly rather than indexing by position;
# enumerate() supplies the counter for the progress message and, unlike
# xrange, exists on both Python 2 and Python 3.
for i, raw_review in enumerate(test["review"]):
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % (i + 1, num_reviews))
    clean_review = review_to_words(raw_review)
    clean_test_reviews.append(clean_review)

# Get a bag of words for the test set, and convert to a numpy array.
# transform() (not fit_transform) reuses the vocabulary learned from the
# training reviews.
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

In [41]:
# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

# Use pandas to write the comma-separated output file (quoting=3 is
# csv.QUOTE_NONE, so no field is wrapped in quotes).
# The previous "\pickle\..." literal relied on invalid backslash escapes,
# resolved to the drive root on Windows and to a file name containing
# literal backslashes elsewhere. Forward slashes work on every platform and
# target the same relative "pickle" directory the cached reviews are read
# from.
output.to_csv("pickle/Bag_of_Words_model.csv", index=False, quoting=3)