In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
# Load the labeled training reviews from a tab-separated file.
#   header=0   -> the first line of the file contains the column names
#   quoting=3  -> csv.QUOTE_NONE: double quotes are kept as literal characters
train = pd.read_csv(
    "labeledTrainData.tsv",
    header=0,
    delimiter='\t',
    quoting=3,
)


In [None]:
# Data Cleaning and Text Preprocessing
from bs4 import BeautifulSoup

# Convert a raw review to a string of words
# Lazily-built cache of NLTK's English stop words, shared by every call to
# review_to_words so the corpus is read once rather than once per review.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review):
    """Convert a raw review into a single string of cleaned words.

    Steps:
      1. Strip markup with BeautifulSoup and keep only the text.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Drop NLTK's English stop words.

    Parameters
    ----------
    raw_review : str
        The review text, possibly containing HTML markup.

    Returns
    -------
    str
        The remaining words joined by single spaces (an empty string when
        no word survives the filtering).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a GuessedAtParserWarning, so
    # results could differ from one environment to another.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    # Set membership keeps the filter linear in the number of words.
    stops = _english_stopwords()
    return " ".join(w for w in words if w not in stops)


In [None]:
# Clean every training review, reporting progress once per 5000 reviews.
num_reviews = train['review'].size
print("Cleaning and parsing the training set movie reviews...\n")

clean_train_reviews = []
for position, raw_text in enumerate(train['review'], start=1):
    if position % 5000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_train_reviews.append(review_to_words(raw_text))

In [None]:
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Set up scikit-learn's bag-of-words tool. The reviews were already cleaned
# and stop-word filtered, so no extra tokenizer/preprocessor/stop list is
# supplied; max_features=5000 caps the number of vocabulary columns.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform takes a list of strings; the result is converted straight
# to a NumPy array because arrays are easy to work with downstream.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()


In [None]:
# Display the dense bag-of-words array produced by the vectorizer cell
# (a bare last expression is rendered as the cell's output).
train_data_features

In [None]:
import numpy as np

# Take the vocabulary from the fitted vectorizer; `vocab` was previously
# never defined, so this cell failed with NameError on a fresh kernel.
# get_feature_names_out() is the current scikit-learn API; fall back to the
# older get_feature_names() on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

# Sum up the counts of each vocabulary word (column-wise over all reviews)
dist = np.sum(train_data_features, axis=0)

# Print each word's total count next to the word itself
for tag, count in zip(vocab, dist):
    print(count, tag)


In [None]:
print("Training the random forest...\n")
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples and random feature
# subsets). Pinning random_state makes the fitted model, and therefore the
# predictions written later, reproducible across kernel restarts.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the bag-of-words features, using the sentiment column as the label.
# fit() returns the estimator; assigning it also keeps the cell output quiet.
forest = forest.fit(train_data_features, train['sentiment'])

In [None]:
# Read the test reviews (tab-separated, header row, quotes kept literal)
test = pd.read_csv('testData.tsv', header=0, delimiter='\t', quoting=3)
num_reviews = len(test['review'])

# Clean every test review, reporting progress once per 5000 reviews
print("Cleaning and parsing the test set movie reviews...\n")
clean_test_reviews = []
for position, raw_text in enumerate(test['review'], start=1):
    if position % 5000 == 0:
        print("Review %d of %d" % (position, num_reviews))
    clean_test_reviews.append(review_to_words(raw_text))

# Encode with the already-fitted vectorizer (transform only, no refitting)
# and convert the result to a NumPy array
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Predict a sentiment label for each test review
result = forest.predict(test_data_features)

# Pair each review id with its prediction and write the CSV;
# quoting=3 is csv.QUOTE_NONE, so no quote characters are added on output
output = pd.DataFrame(data={"id": test['id'], "sentiment": result})
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3)